From f93281d86862613d7694ca280b4b9af59652b1a6 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 12 Sep 2024 23:49:48 -0400 Subject: [PATCH 001/161] Implement CMake upgrade and C++ standard deprecation as we did with C++11 --- cmake/CMakeLists.txt | 18 +++++++++++++++--- src/MAKE/Makefile.mpi | 6 +++--- src/MAKE/Makefile.serial | 6 +++--- src/lmptype.h | 7 +++++++ 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index c68a925324..b10823aba4 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -2,7 +2,7 @@ ######################################## # CMake build system # This file is part of LAMMPS -cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.20) ######################################## # set policy to silence warnings about ignoring _ROOT but use it if(POLICY CMP0074) @@ -144,16 +144,28 @@ if((PKG_KOKKOS) AND (Kokkos_ENABLE_CUDA) AND NOT (CMAKE_CXX_COMPILER_ID STREQUAL set(CMAKE_TUNE_DEFAULT "${CMAKE_TUNE_DEFAULT} -Xcudafe --diag_suppress=unrecognized_pragma") endif() -# we require C++11 without extensions. Kokkos requires at least C++17 (currently) +# we *require* C++11 without extensions but prefer C++17. 
+# Kokkos requires at least C++17 (currently) if(NOT CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 11) + if(cxx_std_17 IN_LIST CMAKE_CXX_COMPILE_FEATURES) + set(CMAKE_CXX_STANDARD 17) + else() + set(CMAKE_CXX_STANDARD 11) + endif() endif() if(CMAKE_CXX_STANDARD LESS 11) message(FATAL_ERROR "C++ standard must be set to at least 11") endif() +if(CMAKE_CXX_STANDARD LESS 17) + message(WARNING "Selecting C++17 standard is preferred over C++${CMAKE_CXX_STANDARD}") +endif() if(PKG_KOKKOS AND (CMAKE_CXX_STANDARD LESS 17)) set(CMAKE_CXX_STANDARD 17) endif() +# turn off C++17 check in lmptype.h +if(LAMMPS_CXX11) + add_compile_definitions(LAMMPS_CXX11) +endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Use compiler extensions") # ugly hacks for MSVC which by default always reports an old C++ standard in the __cplusplus macro diff --git a/src/MAKE/Makefile.mpi b/src/MAKE/Makefile.mpi index 9cd451a32c..8b21412e46 100644 --- a/src/MAKE/Makefile.mpi +++ b/src/MAKE/Makefile.mpi @@ -7,12 +7,12 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler CC = mpicxx -CCFLAGS = -g -O3 -std=c++11 +CCFLAGS = -g -O3 # -std=c++17 SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -LINKFLAGS = -g -O3 -std=c++11 +LINKFLAGS = -g -O3 # -std=c++17 LIB = SIZE = size @@ -28,7 +28,7 @@ SHLIBFLAGS = -shared -rdynamic # LAMMPS ifdef settings # see possible settings in Section 3.5 of the manual -LMP_INC = -DLAMMPS_GZIP -DLAMMPS_MEMALIGN=64 # -DLAMMPS_CXX98 +LMP_INC = -DLAMMPS_GZIP -DLAMMPS_MEMALIGN=64 # -DLAMMPS_CXX11 # MPI library # see discussion in Section 3.4 of the manual diff --git a/src/MAKE/Makefile.serial b/src/MAKE/Makefile.serial index f588922a1f..a7406b7a6d 100644 --- a/src/MAKE/Makefile.serial +++ b/src/MAKE/Makefile.serial @@ -7,12 +7,12 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler CC = g++ -CCFLAGS = -g -O3 -std=c++11 +CCFLAGS = -g -O3 # -std=c++17 SHFLAGS = -fPIC DEPFLAGS = -M LINK = g++ -LINKFLAGS = -g -O 
-std=c++11 +LINKFLAGS = -g -O # -std=c++17 LIB = SIZE = size @@ -28,7 +28,7 @@ SHLIBFLAGS = -shared -rdynamic # LAMMPS ifdef settings # see possible settings in Section 3.5 of the manual -LMP_INC = -DLAMMPS_GZIP -DLAMMPS_MEMALIGN=64 # -DLAMMPS_CXX98 +LMP_INC = -DLAMMPS_GZIP -DLAMMPS_MEMALIGN=64 # -DLAMMPS_CXX11 # MPI library # see discussion in Section 3.4 of the manual diff --git a/src/lmptype.h b/src/lmptype.h index d2181c9898..d7ed016c8f 100644 --- a/src/lmptype.h +++ b/src/lmptype.h @@ -34,6 +34,13 @@ #error LAMMPS requires a C++11 (or later) compliant compiler. Enable C++11 compatibility or upgrade the compiler. #endif +// C++17 check +#ifndef LAMMPS_CXX11 +#if __cplusplus < 201703L +#error LAMMPS is planning to transition to C++17. To disable this error please use a C++17 compliant compiler, enable C++17 support, or define -DLAMMPS_CXX11 in your makefile or when running cmake +#endif +#endif + #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS #endif From 0729c04dc14fd009b46d67bedacd5c205706f176 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 13 Sep 2024 22:38:20 -0400 Subject: [PATCH 002/161] document that GNU make build support is no longer required for new contributions --- doc/src/Modify_requirements.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/src/Modify_requirements.rst b/doc/src/Modify_requirements.rst index cbcb3eca13..c3e514a423 100644 --- a/doc/src/Modify_requirements.rst +++ b/doc/src/Modify_requirements.rst @@ -208,20 +208,21 @@ Build system (strict) LAMMPS currently supports two build systems: one that is based on :doc:`traditional Makefiles ` and one that is based on -:doc:`CMake `. Therefore, your contribution must be -compatible with and support both build systems. +:doc:`CMake `. As of fall 2024, it is no longer required +to support the traditional make build system. New packages may choose +to only support building with CMake. 
Additions to existing packages +must follow the requirements set by that package. For a single pair of header and implementation files that are an independent feature, it is usually only required to add them to ``src/.gitignore``. For traditional make, if your contributed files or package depend on -other LAMMPS style files or packages also being installed -(e.g. because your file is a derived class from the other LAMMPS -class), then an ``Install.sh`` file is also needed to check for those -dependencies and modifications to ``src/Depend.sh`` to trigger the checks. -See other README and Install.sh files in other directories as -examples. +other LAMMPS style files or packages also being installed (e.g. because +your file is a derived class from the other LAMMPS class), then an +``Install.sh`` file is also needed to check for those dependencies and +modifications to ``src/Depend.sh`` to trigger the checks. See other +README and Install.sh files in other directories as examples. Similarly, for CMake support, changes may need to be made to ``cmake/CMakeLists.txt``, some of the files in ``cmake/presets``, and From 3bed50c1c3192569e7be1d0390ed8c3ea1bb33a9 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 18 Sep 2024 15:18:41 -0400 Subject: [PATCH 003/161] Add text that we favor now CMake based builds --- doc/src/Build.rst | 12 ++++++++---- doc/src/Build_cmake.rst | 14 +++++++------- doc/src/Build_make.rst | 4 ++++ 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/doc/src/Build.rst b/doc/src/Build.rst index 7cf2a01992..7ca8cd428e 100644 --- a/doc/src/Build.rst +++ b/doc/src/Build.rst @@ -1,10 +1,14 @@ Build LAMMPS ============ -LAMMPS is built as a library and an executable from source code using -either traditional makefiles for use with GNU make (which may require -manual editing), or using a build environment generated by CMake (Unix -Makefiles, Ninja, Xcode, Visual Studio, KDevelop, CodeBlocks and more). 
+LAMMPS is built as a library and an executable from source code using a +build environment generated by CMake (Unix Makefiles, Ninja, Xcode, +Visual Studio, KDevelop, CodeBlocks and more depending on the platform). +Using CMake is the preferred way to build LAMMPS. In addition, LAMMPS +can be compiled using the legacy build system based on traditional +makefiles for use with GNU make (which may require manual editing). +Support for the legacy build system is slowly being phased out and may +not be available for all optional features. As an alternative, you can download a package with pre-built executables or automated build trees, as described in the :doc:`Install ` diff --git a/doc/src/Build_cmake.rst b/doc/src/Build_cmake.rst index 1b2bef936e..5169f1039e 100644 --- a/doc/src/Build_cmake.rst +++ b/doc/src/Build_cmake.rst @@ -16,7 +16,7 @@ environments is on a :doc:`separate page `. .. note:: - LAMMPS currently requires that CMake version 3.16 or later is available. + LAMMPS currently requires that CMake version 3.20 or later is available. .. warning:: @@ -32,11 +32,11 @@ environments is on a :doc:`separate page `. Advantages of using CMake ^^^^^^^^^^^^^^^^^^^^^^^^^ -CMake is an alternative to compiling LAMMPS in the traditional way -through :doc:`(manually customized) makefiles `. Using -CMake has multiple advantages that are specifically helpful for -people with limited experience in compiling software or for people -that want to modify or extend LAMMPS. +CMake is the preferred way of compiling LAMMPS in contrast to the legacy +build system based on GNU make and through :doc:`(manually customized) +makefiles `. Using CMake has multiple advantages that are +specifically helpful for people with limited experience in compiling +software or for people that want to modify or extend LAMMPS. - CMake can detect available hardware, tools, features, and libraries and adapt the LAMMPS default build configuration accordingly. 
@@ -47,7 +47,7 @@ that want to modify or extend LAMMPS. knowledge of file formats or complex command line syntax is required. - All enabled components are compiled in a single build operation. - Automated dependency tracking for all files and configuration options. -- Support for true out-of-source compilation. Multiple configurations +- Support for true out-of-source compilation. Multiple configurations and settings with different choices of LAMMPS packages, settings, or compilers can be configured and built concurrently from the same source tree. diff --git a/doc/src/Build_make.rst b/doc/src/Build_make.rst index 932050d410..00f2f0b24d 100644 --- a/doc/src/Build_make.rst +++ b/doc/src/Build_make.rst @@ -8,6 +8,10 @@ Building LAMMPS with traditional makefiles requires that you have a for customizing your LAMMPS build with a number of global compilation options and features. +This build system is slowly being phased out and may not support all +optional features and packages in LAMMPS. It is recommended to switch +to the :doc:`CMake based build system `. 
+ Requirements ^^^^^^^^^^^^ From 906ae818dacf8e581370eb253e4c2353c8f54d26 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 18 Sep 2024 15:43:56 -0400 Subject: [PATCH 004/161] add package removal warnings and GNU make deprecation warnings --- cmake/CMakeLists.txt | 9 +++++++++ src/ATC/Install.sh | 13 +++++++++++++ src/AWPMD/Install.sh | 13 +++++++++++++ src/COLVARS/Install.sh | 13 +++++++++++++ src/GPU/Install.sh | 13 +++++++++++++ src/LEPTON/Install.sh | 13 +++++++++++++ src/PLUMED/Install.sh | 13 +++++++++++++ src/POEMS/Install.sh | 13 +++++++++++++ 8 files changed, 100 insertions(+) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index b10823aba4..9ec3996c64 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -358,6 +358,15 @@ foreach(PKG ${STANDARD_PACKAGES} ${SUFFIX_PACKAGES}) option(PKG_${PKG} "Build ${PKG} Package" OFF) endforeach() +set(DEPRECATED_PACKAGES AWPMD ATC POEMS) +foreach(PKG ${DEPRECATED_PACKAGES}) + message(WARNING + "The ${PKG} package will be removed from LAMMPS in Summer 2025 due to lack of " + "maintenance and use of code constructs that conflict with modern C++ compilers " + "and standards. 
Please contact developers@lammps.org if you have any concerns " + "about this step.") +endforeach() + ###################################################### # packages with special compiler needs or external libs ###################################################### diff --git a/src/ATC/Install.sh b/src/ATC/Install.sh index 2194685f92..04f4f7c8ac 100755 --- a/src/ATC/Install.sh +++ b/src/ATC/Install.sh @@ -9,6 +9,19 @@ mode=$1 LC_ALL=C export LC_ALL +cat < Date: Wed, 18 Sep 2024 16:22:34 -0400 Subject: [PATCH 005/161] deprecate COMPRESS and VTK package from GNU build --- src/COMPRESS/Install.sh | 13 +++++++++++++ src/VTK/Install.sh | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/COMPRESS/Install.sh b/src/COMPRESS/Install.sh index c0d926cba4..3c5d6d121a 100755 --- a/src/COMPRESS/Install.sh +++ b/src/COMPRESS/Install.sh @@ -7,6 +7,19 @@ mode=$1 LC_ALL=C export LC_ALL +cat < Date: Wed, 18 Sep 2024 16:38:16 -0400 Subject: [PATCH 006/161] deprecate ML-POD from using GNU make build --- src/ML-POD/Install.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/ML-POD/Install.sh b/src/ML-POD/Install.sh index a62887e1b3..ffc25b1420 100755 --- a/src/ML-POD/Install.sh +++ b/src/ML-POD/Install.sh @@ -7,6 +7,19 @@ mode=$1 LC_ALL=C export LC_ALL +cat < Date: Thu, 19 Sep 2024 09:59:53 -0400 Subject: [PATCH 007/161] ELECTRODE is going CMake-only as well --- src/ELECTRODE/Install.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/ELECTRODE/Install.sh b/src/ELECTRODE/Install.sh index 561df0dce0..6ece8467aa 100755 --- a/src/ELECTRODE/Install.sh +++ b/src/ELECTRODE/Install.sh @@ -11,6 +11,19 @@ mode=$1 LC_ALL=C export LC_ALL +cat < Date: Tue, 24 Sep 2024 10:32:44 +0800 Subject: [PATCH 008/161] mwlc angle potential --- src/.gitignore | 2 + src/EXTRA-MOLECULE/angle_mwlc.cpp | 311 ++++++++++++++++++++++++++++++ src/EXTRA-MOLECULE/angle_mwlc.h | 52 +++++ 3 files changed, 365 insertions(+) create mode 100644 
src/EXTRA-MOLECULE/angle_mwlc.cpp create mode 100644 src/EXTRA-MOLECULE/angle_mwlc.h diff --git a/src/.gitignore b/src/.gitignore index e557a8cbb2..e58230e4c8 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -505,6 +505,8 @@ /angle_harmonic.h /angle_mm3.cpp /angle_mm3.h +/angle_mwlc.cpp +/angle_mwlc.h /angle_quartic.cpp /angle_quartic.h /angle_spica.cpp diff --git a/src/EXTRA-MOLECULE/angle_mwlc.cpp b/src/EXTRA-MOLECULE/angle_mwlc.cpp new file mode 100644 index 0000000000..41b1195bc4 --- /dev/null +++ b/src/EXTRA-MOLECULE/angle_mwlc.cpp @@ -0,0 +1,311 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: James D. 
Farrell (IoP CAS) + [ based on angle_cosine.cpp ] +------------------------------------------------------------------------- */ + +#include "angle_mwlc.h" + +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "math_const.h" +#include "memory.h" +#include "neighbor.h" + +#include +#include + +using namespace LAMMPS_NS; +using MathConst::MY_PI; + +/* ---------------------------------------------------------------------- */ + +AngleMWLC::AngleMWLC(LAMMPS *_lmp) : Angle(_lmp) +{ + born_matrix_enable = 1; +} + +/* ---------------------------------------------------------------------- */ + +AngleMWLC::~AngleMWLC() +{ + if (allocated && !copymode) { + memory->destroy(setflag); + memory->destroy(k1); + memory->destroy(k2); + memory->destroy(mu); + } +} + +/* ---------------------------------------------------------------------- */ + +void AngleMWLC::compute(int eflag, int vflag) +{ + int i1, i2, i3, n, type; + double delx1, dely1, delz1, delx2, dely2, delz2; + double eangle, f1[3], f3[3]; + double rsq1, rsq2, r1, r2, c, a, a11, a12, a22; + double q, qm, Q; + + eangle = 0.0; + ev_init(eflag, vflag); + + double **x = atom->x; + double **f = atom->f; + int **anglelist = neighbor->anglelist; + int nanglelist = neighbor->nanglelist; + int nlocal = atom->nlocal; + int newton_bond = force->newton_bond; + + for (n = 0; n < nanglelist; n++) { + i1 = anglelist[n][0]; + i2 = anglelist[n][1]; + i3 = anglelist[n][2]; + type = anglelist[n][3]; + + // 1st bond + + delx1 = x[i1][0] - x[i2][0]; + dely1 = x[i1][1] - x[i2][1]; + delz1 = x[i1][2] - x[i2][2]; + + rsq1 = delx1 * delx1 + dely1 * dely1 + delz1 * delz1; + r1 = sqrt(rsq1); + + // 2nd bond + + delx2 = x[i3][0] - x[i2][0]; + dely2 = x[i3][1] - x[i2][1]; + delz2 = x[i3][2] - x[i2][2]; + + rsq2 = delx2 * delx2 + dely2 * dely2 + delz2 * delz2; + r2 = sqrt(rsq2); + + // c = cosine of angle + + c = delx1 * delx2 + dely1 * dely2 + delz1 * delz2; + c /= r1 * r2; + if (c > 1.0) c = 
1.0; + if (c < -1.0) c = -1.0; + + // force & energy + + q = exp(-k1[type] * (1.0 + c)); + qm = exp(-k2[type] * (1.0 + c) - mu[type]); + Q = q + qm; + + if (eflag) eangle = -log(Q); + + a = (k1[type] * q + k2[type] * qm) / Q; + a11 = a * c / rsq1; + a12 = -a / (r1 * r2); + a22 = a * c / rsq2; + + f1[0] = a11 * delx1 + a12 * delx2; + f1[1] = a11 * dely1 + a12 * dely2; + f1[2] = a11 * delz1 + a12 * delz2; + f3[0] = a22 * delx2 + a12 * delx1; + f3[1] = a22 * dely2 + a12 * dely1; + f3[2] = a22 * delz2 + a12 * delz1; + + // apply force to each of 3 atoms + + if (newton_bond || i1 < nlocal) { + f[i1][0] += f1[0]; + f[i1][1] += f1[1]; + f[i1][2] += f1[2]; + } + + if (newton_bond || i2 < nlocal) { + f[i2][0] -= f1[0] + f3[0]; + f[i2][1] -= f1[1] + f3[1]; + f[i2][2] -= f1[2] + f3[2]; + } + + if (newton_bond || i3 < nlocal) { + f[i3][0] += f3[0]; + f[i3][1] += f3[1]; + f[i3][2] += f3[2]; + } + + if (evflag) + ev_tally(i1, i2, i3, nlocal, newton_bond, eangle, f1, f3, delx1, dely1, delz1, delx2, dely2, + delz2); + } +} + +/* ---------------------------------------------------------------------- */ + +void AngleMWLC::allocate() +{ + allocated = 1; + const int np1 = atom->nangletypes + 1; + + memory->create(k1, np1, "angle:k1"); + memory->create(k2, np1, "angle:k2"); + memory->create(mu, np1, "angle:mu"); + memory->create(setflag, np1, "angle:setflag"); + for (int i = 1; i < np1; i++) setflag[i] = 0; +} + +/* ---------------------------------------------------------------------- + set coeffs for one type +------------------------------------------------------------------------- */ + +void AngleMWLC::coeff(int narg, char **arg) +{ + if (narg != 4) error->all(FLERR, "Incorrect args for angle coefficients"); + if (!allocated) allocate(); + + int ilo, ihi; + utils::bounds(FLERR, arg[0], 1, atom->nangletypes, ilo, ihi, error); + + double k1_one = utils::numeric(FLERR, arg[1], false, lmp); + double k2_one = utils::numeric(FLERR, arg[2], false, lmp); + double mu_one = 
utils::numeric(FLERR, arg[3], false, lmp); + + int count = 0; + for (int i = ilo; i <= ihi; i++) { + k1[i] = k1_one; + k2[i] = k2_one; + mu[i] = mu_one; + setflag[i] = 1; + count++; + } + + if (count == 0) error->all(FLERR, "Incorrect args for angle coefficients"); +} + +/* ---------------------------------------------------------------------- */ + +double AngleMWLC::equilibrium_angle(int /*i*/) +{ + return MY_PI; +} + +/* ---------------------------------------------------------------------- + proc 0 writes out coeffs to restart file +------------------------------------------------------------------------- */ + +void AngleMWLC::write_restart(FILE *fp) +{ + fwrite(&k1[1], sizeof(double), atom->nangletypes, fp); + fwrite(&k2[1], sizeof(double), atom->nangletypes, fp); + fwrite(&mu[1], sizeof(double), atom->nangletypes, fp); +} + +/* ---------------------------------------------------------------------- + proc 0 reads coeffs from restart file, bcasts them +------------------------------------------------------------------------- */ + +void AngleMWLC::read_restart(FILE *fp) +{ + allocate(); + + if (comm->me == 0) { + utils::sfread(FLERR, &k1[1], sizeof(double), atom->nangletypes, fp, nullptr, error); + utils::sfread(FLERR, &k2[1], sizeof(double), atom->nangletypes, fp, nullptr, error); + utils::sfread(FLERR, &mu[1], sizeof(double), atom->nangletypes, fp, nullptr, error); + } + MPI_Bcast(&k1[1], atom->nangletypes, MPI_DOUBLE, 0, world); + MPI_Bcast(&k2[1], atom->nangletypes, MPI_DOUBLE, 0, world); + MPI_Bcast(&mu[1], atom->nangletypes, MPI_DOUBLE, 0, world); + + for (int i = 1; i <= atom->nangletypes; i++) setflag[i] = 1; +} + +/* ---------------------------------------------------------------------- + proc 0 writes to data file +------------------------------------------------------------------------- */ + +void AngleMWLC::write_data(FILE *fp) +{ + for (int i = 1; i <= atom->nangletypes; i++) fprintf(fp, "%d %g %g %g\n", i, k1[i], k2[i], mu[i]); +} + +/* 
---------------------------------------------------------------------- */ + +double AngleMWLC::single(int type, int i1, int i2, int i3) +{ + double **x = atom->x; + + double delx1 = x[i1][0] - x[i2][0]; + double dely1 = x[i1][1] - x[i2][1]; + double delz1 = x[i1][2] - x[i2][2]; + domain->minimum_image(delx1, dely1, delz1); + double r1 = sqrt(delx1 * delx1 + dely1 * dely1 + delz1 * delz1); + + double delx2 = x[i3][0] - x[i2][0]; + double dely2 = x[i3][1] - x[i2][1]; + double delz2 = x[i3][2] - x[i2][2]; + domain->minimum_image(delx2, dely2, delz2); + double r2 = sqrt(delx2 * delx2 + dely2 * dely2 + delz2 * delz2); + + double c = delx1 * delx2 + dely1 * dely2 + delz1 * delz2; + c /= r1 * r2; + if (c > 1.0) c = 1.0; + if (c < -1.0) c = -1.0; + + double q = exp(-k1[type] * (1.0 + c)); + double qm = exp(-k2[type] * (1.0 + c) - mu[type]); + return -log(q + qm); +} + +/* ---------------------------------------------------------------------- */ + +void AngleMWLC::born_matrix(int type, int i1, int i2, int i3, double &du, double &du2) +{ + double **x = atom->x; + + double delx1 = x[i1][0] - x[i2][0]; + double dely1 = x[i1][1] - x[i2][1]; + double delz1 = x[i1][2] - x[i2][2]; + domain->minimum_image(delx1, dely1, delz1); + + double delx2 = x[i3][0] - x[i2][0]; + double dely2 = x[i3][1] - x[i2][1]; + double delz2 = x[i3][2] - x[i2][2]; + domain->minimum_image(delx2, dely2, delz2); + + double c = delx1 * delx2 + dely1 * dely2 + delz1 * delz2; + c /= sqrt((delx1 * delx1 + dely1 * dely1 + delz1 * delz1) * + (delx2 * delx2 + dely2 * dely2 + delz2 * delz2)); + if (c > 1.0) c = 1.0; + if (c < -1.0) c = -1.0; + + const double q = exp(-k1[type] * (1.0 + c)); + const double qm = exp(-k2[type] * (1.0 + c) - mu[type]); + const double Q = q + qm; + + du = (k1[type] * q + k2[type] * qm) / Q; + du2 = (k1[type] - k2[type]) / Q; + du2 *= -du2 * q * qm; +} + +/* ---------------------------------------------------------------------- + return ptr to internal members upon request 
+------------------------------------------------------------------------ */ + +void *AngleMWLC::extract(const char *str, int &dim) +{ + dim = 1; + if (strcmp(str, "k1") == 0) return (void *) k1; + if (strcmp(str, "k2") == 0) return (void *) k2; + if (strcmp(str, "mu") == 0) return (void *) mu; + return nullptr; +} diff --git a/src/EXTRA-MOLECULE/angle_mwlc.h b/src/EXTRA-MOLECULE/angle_mwlc.h new file mode 100644 index 0000000000..512d1d1534 --- /dev/null +++ b/src/EXTRA-MOLECULE/angle_mwlc.h @@ -0,0 +1,52 @@ +/* -*- c++ -*- ---------------------------------------------------------- +LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef ANGLE_CLASS +// clang-format off +AngleStyle(mwlc,AngleMWLC); +// clang-format on +#else + +#ifndef LMP_ANGLE_MWLC_H +#define LMP_ANGLE_MWLC_H + +#include "angle.h" + +namespace LAMMPS_NS { + + class AngleMWLC : public Angle { + public: + AngleMWLC(class LAMMPS *); + ~AngleMWLC() override; + void compute(int, int) override; + void coeff(int, char **) override; + double equilibrium_angle(int) override; + void write_restart(FILE *) override; + void read_restart(FILE *) override; + void write_data(FILE *) override; + double single(int, int, int, int) override; + void born_matrix(int type, int i1, int i2, int i3, double &du, double &du2) override; + void *extract(const char *, int &) override; + + protected: + double *k1; + double *k2; + double *mu; + + virtual void allocate(); + }; + +} // namespace LAMMPS_NS + +#endif +#endif From 38e4df45efa6318dd6b2147132ce175492789159 Mon Sep 17 00:00:00 2001 From: farrelljd Date: Tue, 24 Sep 2024 10:33:08 +0800 Subject: [PATCH 009/161] mwlc angle documentation --- doc/src/Commands_bond.rst | 1 + doc/src/angle_mwlc.rst | 81 +++++++++++++++++++++ doc/src/angle_style.rst | 1 + doc/utils/sphinx-config/false_positives.txt | 7 ++ 4 files changed, 90 insertions(+) create mode 100644 doc/src/angle_mwlc.rst diff --git a/doc/src/Commands_bond.rst b/doc/src/Commands_bond.rst index 3be40a6158..40532bdef7 100644 --- a/doc/src/Commands_bond.rst +++ b/doc/src/Commands_bond.rst @@ -90,6 +90,7 @@ OPT. * :doc:`lepton (o) ` * :doc:`mesocnt ` * :doc:`mm3 ` + * :doc:`mwlc ` * :doc:`quartic (o) ` * :doc:`spica (ko) ` * :doc:`table (o) ` diff --git a/doc/src/angle_mwlc.rst b/doc/src/angle_mwlc.rst new file mode 100644 index 0000000000..e70c518bd7 --- /dev/null +++ b/doc/src/angle_mwlc.rst @@ -0,0 +1,81 @@ +.. index:: angle_style mwlc + +angle_style mwlc command +========================== + +Syntax +"""""" + +.. 
code-block:: LAMMPS + + angle_style mwlc + +Examples +"""""""" + +.. code-block:: LAMMPS + + angle_style mwlc + angle_coeff * 25.0 1.0 10.0 + +Description +""""""""""" + +The *mwlc* angle style models a meltable wormlike chain, using a potential that is a canonical-ensemble superposition of +a non-melted and a melted state :ref:`(Farrell) `, + +.. math:: + + \beta E = -\log [q + q^{m}], + +where + +.. math:: + q = \exp [-l_{p}(1+\cos{\theta})/\sigma], \\ + q^{m} = \exp [-\beta\mu-l_{p}^{m}(1+\cos{\theta})/\sigma], + +:math:`l_{p}` is the persistence length of the non-melted state, +:math:`l_{p}^{m}` is the persistence length of the melted state, +and :math:`\mu` is the melting energy. + +This potential is a continuous version of the two-state potential +introduced by :ref:`(Yan) `. + +The following coefficients must be defined for each angle type via the +:doc:`angle_coeff ` command as in the example above, or in +the data file or restart files read by the :doc:`read_data ` +or :doc:`read_restart ` commands: + +* :math:`l_{p}` (distance) +* :math:`l_{p}^{m}` (distance) +* :math:`\mu` (energy) + +---------- + + +Restrictions +"""""""""""" + +This angle style can only be used if LAMMPS was built with the +EXTRA-MOLECULE package. See the :doc:`Build package ` doc page +for more info. + +Related commands +"""""""""""""""" + +:doc:`angle_coeff ` + +Default +""""""" + +none + +---------- + +.. _Farrell: + +**(Farrell)** Farrell, Dobnikar, Podgornik, Curk, Phys Rev Lett, in production. + +.. _Yan: + +**(Yan)** Yan, Marko, Phys Rev Lett, 93, 108108 (2004). diff --git a/doc/src/angle_style.rst b/doc/src/angle_style.rst index dc16a3fbaa..ab7671c75d 100644 --- a/doc/src/angle_style.rst +++ b/doc/src/angle_style.rst @@ -94,6 +94,7 @@ of (g,i,k,o,t) to indicate which accelerated styles exist. 
* :doc:`lepton ` - angle potential from evaluating a string * :doc:`mesocnt ` - piecewise harmonic and linear angle for bending-buckling of nanotubes * :doc:`mm3 ` - anharmonic angle +* :doc:`mwlc ` - meltable wormlike chain * :doc:`quartic ` - angle with cubic and quartic terms * :doc:`spica ` - harmonic angle with repulsive SPICA pair style between 1-3 atoms * :doc:`table ` - tabulated by angle diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index 70d6b4e323..159d6e4f1f 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -711,6 +711,7 @@ Dasgupta dashpot dat datafile +dataset datatype datums Davidchack @@ -2109,6 +2110,7 @@ Marchi Mariella Marinica Markland +Marko Marrink Marroquin Marsaglia @@ -2191,6 +2193,7 @@ Meissner Melchor Meloni Melrose +meltable mem Mem memalign @@ -2399,6 +2402,7 @@ mV Mvapich mvh mvv +mwlc MxN myCompute myIndex @@ -2919,6 +2923,7 @@ Pmoltrans pN png podd +Podgornik Podhorszki Poiseuille poisson @@ -4100,6 +4105,7 @@ workflow workflows Workum Worley +wormlike Wriggers writedata Wuppertal @@ -4168,6 +4174,7 @@ yaff YAFF Yamada yaml +Yan Yanxon Yaser Yazdani From e06be9b030d00cc06e1accd54e27db9dd7e6ce0e Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 3 Oct 2024 15:39:33 -0400 Subject: [PATCH 010/161] propagate new c++ standard handling from main CMakeLists.txt to plugin version --- examples/plugins/CMakeLists.txt | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/examples/plugins/CMakeLists.txt b/examples/plugins/CMakeLists.txt index 8771b29121..60cbd01d73 100644 --- a/examples/plugins/CMakeLists.txt +++ b/examples/plugins/CMakeLists.txt @@ -42,8 +42,28 @@ else() endif() endif() -# C++11 is required -set(CMAKE_CXX_STANDARD 11) +# we *require* C++11 without extensions but prefer C++17. 
+# Kokkos requires at least C++17 (currently) +if(NOT CMAKE_CXX_STANDARD) + if(cxx_std_17 IN_LIST CMAKE_CXX_COMPILE_FEATURES) + set(CMAKE_CXX_STANDARD 17) + else() + set(CMAKE_CXX_STANDARD 11) + endif() +endif() +if(CMAKE_CXX_STANDARD LESS 11) + message(FATAL_ERROR "C++ standard must be set to at least 11") +endif() +if(CMAKE_CXX_STANDARD LESS 17) + message(WARNING "Selecting C++17 standard is preferred over C++${CMAKE_CXX_STANDARD}") +endif() +if(PKG_KOKKOS AND (CMAKE_CXX_STANDARD LESS 17)) + set(CMAKE_CXX_STANDARD 17) +endif() +# turn off C++17 check in lmptype.h +if(LAMMPS_CXX11) + add_compile_definitions(LAMMPS_CXX11) +endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) # Need -restrict with Intel compilers From 0a3d213ed9073ae6f2694e596d1ffa99fdc7f1d0 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 4 Oct 2024 16:02:53 -0400 Subject: [PATCH 011/161] turn hard requirement for CMake 3.20 into a warning for now --- cmake/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index a8a447e2d8..61bab2bee2 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -2,7 +2,10 @@ ######################################## # CMake build system # This file is part of LAMMPS -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.16) +if(CMAKE_VERSION VERSION_LESS 3.20) + message(WARNING "LAMMPS is planning require at least CMake version 3.20 by Summer 2025. 
Please upgrade!") +endif() ######################################## # set policy to silence warnings about ignoring _ROOT but use it if(POLICY CMP0074) From 166f0cb5eaa0834c2a2f7eb72e707da0b845ff7b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sat, 5 Oct 2024 01:04:26 -0400 Subject: [PATCH 012/161] print warning only when package selected --- cmake/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 7ff8eb6abf..cbc8a100fb 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -364,11 +364,13 @@ endforeach() set(DEPRECATED_PACKAGES AWPMD ATC POEMS) foreach(PKG ${DEPRECATED_PACKAGES}) - message(WARNING + if(PKG_${PKG}) + message(WARNING "The ${PKG} package will be removed from LAMMPS in Summer 2025 due to lack of " "maintenance and use of code constructs that conflict with modern C++ compilers " "and standards. Please contact developers@lammps.org if you have any concerns " "about this step.") + endif() endforeach() ###################################################### From a73baf81b1decaeec47955035e0aea40870c201c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sat, 5 Oct 2024 01:04:39 -0400 Subject: [PATCH 013/161] update settings --- cmake/presets/oneapi.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/presets/oneapi.cmake b/cmake/presets/oneapi.cmake index 393d1d9b68..403494c409 100644 --- a/cmake/presets/oneapi.cmake +++ b/cmake/presets/oneapi.cmake @@ -18,11 +18,11 @@ set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE) unset(HAVE_OMP_H_INCLUDE CACHE) set(OpenMP_C "icx" CACHE STRING "" FORCE) -set(OpenMP_C_FLAGS "-qopenmp;-qopenmp-simd" CACHE STRING "" FORCE) +set(OpenMP_C_FLAGS "-qopenmp" CACHE STRING "" FORCE) set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE) set(OpenMP_CXX "icpx" CACHE STRING "" FORCE) -set(OpenMP_CXX_FLAGS "-qopenmp;-qopenmp-simd" CACHE STRING "" FORCE) +set(OpenMP_CXX_FLAGS "-qopenmp" CACHE STRING "" 
FORCE) set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE) -set(OpenMP_Fortran_FLAGS "-qopenmp;-qopenmp-simd" CACHE STRING "" FORCE) +set(OpenMP_Fortran_FLAGS "-qopenmp" CACHE STRING "" FORCE) set(OpenMP_omp_LIBRARY "libiomp5.so" CACHE PATH "" FORCE) From 74e449605a161780e80a4d709ca2cfa53322d713 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 15 Oct 2024 14:48:30 -0400 Subject: [PATCH 014/161] add warning to PyLammps that it will be removed soon --- python/lammps/pylammps.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/lammps/pylammps.py b/python/lammps/pylammps.py index 1f5a1a0db9..3f1ac2e2b3 100644 --- a/python/lammps/pylammps.py +++ b/python/lammps/pylammps.py @@ -468,6 +468,9 @@ class PyLammps(object): self.comm_nprocs = self.lmp.extract_setting("world_size") self.comm_me = self.lmp.extract_setting("world_rank") if self.comm_me == 0: + print("\nWARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING") + print("WARNING: The PyLammps class is obsolete and will be removed from LAMMPS soon.") + print("WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING\n") print("LAMMPS output is captured by PyLammps wrapper") if self.comm_nprocs > 1: print("WARNING: Using PyLammps with multiple MPI ranks is experimental. Not all functionality is supported.") From cfb8b25c6ef6accc628c04b9fe6cb3d020634d79 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 24 Oct 2024 13:33:42 -0400 Subject: [PATCH 015/161] fix grammar --- cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 99048778ae..cda833944e 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -4,7 +4,7 @@ # This file is part of LAMMPS cmake_minimum_required(VERSION 3.16) if(CMAKE_VERSION VERSION_LESS 3.20) - message(WARNING "LAMMPS is planning require at least CMake version 3.20 by Summer 2025. 
Please upgrade!") + message(WARNING "LAMMPS is planning to require at least CMake version 3.20 by Summer 2025. Please upgrade!") endif() ######################################## # set policy to silence warnings about ignoring _ROOT but use it From 15999f6518249d6b3104c502b1928dda0fca476f Mon Sep 17 00:00:00 2001 From: ljwoods2 Date: Tue, 12 Nov 2024 14:30:10 -0700 Subject: [PATCH 016/161] initial commit --- src/MISC/fix_imd.cpp | 779 +++++++++++++++++++++++++++++++++++++------ src/MISC/fix_imd.h | 26 +- 2 files changed, 704 insertions(+), 101 deletions(-) diff --git a/src/MISC/fix_imd.cpp b/src/MISC/fix_imd.cpp index 41ce97ee65..c295fdfaf8 100644 --- a/src/MISC/fix_imd.cpp +++ b/src/MISC/fix_imd.cpp @@ -76,6 +76,7 @@ negotiate an appropriate license for such distribution." #endif #include +#include using namespace LAMMPS_NS; using namespace FixConst; @@ -358,7 +359,6 @@ typedef struct { } IMDheader; #define IMDHEADERSIZE 8 -#define IMDVERSION 2 typedef enum IMDType_t { IMD_DISCONNECT, /**< close IMD connection, leaving sim running */ @@ -370,7 +370,15 @@ typedef enum IMDType_t { IMD_MDCOMM, /**< MDComm style force data */ IMD_PAUSE, /**< pause the running simulation */ IMD_TRATE, /**< set IMD update transmission rate */ - IMD_IOERROR /**< indicate an I/O error */ + IMD_IOERROR, /**< indicate an I/O error */ + /* IMDv3 only headers */ + IMD_SESSIONINFO, + IMD_RESUME, + IMD_TIME, + IMD_BOX, + IMD_VELOCITIES, + IMD_FORCES, + IMD_WAIT, } IMDType; /**< IMD command message type enumerations */ typedef struct { @@ -386,8 +394,20 @@ typedef struct { float Eimpr; /**< Improper energy, Kcal/mol */ } IMDEnergies; /**< IMD simulation energy report structure */ +/* IMDv3 only */ +typedef struct IMDSessionInfo { + bool time; + bool box; + bool coords; + bool unwrap; + bool velocities; + bool forces; + bool energies; +} IMDSessionInfo; + /** Send control messages - these consist of a header with no subsequent data */ -static int imd_handshake(void *); /**< check endianness, 
version compat */ +static int imd_handshake_v2(void *); /**< check endianness, version compat */ +static int imd_handshake_v3(void *, IMDSessionInfo *); /** Receive header and data */ static IMDType imd_recv_header(void *, int32 *); /** Receive MDComm-style forces, units are Kcal/mol/angstrom */ @@ -426,19 +446,22 @@ static void imdsock_destroy(void *); * The implementation follows at the end of the file. * ***************************************************************/ -/* struct for packed data communication of coordinates and forces. */ +/* struct for packed data communication of coordinates, velocities, and forces. */ struct commdata { tagint tag; float x,y,z; }; +MPI_Datatype MPI_CommData; + /*************************************************************** * create class and parse arguments in LAMMPS script. Syntax: - * fix ID group-ID imd [unwrap (on|off)] [fscale ] + * fix ID group-ID imd [version (2|3)] [unwrap (on|off)] [fscale ] [time (on|off)] [box (on|off)] [coordinates (on|off)] [velocities (on|off)] [forces (on|off)] ***************************************************************/ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) { + if (narg < 4) error->all(FLERR,"Illegal fix imd command"); @@ -447,12 +470,21 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : error->all(FLERR,"Illegal fix imd parameter: port < 1024"); /* default values for optional flags */ + imd_version = 2; + unwrap_flag = 0; nowait_flag = 0; connect_msg = 1; imd_fscale = 1.0; imd_trate = 1; + /* IMDv3-only flags. 
Aren't stored as class attributes since they're converted into IMDSessionInfo */ + int time_flag = 1; + int box_flag = 1; + int coord_flag = 1; + int vel_flag = 1; + int force_flag = 1; + /* parse optional arguments */ int iarg = 4; while (iarg+1 < narg) { @@ -464,9 +496,19 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : imd_fscale = utils::numeric(FLERR,arg[iarg+1],false,lmp); } else if (0 == strcmp(arg[iarg], "trate")) { imd_trate = utils::inumeric(FLERR,arg[iarg+1],false,lmp); - } else { - error->all(FLERR,"Unknown fix imd parameter"); - } + } else if (0 == strcmp(arg[iarg], "version")) { + imd_version = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + } else if (0 == strcmp(arg[iarg], "time")) { + time_flag = utils::logical(FLERR, arg[iarg+1], false, lmp); + } else if (0 == strcmp(arg[iarg], "box")) { + box_flag = utils::logical(FLERR, arg[iarg+1], false, lmp); + } else if (0 == strcmp(arg[iarg], "coordinates")) { + coord_flag = utils::logical(FLERR, arg[iarg+1], false, lmp); + } else if (0 == strcmp(arg[iarg], "velocities")) { + vel_flag = utils::logical(FLERR, arg[iarg+1], false, lmp); + } else if (0 == strcmp(arg[iarg], "forces")) { + force_flag = utils::logical(FLERR, arg[iarg+1], false, lmp); + } else error->all(FLERR,"Unknown fix imd parameter"); ++iarg; ++iarg; } @@ -474,12 +516,43 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : if (imd_trate < 1) error->all(FLERR,"Illegal fix imd parameter. trate < 1."); + if (imd_version != 2 && imd_version != 3) + error->all(FLERR,"Illegal fix imd parameter. 
version != 2 or 3."); + + imdsinfo = new IMDSessionInfo; + + /* In IMDv2 in LAMMPS, only coordinates are sent*/ + if (imd_version == 2) { + imdsinfo->time = false; + imdsinfo->box = false; + imdsinfo->coords = true; + imdsinfo->unwrap = unwrap_flag; + imdsinfo->velocities = false; + imdsinfo->forces = false; + imdsinfo->energies = false; + } + + if (imd_version == 3) { + imdsinfo->time = time_flag; + imdsinfo->box = box_flag; + imdsinfo->coords = coord_flag; + imdsinfo->unwrap = unwrap_flag; + imdsinfo->velocities = vel_flag; + imdsinfo->forces = force_flag; + imdsinfo->energies = false; + } + + bigint n = group->count(igroup); if (n > MAXSMALLINT) error->all(FLERR,"Too many atoms for fix imd"); num_coords = static_cast (n); - MPI_Comm_rank(world,&me); + // Define MPI dtype for passing x/v/f data + MPI_Type_contiguous(4, MPI_FLOAT, &MPI_CommData); + MPI_Type_commit(&MPI_CommData); + MPI_Comm_rank(world,&me); + /* initialize various imd state variables. */ clientsock = nullptr; localsock = nullptr; @@ -487,14 +560,33 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : imd_inactive = 0; imd_terminate = 0; imd_forces = 0; - force_buf = nullptr; + recv_force_buf = nullptr; maxbuf = 0; - msgdata = nullptr; - msglen = 0; - comm_buf = nullptr; + coord_data = nullptr; + vel_data = nullptr; + force_data = nullptr; idmap = nullptr; rev_idmap = nullptr; + + msglen = 0; + if (imdsinfo->time) { + msglen += 24+IMDHEADERSIZE; + } + if (imdsinfo->box) { + msglen += 9*4+IMDHEADERSIZE; + } + if (imdsinfo->coords) { + msglen += 3*4*num_coords+IMDHEADERSIZE; + } + if (imdsinfo->velocities) { + msglen += 3*4*num_coords+IMDHEADERSIZE; + } + if (imdsinfo->forces) { + msglen += 3*4*num_coords+IMDHEADERSIZE; + } + msgdata = new char[msglen]; + if (me == 0) { /* set up incoming socket on MPI rank 0. 
*/ imdsock_init(); @@ -512,7 +604,7 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : if (imd_terminate) error->all(FLERR,"LAMMPS Terminated on error in IMD."); - /* storage required to communicate a single coordinate or force. */ + /* storage required to communicate a single coordinate, velocity, or force. */ size_one = sizeof(struct commdata); #if defined(LAMMPS_ASYNC_IMD) @@ -546,7 +638,6 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : *********************************/ FixIMD::~FixIMD() { - #if defined(LAMMPS_ASYNC_IMD) if (me == 0) { pthread_mutex_lock(&write_mutex); @@ -561,13 +652,17 @@ FixIMD::~FixIMD() pthread_cond_destroy(&write_cond); } #endif - auto hashtable = (taginthash_t *)idmap; - memory->destroy(comm_buf); - memory->destroy(force_buf); + memory->destroy(coord_data); + memory->destroy(vel_data); + memory->destroy(force_data); + + memory->destroy(msgdata); + memory->destroy(recv_force_buf); taginthash_destroy(hashtable); delete hashtable; free(rev_idmap); + free(imdsinfo); // close sockets imdsock_shutdown(clientsock); imdsock_destroy(clientsock); @@ -583,6 +678,7 @@ int FixIMD::setmask() int mask = 0; mask |= POST_FORCE; mask |= POST_FORCE_RESPA; + mask |= END_OF_STEP; return mask; } @@ -611,6 +707,7 @@ int FixIMD::reconnect() } else { fprintf(screen,"Waiting for IMD connection on port %d. Transfer rate %d.\n",imd_port, imd_trate); } + fflush(screen); } connect_msg = 0; clientsock = nullptr; @@ -637,7 +734,8 @@ int FixIMD::reconnect() return 0; } else { /* check endianness and IMD protocol version. */ - if (imd_handshake(clientsock)) { + if ((imd_version == 2 && imd_handshake_v2(clientsock)) || + (imd_version == 3 && imd_handshake_v3(clientsock, imdsinfo))) { if (screen) fprintf(screen, "IMD handshake error. 
Dropping connection.\n"); imdsock_destroy(clientsock); @@ -680,9 +778,21 @@ void FixIMD::setup(int) if (mask[i] & groupbit) ++nme; MPI_Allreduce(&nme,&nmax,1,MPI_INT,MPI_MAX,world); - memory->destroy(comm_buf); + maxbuf = nmax*size_one; - comm_buf = (void *) memory->smalloc(maxbuf,"imd:comm_buf"); + + if (imdsinfo->coords) { + memory->destroy(coord_data); + coord_data = (void *) memory->smalloc(maxbuf,"imd:coord_data"); + } + if (imdsinfo->velocities) { + memory->destroy(vel_data); + vel_data = (void *) memory->smalloc(maxbuf,"imd:vel_data"); + } + if (imdsinfo->forces) { + memory->destroy(force_data); + force_data = (void *) memory->smalloc(maxbuf,"imd:force_data"); + } connect_msg = 1; reconnect(); @@ -697,11 +807,11 @@ void FixIMD::setup(int) idmap = (void *)hashtable; int tmp, ndata; - auto buf = static_cast(comm_buf); + auto buf = static_cast(coord_data); if (me == 0) { - MPI_Status status; - MPI_Request request; + std::vector statuses; + std::vector requests; auto taglist = new tagint[num_coords]; int numtag=0; /* counter to map atom tags to a 0-based consecutive index list */ @@ -714,15 +824,39 @@ void FixIMD::setup(int) /* loop over procs to receive remote data */ for (i=1; i < comm->nprocs; ++i) { - MPI_Irecv(comm_buf, maxbuf, MPI_BYTE, i, 0, world, &request); - MPI_Send(&tmp, 0, MPI_INT, i, 0, world); - MPI_Wait(&request, &status); - MPI_Get_count(&status, MPI_BYTE, &ndata); - ndata /= size_one; + /* We're assuming tags are consistent across x,v,f */ + bool tag_recvd = false; + statuses.clear(); + requests.clear(); - for (j=0; j < ndata; ++j) { - taglist[numtag] = buf[j].tag; - ++numtag; + if (imdsinfo->coords) { + requests.push_back(MPI_Request()); + MPI_Irecv(coord_data, maxbuf, MPI_BYTE, i, 0, world, &requests.back()); + } + if (imdsinfo->velocities) { + requests.push_back(MPI_Request()); + MPI_Irecv(vel_data, maxbuf, MPI_BYTE, i, 0, world, &requests.back()); + } + if (imdsinfo->forces) { + requests.push_back(MPI_Request()); + MPI_Irecv(vel_data, 
maxbuf, MPI_BYTE, i, 0, world, &requests.back()); + } + statuses.resize(requests.size()); + MPI_Send(&tmp, 0, MPI_INT, i, 0, world); + MPI_Waitall(requests.size(), requests.data(), statuses.data()); + + for (size_t k=0; k < statuses.size(); ++k) { + if (!tag_recvd) { + MPI_Get_count(&statuses[k], MPI_BYTE, &ndata); + ndata /= size_one; + for (j=0; j < ndata; ++j) { + taglist[numtag] = buf[j].tag; + ++numtag; + } + tag_recvd = true; + } else { + break; + } } } @@ -747,7 +881,15 @@ void FixIMD::setup(int) } /* blocking receive to wait until it is our turn to send data. */ MPI_Recv(&tmp, 0, MPI_INT, 0, 0, world, MPI_STATUS_IGNORE); - MPI_Rsend(comm_buf, nme*size_one, MPI_BYTE, 0, 0, world); + if (imdsinfo->coords) { + MPI_Rsend(coord_data, nme*size_one, MPI_BYTE, 0, 0, world); + } + if (imdsinfo->velocities) { + MPI_Rsend(vel_data, nme*size_one, MPI_BYTE, 0, 0, world); + } + if (imdsinfo->forces) { + MPI_Rsend(force_data, nme*size_one, MPI_BYTE, 0, 0, world); + } } } @@ -795,6 +937,39 @@ void FixIMD::ioworker() * Send coodinates, energies, and add IMD forces to atoms. */ void FixIMD::post_force(int /*vflag*/) { + fflush(screen); + if (imd_version == 2) { + handle_step_v2(); + } + else if (imd_version == 3) { + handle_client_input_v3(); + } + + } + +/* ---------------------------------------------------------------------- */ +void FixIMD::post_force_respa(int vflag, int ilevel, int /*iloop*/) +{ + /* only process IMD on the outmost RESPA level. */ + if (ilevel == nlevels_respa-1) post_force(vflag); +} + +void FixIMD::end_of_step() +{ + if (imd_version == 3 && update->ntimestep % imd_trate == 0) { + handle_output_v3(); + } +} + +/* ---------------------------------------------------------------------- */ +/* local memory usage. approximately. 
*/ +double FixIMD::memory_usage() +{ + return static_cast(num_coords+maxbuf+imd_forces)*size_one; +} + +void FixIMD::handle_step_v2() { + /* check for reconnect */ if (imd_inactive) { reconnect(); @@ -825,22 +1000,12 @@ void FixIMD::post_force(int /*vflag*/) switch(msg) { - case IMD_GO: - if (screen) - fprintf(screen, "Ignoring unexpected IMD_GO message.\n"); - break; - - case IMD_IOERROR: - if (screen) - fprintf(screen, "IMD connection lost.\n"); - /* fallthrough */ - case IMD_DISCONNECT: { /* disconnect from client. wait for new connection. */ imd_paused = 0; imd_forces = 0; - memory->destroy(force_buf); - force_buf = nullptr; + memory->destroy(recv_force_buf); + recv_force_buf = nullptr; imdsock_destroy(clientsock); clientsock = nullptr; if (screen) @@ -866,9 +1031,9 @@ void FixIMD::post_force(int /*vflag*/) case IMD_PAUSE: /* pause the running simulation. wait for second IMD_PAUSE to continue. */ if (imd_paused) { - if (screen) - fprintf(screen, "Continuing run on IMD client request.\n"); - imd_paused = 0; + if (screen) + fprintf(screen, "Continuing run on IMD client request.\n"); + imd_paused = 0; } else { if (screen) fprintf(screen, "Pausing run on IMD client request.\n"); @@ -884,31 +1049,18 @@ void FixIMD::post_force(int /*vflag*/) fprintf(screen, "IMD client requested change of transfer rate. Now it is %d.\n", imd_trate); break; - case IMD_ENERGIES: { - IMDEnergies dummy_energies; - imd_recv_energies(clientsock, &dummy_energies); - break; - } - - case IMD_FCOORDS: { - auto dummy_coords = new float[3*length]; - imd_recv_fcoords(clientsock, length, dummy_coords); - delete[] dummy_coords; - break; - } - case IMD_MDCOMM: { auto imd_tags = new int32[length]; auto imd_fdat = new float[3*length]; imd_recv_mdcomm(clientsock, length, imd_tags, imd_fdat); if (imd_forces < length) { /* grow holding space for forces, if needed. 
*/ - memory->destroy(force_buf); - force_buf = (void *) memory->smalloc((bigint)length*size_one, - "imd:force_buf"); + memory->destroy(recv_force_buf); + recv_force_buf = (void *) memory->smalloc((bigint)length*size_one, + "imd:recv_force_buf"); } imd_forces = length; - buf = static_cast(force_buf); + buf = static_cast(recv_force_buf); /* compare data to hash table */ for (int ii=0; ii < length; ++ii) { @@ -943,12 +1095,12 @@ void FixIMD::post_force(int /*vflag*/) /* check if we need to readjust the forces comm buffer on the receiving nodes. */ if (me != 0) { if (old_imd_forces < imd_forces) { /* grow holding space for forces, if needed. */ - if (force_buf != nullptr) - memory->sfree(force_buf); - force_buf = memory->smalloc((bigint)imd_forces*size_one, "imd:force_buf"); + if (recv_force_buf != nullptr) + memory->sfree(recv_force_buf); + recv_force_buf = memory->smalloc((bigint)imd_forces*size_one, "imd:recv_force_buf"); } } - MPI_Bcast(force_buf, imd_forces*size_one, MPI_BYTE, 0, world); + MPI_Bcast(recv_force_buf, imd_forces*size_one, MPI_BYTE, 0, world); } /* Check if we need to communicate coordinates to the client. @@ -961,7 +1113,7 @@ void FixIMD::post_force(int /*vflag*/) if (update->ntimestep % imd_trate) { if (imd_forces > 0) { double **f = atom->f; - buf = static_cast(force_buf); + buf = static_cast(recv_force_buf); /* XXX. this is in principle O(N**2) == not good. * however we assume for now that the number of atoms @@ -989,27 +1141,25 @@ void FixIMD::post_force(int /*vflag*/) MPI_Allreduce(&nme,&nmax,1,MPI_INT,MPI_MAX,world); if (nmax*size_one > maxbuf) { - memory->destroy(comm_buf); + memory->destroy(coord_data); maxbuf = nmax*size_one; - comm_buf = memory->smalloc(maxbuf,"imd:comm_buf"); + coord_data = memory->smalloc(maxbuf,"imd:coord_data"); } int tmp, ndata; - buf = static_cast(comm_buf); + buf = static_cast(coord_data); if (me == 0) { MPI_Status status; MPI_Request request; /* collect data into new array. 
we bypass the IMD API to save * us one extra copy of the data. */ - msglen = 3*sizeof(float)*num_coords+IMDHEADERSIZE; - msgdata = new char[msglen]; imd_fill_header((IMDheader *)msgdata, IMD_FCOORDS, num_coords); /* array pointer, to the offset where we receive the coordinates. */ - auto recvcoord = (float *) (msgdata+IMDHEADERSIZE); + auto recvcoord = (float *) (msgdata+IMDHEADERSIZE); /* add local data */ - if (unwrap_flag) { + if (imdsinfo->unwrap) { double xprd = domain->xprd; double yprd = domain->yprd; double zprd = domain->zprd; @@ -1049,10 +1199,9 @@ void FixIMD::post_force(int /*vflag*/) } } } - /* loop over procs to receive remote data */ for (i=1; i < comm->nprocs; ++i) { - MPI_Irecv(comm_buf, maxbuf, MPI_BYTE, i, 0, world, &request); + MPI_Irecv(coord_data, maxbuf, MPI_BYTE, i, 0, world, &request); MPI_Send(&tmp, 0, MPI_INT, i, 0, world); MPI_Wait(&request, &status); MPI_Get_count(&status, MPI_BYTE, &ndata); @@ -1067,7 +1216,6 @@ void FixIMD::post_force(int /*vflag*/) } } } - /* done collecting frame data now communicate with IMD client. */ #if defined(LAMMPS_ASYNC_IMD) @@ -1081,13 +1229,12 @@ void FixIMD::post_force(int /*vflag*/) if (clientsock && imdsock_selwrite(clientsock,0)) { imd_writen(clientsock, msgdata, msglen); } - delete[] msgdata; #endif } else { /* copy coordinate data into communication buffer */ nme = 0; - if (unwrap_flag) { + if (imdsinfo->unwrap) { double xprd = domain->xprd; double yprd = domain->yprd; double zprd = domain->zprd; @@ -1128,23 +1275,439 @@ void FixIMD::post_force(int /*vflag*/) } /* blocking receive to wait until it is our turn to send data. 
*/ MPI_Recv(&tmp, 0, MPI_INT, 0, 0, world, MPI_STATUS_IGNORE); - MPI_Rsend(comm_buf, nme*size_one, MPI_BYTE, 0, 0, world); + MPI_Rsend(coord_data, nme*size_one, MPI_BYTE, 0, 0, world); } - } - -/* ---------------------------------------------------------------------- */ -void FixIMD::post_force_respa(int vflag, int ilevel, int /*iloop*/) -{ - /* only process IMD on the outmost RESPA level. */ - if (ilevel == nlevels_respa-1) post_force(vflag); } -/* ---------------------------------------------------------------------- */ -/* local memory usage. approximately. */ -double FixIMD::memory_usage() -{ - return static_cast(num_coords+maxbuf+imd_forces)*size_one; +void FixIMD::handle_client_input_v3() { + // IMDV3 + + /* check for reconnect */ + if (imd_inactive) { + reconnect(); + MPI_Bcast(&imd_inactive, 1, MPI_INT, 0, world); + MPI_Bcast(&imd_terminate, 1, MPI_INT, 0, world); + if (imd_terminate) + error->all(FLERR,"LAMMPS terminated on error in setting up IMD connection."); + if (imd_inactive) + return; /* IMD client has detached and not yet come back. do nothing. */ + } + + struct commdata *buf; + int nlocal = atom->nlocal; + int *mask = atom->mask; + tagint *tag = atom->tag; + double **f = atom->f; + + if (me == 0) { + /* process all pending incoming data. */ + int imd_paused=0; + while ((imdsock_selread(clientsock, 0) > 0) || imd_paused) { + /* if something requested to turn off IMD while paused get out */ + if (imd_inactive) break; + + int32 length; + int msg = imd_recv_header(clientsock, &length); + + switch(msg) { + + case IMD_DISCONNECT: { + /* disconnect from client. wait for new connection. */ + imd_paused = 0; + imd_forces = 0; + memory->destroy(recv_force_buf); + recv_force_buf = nullptr; + imdsock_destroy(clientsock); + clientsock = nullptr; + if (screen) + fprintf(screen, "IMD client detached. 
LAMMPS run continues.\n"); + + connect_msg = 1; + reconnect(); + if (imd_terminate) imd_inactive = 1; + break; + } + + case IMD_KILL: + /* stop the simulation job and shutdown IMD */ + if (screen) + fprintf(screen, "IMD client requested termination of run.\n"); + imd_inactive = 1; + imd_terminate = 1; + imd_paused = 0; + imdsock_destroy(clientsock); + clientsock = nullptr; + break; + + case IMD_PAUSE: + /* pause the running simulation. wait for second IMD_PAUSE to continue. */ + if (!imd_paused) { + if (screen) + fprintf(screen, "Pausing run on IMD client request.\n"); + imd_paused = 1; + } else { + // Pause in IMDv3 is idempotent + continue; + } + break; + + case IMD_RESUME: + /* resume the running simulation. */ + if (imd_paused) { + if (screen) + fprintf(screen, "Continuing run on IMD client request.\n"); + imd_paused = 0; + } else { + // Resume in IMDv3 is idempotent + continue; + } + break; + + case IMD_TRATE: + /* change the IMD transmission data rate */ + if (length > 0) + imd_trate = length; + if (screen) + fprintf(screen, "IMD client requested change of transfer rate. Now it is %d.\n", imd_trate); + break; + + case IMD_MDCOMM: { + auto imd_tags = new int32[length]; + auto imd_fdat = new float[3*length]; + imd_recv_mdcomm(clientsock, length, imd_tags, imd_fdat); + + if (imd_forces < length) { /* grow holding space for forces, if needed. 
*/ + memory->destroy(recv_force_buf); + recv_force_buf = (void *) memory->smalloc((bigint)length*size_one, + "imd:recv_force_buf"); + } + imd_forces = length; + buf = static_cast(recv_force_buf); + + /* compare data to hash table */ + for (int ii=0; ii < length; ++ii) { + buf[ii].tag = rev_idmap[imd_tags[ii]]; + buf[ii].x = imd_fdat[3*ii]; + buf[ii].y = imd_fdat[3*ii+1]; + buf[ii].z = imd_fdat[3*ii+2]; + } + delete[] imd_tags; + delete[] imd_fdat; + break; + } + case IMD_WAIT: { + /* Change IMD waiting behavior mid-session */ + if (length) { + nowait_flag = 0; + } + else { + nowait_flag = 1; + } + break; + } + + default: + if (screen) + fprintf(screen, "Unhandled incoming IMD message #%d. length=%d\n", msg, length); + break; + } + } + } + + /* update all tasks with current settings. */ + int old_imd_forces = imd_forces; + MPI_Bcast(&imd_trate, 1, MPI_INT, 0, world); + MPI_Bcast(&imd_inactive, 1, MPI_INT, 0, world); + MPI_Bcast(&imd_forces, 1, MPI_INT, 0, world); + MPI_Bcast(&imd_terminate, 1, MPI_INT, 0, world); + if (imd_terminate) + error->all(FLERR,"LAMMPS terminated on IMD request."); + + if (imd_forces > 0) { + /* check if we need to readjust the forces comm buffer on the receiving nodes. */ + if (me != 0) { + if (old_imd_forces < imd_forces) { /* grow holding space for forces, if needed. */ + if (recv_force_buf != nullptr) + memory->sfree(recv_force_buf); + recv_force_buf = memory->smalloc((bigint)imd_forces*size_one, "imd:recv_force_buf"); + } + } + MPI_Bcast(recv_force_buf, imd_forces*size_one, MPI_BYTE, 0, world); + } + + /* Check if we need to communicate coordinates to the client. + * Tuning imd_trate allows to keep the overhead for IMD low + * at the expense of a more jumpy display. Rather than using + * end_of_step() we do everything here in one go. + * + * If we don't communicate, only check if we have forces + * stored away and apply them. */ + if (imd_forces > 0) { + buf = static_cast(recv_force_buf); + + /* XXX. 
this is in principle O(N**2) == not good. + * however we assume for now that the number of atoms + * that we manipulate via IMD will be small compared + * to the total system size, so we don't hurt too much. */ + for (int j=0; j < imd_forces; ++j) { + for (int i=0; i < nlocal; ++i) { + if (mask[i] & groupbit) { + if (buf[j].tag == tag[i]) { + f[i][0] += imd_fscale*buf[j].x; + f[i][1] += imd_fscale*buf[j].y; + f[i][2] += imd_fscale*buf[j].z; + } + } + } + } + } +} + +void FixIMD::handle_output_v3() { + + tagint *tag = atom->tag; + double **x = atom->x; + double **v = atom->v; + double **f = atom->f; + imageint *image = atom->image; + int nlocal = atom->nlocal; + int *mask = atom->mask; + struct commdata *buf; + + // Only main process should use: + float *global_coords = nullptr; + float *global_vel = nullptr; + float *global_force = nullptr; + + // Prepare offsets in outgoing buffer + // Fill what we can wihtout collecting from other processes + if (me == 0) { + int offset = 0; + if (imdsinfo->time) { + imd_fill_header((IMDheader *)msgdata, IMD_TIME, 1); + double dt = update->dt; + + double currtime = update->atime + ((update->ntimestep - update->atimestep) * update->dt); + long long currstep = update->ntimestep; + char *time = (msgdata+IMDHEADERSIZE); + + memcpy(time, &dt, sizeof(dt)); + memcpy(time+sizeof(dt), &currtime, sizeof(currtime)); + memcpy(time+sizeof(dt)+sizeof(currtime), &currstep, sizeof(currstep)); + offset += IMDHEADERSIZE+sizeof(dt)+sizeof(currtime)+sizeof(currstep); + } + if (imdsinfo->box) { + imd_fill_header((IMDheader *)(msgdata + offset), IMD_BOX, 1); + // Get triclinic box vectors + float *box = (float *)(msgdata+offset+IMDHEADERSIZE); + box[0] = domain->h[0]; + box[1] = 0.0; + box[2] = 0.0; + box[3] = domain->h[5]; + box[4] = domain->h[1]; + box[5] = 0.0; + box[6] = domain->h[4]; + box[7] = domain->h[3]; + box[8] = domain->h[2]; + + offset += (9*4)+IMDHEADERSIZE; + + } + if (imdsinfo->coords) { + imd_fill_header((IMDheader *)(msgdata + 
offset), IMD_FCOORDS, num_coords); + global_coords = (float *) (msgdata + offset + IMDHEADERSIZE); + offset += 3*4*num_coords+IMDHEADERSIZE; + } + if (imdsinfo->velocities) { + imd_fill_header((IMDheader *)(msgdata + offset), IMD_VELOCITIES, num_coords); + global_vel = (float *) (msgdata + offset + IMDHEADERSIZE); + offset += 3*4*num_coords+IMDHEADERSIZE; + } + if (imdsinfo->forces) { + imd_fill_header((IMDheader *)(msgdata + offset), IMD_FORCES, num_coords); + global_force = (float *) (msgdata + offset + IMDHEADERSIZE); + offset += 3*4*num_coords+IMDHEADERSIZE; + } + } + + int ntotal, nmax, nme=0; + for (int i=0; i < nlocal; ++i) + if (mask[i] & groupbit) ++nme; + + // Atoms per proc + int *recvcounts = nullptr; + // Displacements in recv for each proc + int *displs = nullptr; + + + if (me == 0) { + recvcounts = new int[comm->nprocs]; + displs = new int[comm->nprocs]; + } + + MPI_Gather(&nme, 1, MPI_INT, recvcounts, 1, MPI_INT, 0, world); + + if (me == 0) { + displs[0] = 0; + ntotal = recvcounts[0]; + for (int i = 1; i < comm->nprocs; ++i) { + displs[i] = displs[i - 1] + recvcounts[i - 1]; + ntotal += recvcounts[i]; + } + } + + if (imdsinfo->coords) { + commdata *recvcoord = nullptr; + memory->destroy(coord_data); + coord_data = memory->smalloc(nme*size_one, "imd:coord_data"); + buf = static_cast(coord_data); + int idx = 0; + if (imdsinfo->unwrap) { + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double xy = domain->xy; + double xz = domain->xz; + double yz = domain->yz; + for (int i = 0; i < nlocal; ++i) { + if (mask[i] & groupbit) { + int ix = (image[i] & IMGMASK) - IMGMAX; + int iy = (image[i] >> IMGBITS & IMGMASK) - IMGMAX; + int iz = (image[i] >> IMG2BITS) - IMGMAX; + + if (domain->triclinic) { + buf[idx].tag = tag[i]; + buf[idx].x = x[i][0]; + ix * xprd + iy * xy + iz * xz; + buf[idx].y = x[i][1]; + iy * yprd + iz * yz; + buf[idx].z = x[i][2]; + iz * zprd; + } + else { + buf[idx].tag = tag[i]; + buf[idx].x = 
x[i][0]; + ix * xprd; + buf[idx].y = x[i][1]; + iy * yprd; + buf[idx].z = x[i][2]; + iz * zprd; + } + ++idx; + } + } + } + else { + for (int i = 0; i < nlocal; ++i) { + if (mask[i] & groupbit) { + buf[idx].tag = tag[i]; + buf[idx].x = x[i][0]; + buf[idx].y = x[i][1]; + buf[idx].z = x[i][2]; + ++idx; + } + } + } + if (me == 0) { + recvcoord = new commdata[ntotal]; + } + MPI_Gatherv(buf, nme, MPI_CommData, + recvcoord, recvcounts, displs, MPI_CommData, 0, world); + if (me == 0) { + // Sort the coordinates by tag, place in global_coords + for (int i = 0; i < comm->nprocs; ++i) { + for (int j = 0; j < recvcounts[i]; ++j) { + int idx = displs[i]+j; + const tagint t = 3*taginthash_lookup((taginthash_t *)idmap, recvcoord[idx].tag); + if (t != 3*HASH_FAIL) { + global_coords[t] = recvcoord[idx].x; + global_coords[t + 1] = recvcoord[idx].y; + global_coords[t + 2] = recvcoord[idx].z; + } + } + } + } + } + if (imdsinfo->velocities) { + commdata *recvvel = nullptr; + memory->destroy(vel_data); + vel_data = memory->smalloc(nme*size_one, "imd:vel_data"); + buf = static_cast(vel_data); + int idx = 0; + for (int i = 0; i < nlocal; ++i) { + if (mask[i] & groupbit) { + buf[idx].tag = tag[i]; + buf[idx].x = v[i][0]; + buf[idx].y = v[i][1]; + buf[idx].z = v[i][2]; + ++idx; + } + } + if (me == 0) { + recvvel = new commdata[ntotal]; + } + MPI_Gatherv(buf, nme, MPI_CommData, + recvvel, recvcounts, displs, MPI_CommData, 0, world); + if (me == 0) { + // Sort the coordinates by tag, place in global_vels + for (int i = 0; i < comm->nprocs; ++i) { + for (int j = 0; j < recvcounts[i]; ++j) { + int idx = displs[i]+j; + const tagint t = 3*taginthash_lookup((taginthash_t *)idmap, recvvel[idx].tag); + if (t != 3*HASH_FAIL) { + global_vel[t] = recvvel[idx].x; + global_vel[t + 1] = recvvel[idx].y; + global_vel[t + 2] = recvvel[idx].z; + } + } + } + } + } + if (imdsinfo->forces) { + commdata *recvforce = nullptr; + memory->destroy(force_data); + force_data = memory->smalloc(nme*size_one, 
"imd:force_data"); + buf = static_cast(force_data); + int idx = 0; + for (int i = 0; i < nlocal; ++i) { + if (mask[i] & groupbit) { + buf[idx].tag = tag[i]; + buf[idx].x = f[i][0]; + buf[idx].y = f[i][1]; + buf[idx].z = f[i][2]; + ++idx; + } + } + if (me == 0) { + recvforce = new commdata[ntotal]; + } + MPI_Gatherv(buf, nme, MPI_CommData, + recvforce, recvcounts, displs, MPI_CommData, 0, world); + if (me == 0) { + // Sort the coordinates by tag, place in global_coords + for (int i = 0; i < comm->nprocs; ++i) { + for (int j = 0; j < recvcounts[i]; ++j) { + int idx = displs[i]+j; + const tagint t = 3*taginthash_lookup((taginthash_t *)idmap, recvforce[idx].tag); + if (t != 3*HASH_FAIL) { + global_force[t] = recvforce[idx].x; + global_force[t + 1] = recvforce[idx].y; + global_force[t + 2] = recvforce[idx].z; + } + } + } + } + } + +/* done collecting frame data now communicate with IMD client. */ + +#if defined(LAMMPS_ASYNC_IMD) + /* wake up i/o worker thread and release lock on i/o buffer + * we can go back to our MD and let the i/o thread do the rest */ + buf_has_data=1; + pthread_cond_signal(&write_cond); + pthread_mutex_unlock(&write_mutex); +#else + /* send coordinate data, if client is able to accept */ + if (clientsock && imdsock_selwrite(clientsock,0)) { + imd_writen(clientsock, msgdata, msglen); + } +#endif } /* End of FixIMD class implementation. */ @@ -1401,13 +1964,35 @@ static int32 imd_writen(void *s, const char *ptr, int32 n) { return n; } -int imd_handshake(void *s) { +int imd_handshake_v2(void *s) { IMDheader header; imd_fill_header(&header, IMD_HANDSHAKE, 1); - header.length = IMDVERSION; /* Not byteswapped! */ + header.length = 2; /* Not byteswapped! 
*/ return (imd_writen(s, (char *)&header, IMDHEADERSIZE) != IMDHEADERSIZE); } +int imd_handshake_v3(void *s, IMDSessionInfo *imdsinfo) { + IMDheader header; + imd_fill_header(&header, IMD_HANDSHAKE, 1); + header.length = 3; /* Not byteswapped so client can determine native endianness */ + + if (imd_writen(s, (char *)&header, IMDHEADERSIZE) != IMDHEADERSIZE) return -1; + + imd_fill_header(&header, IMD_SESSIONINFO, 7); + unsigned char body[7] = {0}; + body[0] = imdsinfo->time; + body[1] = imdsinfo->energies; + body[2] = imdsinfo->box; + body[3] = imdsinfo->coords; + body[4] = !imdsinfo->unwrap; + body[5] = imdsinfo->velocities; + body[6] = imdsinfo->forces; + + if (imd_writen(s, (char *)&header, IMDHEADERSIZE) != IMDHEADERSIZE || + imd_writen(s, (char *)&body, 7) != 7) return -1; + return 0; +} + /* The IMD receive functions */ IMDType imd_recv_header(void *s, int32 *length) { diff --git a/src/MISC/fix_imd.h b/src/MISC/fix_imd.h index 2312540e66..6b8778dbf0 100644 --- a/src/MISC/fix_imd.h +++ b/src/MISC/fix_imd.h @@ -56,6 +56,9 @@ FixStyle(imd,FixIMD); #include #endif +/* IMDv3 session information */ +struct IMDSessionInfo; + /* prototype for c wrapper that calls the real worker */ extern "C" void *fix_imd_ioworker(void *); @@ -69,8 +72,11 @@ class FixIMD : public Fix { void init() override; void setup(int) override; void post_force(int) override; + void end_of_step() override; void post_force_respa(int, int, int) override; double memory_usage() override; + // Fix nevery at 1, use trate to skip in 'end_of_step' + int nevery = 1; protected: int imd_port; @@ -80,13 +86,17 @@ class FixIMD : public Fix { int num_coords; // total number of atoms controlled by this fix int size_one; // bytes per atom in communication buffer. int maxbuf; // size of atom communication buffer. 
- void *comm_buf; // communication buffer + void *coord_data; // communication buffer for coordinates + void *vel_data; // communication buffer for velocities + void *force_data; // communication buffer for forces void *idmap; // hash for mapping atom indices to consistent order. tagint *rev_idmap; // list of the hash keys for reverse mapping. - int imd_forces; // number of forces communicated via IMD. - void *force_buf; // force data buffer - double imd_fscale; // scale factor for forces. in case VMD's units are off. + int imd_version; // version of the IMD protocol to be used. + + int imd_forces; // number of forces communicated via IMD. + void *recv_force_buf; // force data buffer + double imd_fscale; // scale factor for forces. in case VMD's units are off. int imd_inactive; // true if IMD connection stopped. int imd_terminate; // true if IMD requests termination of run. @@ -96,12 +106,20 @@ class FixIMD : public Fix { int nowait_flag; // true if LAMMPS should not wait with the execution for VMD. int connect_msg; // flag to indicate whether a "listen for connection message" is needed. + /* IMDv3-only */ + IMDSessionInfo *imdsinfo; // session information for IMDv3 + int me; // my MPI rank in this "world". int nlevels_respa; // flag to determine respa levels. int msglen; char *msgdata; + private: + void handle_step_v2(); + void handle_client_input_v3(); + void handle_output_v3(); + #if defined(LAMMPS_ASYNC_IMD) int buf_has_data; // flag to indicate to the i/o thread what to do. 
pthread_mutex_t write_mutex; // mutex for sending coordinates to i/o thread From 940308ba5918da2d6c0f29a9786630387461d715 Mon Sep 17 00:00:00 2001 From: ljwoods2 Date: Tue, 12 Nov 2024 14:38:41 -0700 Subject: [PATCH 017/161] run CI --- .github/workflows/check-cpp23.yml | 1 - .github/workflows/check-vla.yml | 1 - .github/workflows/codeql-analysis.yml | 1 - .github/workflows/compile-msvc.yml | 1 - .github/workflows/coverity.yml | 1 - .github/workflows/full-regression.yml | 1 - .github/workflows/kokkos-regression.yaml | 1 - .github/workflows/quick-regression.yml | 1 - .github/workflows/style-check.yml | 1 - .github/workflows/unittest-linux.yml | 1 - .github/workflows/unittest-macos.yml | 1 - 11 files changed, 11 deletions(-) diff --git a/.github/workflows/check-cpp23.yml b/.github/workflows/check-cpp23.yml index 2cd53f2208..5022b44e0a 100644 --- a/.github/workflows/check-cpp23.yml +++ b/.github/workflows/check-cpp23.yml @@ -14,7 +14,6 @@ on: jobs: build: name: Build with C++23 support enabled - if: ${{ github.repository == 'lammps/lammps' }} runs-on: ubuntu-latest env: CCACHE_DIR: ${{ github.workspace }}/.ccache diff --git a/.github/workflows/check-vla.yml b/.github/workflows/check-vla.yml index ab89018a3d..af1e269333 100644 --- a/.github/workflows/check-vla.yml +++ b/.github/workflows/check-vla.yml @@ -14,7 +14,6 @@ on: jobs: build: name: Build with -Werror=vla - if: ${{ github.repository == 'lammps/lammps' }} runs-on: ubuntu-latest env: CCACHE_DIR: ${{ github.workspace }}/.ccache diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index c7dd945f5f..47fdae311f 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -10,7 +10,6 @@ on: jobs: analyze: name: Analyze - if: ${{ github.repository == 'lammps/lammps' }} runs-on: ubuntu-latest permissions: diff --git a/.github/workflows/compile-msvc.yml b/.github/workflows/compile-msvc.yml index 7560bc0549..4f04979925 100644 --- 
a/.github/workflows/compile-msvc.yml +++ b/.github/workflows/compile-msvc.yml @@ -18,7 +18,6 @@ concurrency: jobs: build: name: Windows Compilation Test - if: ${{ github.repository == 'lammps/lammps' }} runs-on: windows-latest env: CCACHE_DIR: ${{ github.workspace }}/.ccache diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index c0c3e3f89a..7e38643b4c 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -9,7 +9,6 @@ on: jobs: analyze: name: Analyze - if: ${{ github.repository == 'lammps/lammps' }} runs-on: ubuntu-latest container: image: lammps/buildenv:ubuntu20.04 diff --git a/.github/workflows/full-regression.yml b/.github/workflows/full-regression.yml index a6b5353b9b..683f65409d 100644 --- a/.github/workflows/full-regression.yml +++ b/.github/workflows/full-regression.yml @@ -12,7 +12,6 @@ jobs: build: name: Build LAMMPS # restrict to official LAMMPS repository - if: ${{ github.repository == 'lammps/lammps' }} runs-on: ubuntu-latest env: CCACHE_DIR: ${{ github.workspace }}/.ccache diff --git a/.github/workflows/kokkos-regression.yaml b/.github/workflows/kokkos-regression.yaml index 0756b080b0..6238f15c93 100644 --- a/.github/workflows/kokkos-regression.yaml +++ b/.github/workflows/kokkos-regression.yaml @@ -12,7 +12,6 @@ jobs: build: name: Build LAMMPS with Kokkos OpenMP # restrict to official LAMMPS repository - if: ${{ github.repository == 'lammps/lammps' }} runs-on: ubuntu-latest env: CCACHE_DIR: ${{ github.workspace }}/.ccache diff --git a/.github/workflows/quick-regression.yml b/.github/workflows/quick-regression.yml index 88794bfa0a..5325e4b6cb 100644 --- a/.github/workflows/quick-regression.yml +++ b/.github/workflows/quick-regression.yml @@ -16,7 +16,6 @@ jobs: build: name: Build LAMMPS # restrict to official LAMMPS repository - if: ${{ github.repository == 'lammps/lammps' }} runs-on: ubuntu-latest env: CCACHE_DIR: ${{ github.workspace }}/.ccache diff --git a/.github/workflows/style-check.yml 
b/.github/workflows/style-check.yml index 7be2c4fc46..e97163269c 100644 --- a/.github/workflows/style-check.yml +++ b/.github/workflows/style-check.yml @@ -18,7 +18,6 @@ concurrency: jobs: build: name: Programming Style Conformance - if: ${{ github.repository == 'lammps/lammps' }} runs-on: ubuntu-latest steps: diff --git a/.github/workflows/unittest-linux.yml b/.github/workflows/unittest-linux.yml index ce98fcea35..04b1d1f328 100644 --- a/.github/workflows/unittest-linux.yml +++ b/.github/workflows/unittest-linux.yml @@ -18,7 +18,6 @@ concurrency: jobs: build: name: Linux Unit Test - if: ${{ github.repository == 'lammps/lammps' }} runs-on: ubuntu-latest env: CCACHE_DIR: ${{ github.workspace }}/.ccache diff --git a/.github/workflows/unittest-macos.yml b/.github/workflows/unittest-macos.yml index 0d478a9d6b..d3e242774c 100644 --- a/.github/workflows/unittest-macos.yml +++ b/.github/workflows/unittest-macos.yml @@ -18,7 +18,6 @@ concurrency: jobs: build: name: MacOS Unit Test - if: ${{ github.repository == 'lammps/lammps' }} runs-on: macos-13 env: CCACHE_DIR: ${{ github.workspace }}/.ccache From f7915109f97038e88a1ed35f2a801411339f55d7 Mon Sep 17 00:00:00 2001 From: ljwoods2 Date: Mon, 25 Nov 2024 15:22:02 -0700 Subject: [PATCH 018/161] minor bug fixes --- src/MISC/fix_imd.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/MISC/fix_imd.cpp b/src/MISC/fix_imd.cpp index c295fdfaf8..9192f8a24c 100644 --- a/src/MISC/fix_imd.cpp +++ b/src/MISC/fix_imd.cpp @@ -456,7 +456,7 @@ MPI_Datatype MPI_CommData; /*************************************************************** * create class and parse arguments in LAMMPS script. 
Syntax: - * fix ID group-ID imd [version (2|3)] [unwrap (on|off)] [fscale ] [time (on|off)] [box (on|off)] [coordinates (on|off)] [velocities (on|off)] [forces (on|off)] + * fix ID group-ID imd [trate ] [version (2|3)] [unwrap (on|off)] [fscale ] [time (on|off)] [box (on|off)] [coordinates (on|off)] [velocities (on|off)] [forces (on|off)] ***************************************************************/ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) @@ -917,8 +917,10 @@ void FixIMD::ioworker() pthread_exit(nullptr); } else if (buf_has_data > 0) { /* send coordinate data, if client is able to accept */ - if (clientsock && imdsock_selwrite(clientsock,0)) { - imd_writen(clientsock, msgdata, msglen); + if (imd_writen(clientsock, msgdata, msglen) != msglen) { + fprintf(screen,"Asynchronous I/O thread terminated on error in sending IMDFrame.\n"); + pthread_mutex_unlock(&write_mutex); + pthread_exit(nullptr); } delete[] msgdata; buf_has_data=0; @@ -1226,8 +1228,8 @@ void FixIMD::handle_step_v2() { pthread_mutex_unlock(&write_mutex); #else /* send coordinate data, if client is able to accept */ - if (clientsock && imdsock_selwrite(clientsock,0)) { - imd_writen(clientsock, msgdata, msglen); + if (imd_writen(clientsock, msgdata, msglen) != msglen) { + error->all(FLERR, "LAMMPS terminated on error in sending IMDFrame"); } #endif @@ -1703,9 +1705,9 @@ void FixIMD::handle_output_v3() { pthread_cond_signal(&write_cond); pthread_mutex_unlock(&write_mutex); #else - /* send coordinate data, if client is able to accept */ - if (clientsock && imdsock_selwrite(clientsock,0)) { - imd_writen(clientsock, msgdata, msglen); + /* send IMDFrame data, blocking until client accepts */ + if (imd_writen(clientsock, msgdata, msglen) != msglen) { + error->all(FLERR, "LAMMPS terminated on error in sending IMDFrame"); } #endif } From 88807c6ae63fe50bbb82864885c6ebd0af42c897 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Mon, 25 Nov 2024 16:18:50 -0700 Subject: [PATCH 
019/161] ml-pace: allow linking to existing pace library --- cmake/Modules/Packages/ML-PACE.cmake | 92 +++++++++++++++------------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/cmake/Modules/Packages/ML-PACE.cmake b/cmake/Modules/Packages/ML-PACE.cmake index 8660898138..7093626abd 100644 --- a/cmake/Modules/Packages/ML-PACE.cmake +++ b/cmake/Modules/Packages/ML-PACE.cmake @@ -1,50 +1,56 @@ # PACE library support for ML-PACE package +find_package(pace QUIET) -# set policy to silence warnings about timestamps of downloaded files. review occasionally if it may be set to NEW -if(POLICY CMP0135) - cmake_policy(SET CMP0135 OLD) -endif() - -set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2023.11.25.fix.tar.gz" CACHE STRING "URL for PACE evaluator library sources") -set(PACELIB_MD5 "b45de9a633f42ed65422567e3ce56f9f" CACHE STRING "MD5 checksum of PACE evaluator library tarball") -mark_as_advanced(PACELIB_URL) -mark_as_advanced(PACELIB_MD5) -GetFallbackURL(PACELIB_URL PACELIB_FALLBACK) - -# LOCAL_ML-PACE points to top-level dir with local lammps-user-pace repo, -# to make it easier to check local build without going through the public github releases -if(LOCAL_ML-PACE) - set(lib-pace "${LOCAL_ML-PACE}") +if(pace_FOUND) + find_package(pace) + target_link_libraries(lammps PRIVATE pace::pace) else() - # download library sources to build folder - if(EXISTS ${CMAKE_BINARY_DIR}/libpace.tar.gz) - file(MD5 ${CMAKE_BINARY_DIR}/libpace.tar.gz DL_MD5) - endif() - if(NOT "${DL_MD5}" STREQUAL "${PACELIB_MD5}") - message(STATUS "Downloading ${PACELIB_URL}") - file(DOWNLOAD ${PACELIB_URL} ${CMAKE_BINARY_DIR}/libpace.tar.gz STATUS DL_STATUS SHOW_PROGRESS) - file(MD5 ${CMAKE_BINARY_DIR}/libpace.tar.gz DL_MD5) - if((NOT DL_STATUS EQUAL 0) OR (NOT "${DL_MD5}" STREQUAL "${PACELIB_MD5}")) - message(WARNING "Download from primary URL ${PACELIB_URL} failed\nTrying fallback URL ${PACELIB_FALLBACK}") - file(DOWNLOAD ${PACELIB_FALLBACK} 
${CMAKE_BINARY_DIR}/libpace.tar.gz EXPECTED_HASH MD5=${PACELIB_MD5} SHOW_PROGRESS) + # set policy to silence warnings about timestamps of downloaded files. review occasionally if it may be set to NEW + if(POLICY CMP0135) + cmake_policy(SET CMP0135 OLD) endif() - else() - message(STATUS "Using already downloaded archive ${CMAKE_BINARY_DIR}/libpace.tar.gz") - endif() + + set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2023.11.25.fix.tar.gz" CACHE STRING "URL for PACE evaluator library sources") + set(PACELIB_MD5 "b45de9a633f42ed65422567e3ce56f9f" CACHE STRING "MD5 checksum of PACE evaluator library tarball") + mark_as_advanced(PACELIB_URL) + mark_as_advanced(PACELIB_MD5) + GetFallbackURL(PACELIB_URL PACELIB_FALLBACK) + + # LOCAL_ML-PACE points to top-level dir with local lammps-user-pace repo, + # to make it easier to check local build without going through the public github releases + if(LOCAL_ML-PACE) + set(lib-pace "${LOCAL_ML-PACE}") + else() + # download library sources to build folder + if(EXISTS ${CMAKE_BINARY_DIR}/libpace.tar.gz) + file(MD5 ${CMAKE_BINARY_DIR}/libpace.tar.gz DL_MD5) + endif() + if(NOT "${DL_MD5}" STREQUAL "${PACELIB_MD5}") + message(STATUS "Downloading ${PACELIB_URL}") + file(DOWNLOAD ${PACELIB_URL} ${CMAKE_BINARY_DIR}/libpace.tar.gz STATUS DL_STATUS SHOW_PROGRESS) + file(MD5 ${CMAKE_BINARY_DIR}/libpace.tar.gz DL_MD5) + if((NOT DL_STATUS EQUAL 0) OR (NOT "${DL_MD5}" STREQUAL "${PACELIB_MD5}")) + message(WARNING "Download from primary URL ${PACELIB_URL} failed\nTrying fallback URL ${PACELIB_FALLBACK}") + file(DOWNLOAD ${PACELIB_FALLBACK} ${CMAKE_BINARY_DIR}/libpace.tar.gz EXPECTED_HASH MD5=${PACELIB_MD5} SHOW_PROGRESS) + endif() + else() + message(STATUS "Using already downloaded archive ${CMAKE_BINARY_DIR}/libpace.tar.gz") + endif() - # uncompress downloaded sources - execute_process( - COMMAND ${CMAKE_COMMAND} -E remove_directory lammps-user-pace* - COMMAND ${CMAKE_COMMAND} -E tar xzf libpace.tar.gz - 
WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - ) - get_newest_file(${CMAKE_BINARY_DIR}/lammps-user-pace-* lib-pace) -endif() - -add_subdirectory(${lib-pace} build-pace) -set_target_properties(pace PROPERTIES CXX_EXTENSIONS ON OUTPUT_NAME lammps_pace${LAMMPS_MACHINE}) - -if(CMAKE_PROJECT_NAME STREQUAL "lammps") - target_link_libraries(lammps PRIVATE pace) + # uncompress downloaded sources + execute_process( + COMMAND ${CMAKE_COMMAND} -E remove_directory lammps-user-pace* + COMMAND ${CMAKE_COMMAND} -E tar xzf libpace.tar.gz + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) + get_newest_file(${CMAKE_BINARY_DIR}/lammps-user-pace-* lib-pace) + endif() + + add_subdirectory(${lib-pace} build-pace) + set_target_properties(pace PROPERTIES CXX_EXTENSIONS ON OUTPUT_NAME lammps_pace${LAMMPS_MACHINE}) + + if(CMAKE_PROJECT_NAME STREQUAL "lammps") + target_link_libraries(lammps PRIVATE pace) + endif() endif() From aeb9003890c866994a916ea89e278b7982ce7fbe Mon Sep 17 00:00:00 2001 From: ljwoods2 Date: Mon, 25 Nov 2024 18:22:44 -0700 Subject: [PATCH 020/161] bug fixes --- src/MISC/fix_imd.cpp | 218 +++++++++++++++++++++++++++++-------------- src/MISC/fix_imd.h | 2 + 2 files changed, 152 insertions(+), 68 deletions(-) diff --git a/src/MISC/fix_imd.cpp b/src/MISC/fix_imd.cpp index 9192f8a24c..6f2bb53a8c 100644 --- a/src/MISC/fix_imd.cpp +++ b/src/MISC/fix_imd.cpp @@ -76,7 +76,6 @@ negotiate an appropriate license for such distribution." 
#endif #include -#include using namespace LAMMPS_NS; using namespace FixConst; @@ -461,7 +460,6 @@ MPI_Datatype MPI_CommData; FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) { - if (narg < 4) error->all(FLERR,"Illegal fix imd command"); @@ -508,7 +506,9 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : vel_flag = utils::logical(FLERR, arg[iarg+1], false, lmp); } else if (0 == strcmp(arg[iarg], "forces")) { force_flag = utils::logical(FLERR, arg[iarg+1], false, lmp); - } else error->all(FLERR,"Unknown fix imd parameter"); + } else { + error->all(FLERR,"Unknown fix imd parameter"); + } ++iarg; ++iarg; } @@ -542,7 +542,6 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : imdsinfo->energies = false; } - bigint n = group->count(igroup); if (n > MAXSMALLINT) error->all(FLERR,"Too many atoms for fix imd"); num_coords = static_cast (n); @@ -568,25 +567,30 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : idmap = nullptr; rev_idmap = nullptr; + if (imd_version == 3) { + msglen = 0; + if (imdsinfo->time) { + msglen += 24+IMDHEADERSIZE; + } + if (imdsinfo->box) { + msglen += 9*4+IMDHEADERSIZE; + } + if (imdsinfo->coords) { + msglen += 3*4*num_coords+IMDHEADERSIZE; + } + if (imdsinfo->velocities) { + msglen += 3*4*num_coords+IMDHEADERSIZE; + } + if (imdsinfo->forces) { + msglen += 3*4*num_coords+IMDHEADERSIZE; + } + msgdata = new char[msglen]; + } + else { + msglen = 3*sizeof(float)*num_coords+IMDHEADERSIZE; + msgdata = new char[msglen]; + } - msglen = 0; - if (imdsinfo->time) { - msglen += 24+IMDHEADERSIZE; - } - if (imdsinfo->box) { - msglen += 9*4+IMDHEADERSIZE; - } - if (imdsinfo->coords) { - msglen += 3*4*num_coords+IMDHEADERSIZE; - } - if (imdsinfo->velocities) { - msglen += 3*4*num_coords+IMDHEADERSIZE; - } - if (imdsinfo->forces) { - msglen += 3*4*num_coords+IMDHEADERSIZE; - } - msgdata = new char[msglen]; - if (me == 0) { /* set up incoming socket on MPI rank 0. 
*/ imdsock_init(); @@ -638,6 +642,7 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) : *********************************/ FixIMD::~FixIMD() { + #if defined(LAMMPS_ASYNC_IMD) if (me == 0) { pthread_mutex_lock(&write_mutex); @@ -657,12 +662,12 @@ FixIMD::~FixIMD() memory->destroy(vel_data); memory->destroy(force_data); - memory->destroy(msgdata); + delete[] msgdata; memory->destroy(recv_force_buf); taginthash_destroy(hashtable); delete hashtable; free(rev_idmap); - free(imdsinfo); + delete imdsinfo; // close sockets imdsock_shutdown(clientsock); imdsock_destroy(clientsock); @@ -763,6 +768,103 @@ int FixIMD::reconnect() /* wait for IMD client (e.g. VMD) to respond, initialize communication * buffers and collect tag/id maps. */ void FixIMD::setup(int) +{ + if (imd_version == 2) { + setup_v2(); + } + else { + setup_v3(); + } +} + +void FixIMD::setup_v2() { + /* nme: number of atoms in group on this MPI task + * nmax: max number of atoms in group across all MPI tasks + * nlocal: all local atoms + */ + int i,j; + int nmax,nme,nlocal; + int *mask = atom->mask; + tagint *tag = atom->tag; + nlocal = atom->nlocal; + nme=0; + for (i=0; i < nlocal; ++i) + if (mask[i] & groupbit) ++nme; + + MPI_Allreduce(&nme,&nmax,1,MPI_INT,MPI_MAX,world); + memory->destroy(coord_data); + maxbuf = nmax*size_one; + coord_data = (void *) memory->smalloc(maxbuf,"imd:coord_data"); + + connect_msg = 1; + reconnect(); + MPI_Bcast(&imd_inactive, 1, MPI_INT, 0, world); + MPI_Bcast(&imd_terminate, 1, MPI_INT, 0, world); + if (imd_terminate) + error->all(FLERR,"LAMMPS terminated on error in setting up IMD connection."); + + /* initialize and build hashtable. 
*/ + auto hashtable=new taginthash_t; + taginthash_init(hashtable, num_coords); + idmap = (void *)hashtable; + + int tmp, ndata; + auto buf = static_cast(coord_data); + + if (me == 0) { + MPI_Status status; + MPI_Request request; + auto taglist = new tagint[num_coords]; + int numtag=0; /* counter to map atom tags to a 0-based consecutive index list */ + + for (i=0; i < nlocal; ++i) { + if (mask[i] & groupbit) { + taglist[numtag] = tag[i]; + ++numtag; + } + } + + /* loop over procs to receive remote data */ + for (i=1; i < comm->nprocs; ++i) { + MPI_Irecv(coord_data, maxbuf, MPI_BYTE, i, 0, world, &request); + MPI_Send(&tmp, 0, MPI_INT, i, 0, world); + MPI_Wait(&request, &status); + MPI_Get_count(&status, MPI_BYTE, &ndata); + ndata /= size_one; + + for (j=0; j < ndata; ++j) { + taglist[numtag] = buf[j].tag; + ++numtag; + } + } + + /* sort list of tags by value to have consistently the + * same list when running in parallel and build hash table. */ + id_sort(taglist, 0, num_coords-1); + for (i=0; i < num_coords; ++i) { + taginthash_insert(hashtable, taglist[i], i); + } + delete[] taglist; + + /* generate reverse index-to-tag map for communicating + * IMD forces back to the proper atoms */ + rev_idmap=taginthash_keys(hashtable); + } else { + nme=0; + for (i=0; i < nlocal; ++i) { + if (mask[i] & groupbit) { + buf[nme].tag = tag[i]; + ++nme; + } + } + /* blocking receive to wait until it is our turn to send data. 
*/ + MPI_Recv(&tmp, 0, MPI_INT, 0, 0, world, MPI_STATUS_IGNORE); + MPI_Rsend(coord_data, nme*size_one, MPI_BYTE, 0, 0, world); + } + + } + +void FixIMD::setup_v3() { /* nme: number of atoms in group on this MPI task * nmax: max number of atoms in group across all MPI tasks @@ -807,11 +909,24 @@ void FixIMD::setup(int) idmap = (void *)hashtable; int tmp, ndata; - auto buf = static_cast(coord_data); + + struct commdata *buf = nullptr; + if (imdsinfo->coords) { + buf = static_cast(coord_data); + } + else if (imdsinfo->velocities) { + buf = static_cast(vel_data); + } + else if (imdsinfo->forces) { + buf = static_cast(force_data); + } if (me == 0) { - std::vector statuses; - std::vector requests; + if (buf == nullptr) { + return; + } + MPI_Status status; + MPI_Request request; auto taglist = new tagint[num_coords]; int numtag=0; /* counter to map atom tags to a 0-based consecutive index list */ @@ -824,39 +939,15 @@ void FixIMD::setup(int) /* loop over procs to receive remote data */ for (i=1; i < comm->nprocs; ++i) { - /* We're assuming tags are consistent across x,v,f */ - bool tag_recvd = false; - statuses.clear(); - requests.clear(); - - if (imdsinfo->coords) { - requests.push_back(MPI_Request()); - MPI_Irecv(coord_data, maxbuf, MPI_BYTE, i, 0, world, &requests.back()); - } - if (imdsinfo->velocities) { - requests.push_back(MPI_Request()); - MPI_Irecv(vel_data, maxbuf, MPI_BYTE, i, 0, world, &requests.back()); - } - if (imdsinfo->forces) { - requests.push_back(MPI_Request()); - MPI_Irecv(vel_data, maxbuf, MPI_BYTE, i, 0, world, &requests.back()); - } - statuses.resize(requests.size()); + MPI_Irecv(coord_data, maxbuf, MPI_BYTE, i, 0, world, &request); MPI_Send(&tmp, 0, MPI_INT, i, 0, world); - MPI_Waitall(requests.size(), requests.data(), statuses.data()); + MPI_Wait(&request, &status); + MPI_Get_count(&status, MPI_BYTE, &ndata); + ndata /= size_one; - for (size_t k=0; k < statuses.size(); ++k) { - if (!tag_recvd) { - MPI_Get_count(&statuses[k], MPI_BYTE, &ndata); - 
ndata /= size_one; - for (j=0; j < ndata; ++j) { - taglist[numtag] = buf[j].tag; - ++numtag; - } - tag_recvd = true; - } else { - break; - } + for (j=0; j < ndata; ++j) { + taglist[numtag] = buf[j].tag; + ++numtag; } } @@ -881,19 +972,10 @@ void FixIMD::setup(int) } /* blocking receive to wait until it is our turn to send data. */ MPI_Recv(&tmp, 0, MPI_INT, 0, 0, world, MPI_STATUS_IGNORE); - if (imdsinfo->coords) { - MPI_Rsend(coord_data, nme*size_one, MPI_BYTE, 0, 0, world); - } - if (imdsinfo->velocities) { - MPI_Rsend(vel_data, nme*size_one, MPI_BYTE, 0, 0, world); - } - if (imdsinfo->forces) { - MPI_Rsend(force_data, nme*size_one, MPI_BYTE, 0, 0, world); - } + MPI_Rsend(coord_data, nme*size_one, MPI_BYTE, 0, 0, world); } } - /* worker threads for asynchronous i/o */ #if defined(LAMMPS_ASYNC_IMD) /* c bindings wrapper */ diff --git a/src/MISC/fix_imd.h b/src/MISC/fix_imd.h index 6b8778dbf0..03d242f32b 100644 --- a/src/MISC/fix_imd.h +++ b/src/MISC/fix_imd.h @@ -116,6 +116,8 @@ class FixIMD : public Fix { char *msgdata; private: + void setup_v2(); + void setup_v3(); void handle_step_v2(); void handle_client_input_v3(); void handle_output_v3(); From f0c176c6033b14c176da6fd1dff8d401f8939314 Mon Sep 17 00:00:00 2001 From: Tyler Collins Date: Fri, 6 Dec 2024 21:42:23 -0800 Subject: [PATCH 021/161] vcm example script and log added --- examples/vcm/in.vcm.lmp | 82 +++++++++++ examples/vcm/log.19Nov24.vcm.g++.1 | 223 +++++++++++++++++++++++++++++ 2 files changed, 305 insertions(+) create mode 100644 examples/vcm/in.vcm.lmp create mode 100644 examples/vcm/log.19Nov24.vcm.g++.1 diff --git a/examples/vcm/in.vcm.lmp b/examples/vcm/in.vcm.lmp new file mode 100644 index 0000000000..41d7665c49 --- /dev/null +++ b/examples/vcm/in.vcm.lmp @@ -0,0 +1,82 @@ +# Removing Binned Center-of-Mass Velocities from Stress Compute + +units metal +boundary p p p +atom_style atomic +lattice fcc 5.3589 +processors 1 * * + +# Defining regions for box and atoms + +region box1 block -3 24 0 12 0 
12 units lattice +region box2 block 0 12 0 12 0 12 units lattice + +# Creating box and atoms + +create_box 1 box1 +create_atoms 1 region box2 + +mass 1 40.00 + +# Adding energy to the system + +velocity all create 600.0 9999 + +pair_style lj/cut 10 +pair_coeff 1 1 0.04 3.405 + +# Begin time integration + +timestep 2e-3 + +fix fix_nve all nve + +thermo 100 + +run 500 + +# Chunk and stress along x direction + +variable nbins index 20 +variable fraction equal 1.0/v_nbins +variable volfrac equal 1/(vol*${fraction}) + +compute ch_id all chunk/atom bin/1d x lower ${fraction} units reduced +compute ch_temp_vcm all temp/chunk ch_id com yes +compute atom_stress_vcm all stress/atom ch_temp_vcm +variable stress atom -(c_atom_stress_vcm[1])/(vol*${fraction}) +compute ch_stress_vcm all reduce/chunk ch_id sum v_stress + +# Output stress profile in x direction + +# fix ave_stress_vcm all ave/time 5 20 100 c_ch_stress_vcm mode vector file stress_xx.out + +# Piston compressing along x direction + +region piston block -1 1 INF INF INF INF units lattice +group piston region piston +fix fix_piston piston move linear 5 0 0 units box # strain rate ~ 8e10 1/s + +thermo_style custom step temp ke pe lx ly lz pxx pyy pzz econserve + +# Atom dump + +# dump atom_dump all atom 50 dump.vcm + +# # Image dumps + +# dump 2 all image 250 image.*.jpg type type & +# axes yes 0.8 0.02 view 60 -30 +# dump_modify 2 pad 1 + +# # Movie dump + +# dump 3 all movie 125 movie.avi type type & +# axes yes 0.8 0.02 view 60 -30 +# dump_modify 3 pad 1 + +run 500 + +unfix fix_piston + +run 1500 diff --git a/examples/vcm/log.19Nov24.vcm.g++.1 b/examples/vcm/log.19Nov24.vcm.g++.1 new file mode 100644 index 0000000000..ca63b9b5cc --- /dev/null +++ b/examples/vcm/log.19Nov24.vcm.g++.1 @@ -0,0 +1,223 @@ +LAMMPS (19 Nov 2024) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. 
(src/comm.cpp:99) + using 1 OpenMP thread(s) per MPI task +Loaded 1 plugins from C:\Users\tcollins7472\AppData\Local\LAMMPS 64-bit 19Nov2024-MSMPI with Python\plugins +# Removing Binned Center-of-Mass Velocities from Stress Compute + +units metal +boundary p p p +atom_style atomic +lattice fcc 5.3589 +Lattice spacing in x,y,z = 5.3589 5.3589 5.3589 +processors 1 * * + +# Defining regions for box and atoms + +region box1 block -3 24 0 12 0 12 units lattice +region box2 block 0 12 0 12 0 12 units lattice + +# Creating box and atoms + +create_box 1 box1 +Created orthogonal box = (-16.0767 0 0) to (128.6136 64.3068 64.3068) + 1 by 1 by 1 MPI processor grid +create_atoms 1 region box2 +Created 7200 atoms + using lattice units in orthogonal box = (-16.0767 0 0) to (128.6136 64.3068 64.3068) + create_atoms CPU = 0.003 seconds + +mass 1 40.00 + +# Adding energy to the system + +velocity all create 600.0 9999 + +pair_style lj/cut 10 +pair_coeff 1 1 0.04 3.405 + +# Begin time integration + +timestep 2e-3 + +fix fix_nve all nve + +thermo 100 + +run 500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... 
+ update: every = 1 steps, delay = 0 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 12 + ghost atom cutoff = 12 + binsize = 6, bins = 25 11 11 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/cut, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 5.721 | 5.721 | 5.721 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 600 -2252.7567 0 -1694.4304 -974.62456 + 100 284.72172 -1977.4291 0 -1712.483 2453.7429 + 200 304.44519 -1994.7937 0 -1711.4941 1822.2699 + 300 304.28012 -1993.2958 0 -1710.1498 1498.3794 + 400 296.76492 -1985.1364 0 -1708.9836 1259.9474 + 500 295.00895 -1982.4224 0 -1707.9036 964.9526 +Loop time of 3.07247 on 1 procs for 500 steps with 7200 atoms + +Performance: 28.121 ns/day, 0.853 hours/ns, 162.736 timesteps/s, 1.172 Matom-step/s +99.7% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 2.8976 | 2.8976 | 2.8976 | 0.0 | 94.31 +Neigh | 0.11531 | 0.11531 | 0.11531 | 0.0 | 3.75 +Comm | 0.015167 | 0.015167 | 0.015167 | 0.0 | 0.49 +Output | 0.003809 | 0.003809 | 0.003809 | 0.0 | 0.12 +Modify | 0.025306 | 0.025306 | 0.025306 | 0.0 | 0.82 +Other | | 0.01525 | | | 0.50 + +Nlocal: 7200 ave 7200 max 7200 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 6410 ave 6410 max 6410 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 615095 ave 615095 max 615095 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 615095 +Ave neighs/atom = 85.429861 +Neighbor list builds = 9 +Dangerous builds = 0 + +# Chunk and stress along x direction + +variable nbins index 20 +variable fraction equal 1.0/v_nbins +variable volfrac equal 1/(vol*${fraction}) +variable volfrac equal 1/(vol*0.05) + +compute ch_id all chunk/atom bin/1d x lower ${fraction} 
units reduced +compute ch_id all chunk/atom bin/1d x lower 0.05 units reduced +compute ch_temp_vcm all temp/chunk ch_id com yes +compute atom_stress_vcm all stress/atom ch_temp_vcm +variable stress atom -(c_atom_stress_vcm[1])/(vol*${fraction}) +variable stress atom -(c_atom_stress_vcm[1])/(vol*0.05) +compute ch_stress_vcm all reduce/chunk ch_id sum v_stress + +# Output stress profile in x direction + +# fix ave_stress_vcm all ave/time 5 20 100 c_ch_stress_vcm mode vector file stress_xx.out + +# Piston compressing along x direction + +region piston block -1 1 INF INF INF INF units lattice +group piston region piston +863 atoms in group piston +fix fix_piston piston move linear 5 0 0 units box # strain rate ~ 8e10 1/s + +thermo_style custom step temp ke pe lx ly lz pxx pyy pzz econserve + +# Atom dump + +# dump atom_dump all atom 50 dump.vcm + +# # Image dumps + +# dump 2 all image 250 image.*.jpg type type # axes yes 0.8 0.02 view 60 -30 +# dump_modify 2 pad 1 + +# # Movie dump + +# dump 3 all movie 125 movie.avi type type # axes yes 0.8 0.02 view 60 -30 +# dump_modify 3 pad 1 + +run 500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +WARNING: One or more atoms are time integrated more than once (src/modify.cpp:296) +Per MPI rank memory allocation (min/avg/max) = 6.975 | 6.975 | 6.975 Mbytes + Step Temp KinEng PotEng Lx Ly Lz Pxx Pyy Pzz Econserve + 500 295.00895 274.51875 -1982.4224 144.6903 64.3068 64.3068 631.89976 1127.2965 1135.6616 -1707.9036 + 600 357.38902 332.56613 -1951.3422 144.6903 64.3068 64.3068 2236.6706 2003.2726 1943.6815 -1618.7761 + 700 420.30268 391.11005 -1911.8178 144.6903 64.3068 64.3068 3761.5011 3065.4699 3140.3169 -1520.7077 + 800 484.96279 451.27911 -1875.379 144.6903 64.3068 64.3068 5362.254 4174.4201 4166.0818 -1424.0999 + 900 587.78954 546.96391 -1871.217 144.6903 64.3068 64.3068 6481.4714 4875.705 4676.6083 -1324.2531 + 1000 684.07997 636.56636 -1868.1639 144.6903 64.3068 64.3068 7734.6158 5271.3524 5272.1276 
-1231.5975 +Loop time of 3.32746 on 1 procs for 500 steps with 7200 atoms + +Performance: 25.966 ns/day, 0.924 hours/ns, 150.265 timesteps/s, 1.082 Matom-step/s +100.0% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 3.0701 | 3.0701 | 3.0701 | 0.0 | 92.27 +Neigh | 0.20567 | 0.20567 | 0.20567 | 0.0 | 6.18 +Comm | 0.010313 | 0.010313 | 0.010313 | 0.0 | 0.31 +Output | 0.002649 | 0.002649 | 0.002649 | 0.0 | 0.08 +Modify | 0.029567 | 0.029567 | 0.029567 | 0.0 | 0.89 +Other | | 0.009157 | | | 0.28 + +Nlocal: 7200 ave 7200 max 7200 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 6409 ave 6409 max 6409 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 646408 ave 646408 max 646408 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 646408 +Ave neighs/atom = 89.778889 +Neighbor list builds = 15 +Dangerous builds = 0 + +unfix fix_piston + +run 1500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Per MPI rank memory allocation (min/avg/max) = 6.6 | 6.6 | 6.6 Mbytes + Step Temp KinEng PotEng Lx Ly Lz Pxx Pyy Pzz Econserve + 1000 684.07997 636.56636 -1868.1639 144.6903 64.3068 64.3068 7734.6158 5271.3524 5272.1276 -1231.5975 + 1100 710.19886 660.87113 -1894.0485 144.6903 64.3068 64.3068 8048.3485 5396.6668 5376.5956 -1233.1774 + 1200 717.16487 667.35331 -1901.3849 144.6903 64.3068 64.3068 8009.7984 5634.5121 5349.4113 -1234.0316 + 1300 710.26037 660.92837 -1894.9802 144.6903 64.3068 64.3068 8063.4125 5572.1245 5530.174 -1234.0519 + 1400 715.93921 666.21278 -1898.8885 144.6903 64.3068 64.3068 7752.0927 5293.5463 5322.2312 -1232.6757 + 1500 748.85411 696.84154 -1926.4891 144.6903 64.3068 64.3068 6030.5428 4076.8886 4012.7653 -1229.6475 + 1600 767.98982 714.64815 -1939.8556 144.6903 64.3068 64.3068 4200.3475 2532.5711 2530.5518 -1225.2075 + 1700 757.22042 704.62675 -1925.553 144.6903 64.3068 64.3068 
2686.7843 1482.2796 1505.8073 -1220.9262 + 1800 727.30327 676.78754 -1894.6635 144.6903 64.3068 64.3068 1764.2793 781.37451 801.18668 -1217.8759 + 1900 688.82146 640.97853 -1856.5007 144.6903 64.3068 64.3068 1022.805 417.32394 359.74951 -1215.5221 + 2000 655.91228 610.35509 -1823.954 144.6903 64.3068 64.3068 551.98825 -20.148643 -56.976652 -1213.5989 + 2100 620.22468 577.14622 -1789.1761 144.6903 64.3068 64.3068 264.05975 -266.8323 -314.45533 -1212.0299 + 2200 589.13325 548.21428 -1758.9252 144.6903 64.3068 64.3068 41.369707 -533.503 -525.69401 -1210.7109 + 2300 563.20394 524.08593 -1733.6036 144.6903 64.3068 64.3068 -220.99189 -810.90513 -774.65084 -1209.5176 + 2400 540.44236 502.90528 -1711.3384 144.6903 64.3068 64.3068 -358.01508 -962.31635 -977.3253 -1208.4332 + 2500 523.5718 487.20648 -1694.7088 144.6903 64.3068 64.3068 -521.87444 -1152.8386 -1231.7615 -1207.5023 +Loop time of 9.89185 on 1 procs for 1500 steps with 7200 atoms + +Performance: 26.203 ns/day, 0.916 hours/ns, 151.640 timesteps/s, 1.092 Matom-step/s +98.7% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 8.9768 | 8.9768 | 8.9768 | 0.0 | 90.75 +Neigh | 0.78114 | 0.78114 | 0.78114 | 0.0 | 7.90 +Comm | 0.035178 | 0.035178 | 0.035178 | 0.0 | 0.36 +Output | 0.009593 | 0.009593 | 0.009593 | 0.0 | 0.10 +Modify | 0.057521 | 0.057521 | 0.057521 | 0.0 | 0.58 +Other | | 0.0316 | | | 0.32 + +Nlocal: 7200 ave 7200 max 7200 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 6380 ave 6380 max 6380 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 515773 ave 515773 max 515773 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 515773 +Ave neighs/atom = 71.635139 +Neighbor list builds = 57 +Dangerous builds = 0 +Total wall time: 0:00:16 From 3c6f4374ee4df8706474c45754267eb73fc877ab Mon Sep 17 00:00:00 2001 From: Tyler Collins Date: Fri, 6 Dec 2024 22:37:28 -0800 
Subject: [PATCH 022/161] removed some id info --- examples/vcm/log.19Nov24.vcm.g++.1 | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/vcm/log.19Nov24.vcm.g++.1 b/examples/vcm/log.19Nov24.vcm.g++.1 index ca63b9b5cc..4b03a771ad 100644 --- a/examples/vcm/log.19Nov24.vcm.g++.1 +++ b/examples/vcm/log.19Nov24.vcm.g++.1 @@ -1,7 +1,6 @@ LAMMPS (19 Nov 2024) OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99) using 1 OpenMP thread(s) per MPI task -Loaded 1 plugins from C:\Users\tcollins7472\AppData\Local\LAMMPS 64-bit 19Nov2024-MSMPI with Python\plugins # Removing Binned Center-of-Mass Velocities from Stress Compute units metal From 9da58b3ffcc0d9133a06508ab724f230544d0af1 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Mon, 4 Nov 2024 08:39:11 -0700 Subject: [PATCH 023/161] python: deprecated pylammps interface --- python/lammps/pylammps.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/lammps/pylammps.py b/python/lammps/pylammps.py index 1f5a1a0db9..cf5a2dc054 100644 --- a/python/lammps/pylammps.py +++ b/python/lammps/pylammps.py @@ -428,6 +428,8 @@ class PyLammps(object): lower-level interface. The original interface can still be accessed via :py:attr:`PyLammps.lmp`. + .. deprecated:: TBA + :param name: "machine" name of the shared LAMMPS library ("mpi" loads ``liblammps_mpi.so``, "" loads ``liblammps.so``) :type name: string :param cmdargs: list of command line arguments to be passed to the :cpp:func:`lammps_open` function. The executable name is automatically added. 
@@ -447,6 +449,12 @@ class PyLammps(object): """ def __init__(self, name="", cmdargs=None, ptr=None, comm=None, verbose=False): + print("WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING") + print() + print("The PyLammps interface is deprecated and will be removed in future versions.") + print("Please use the lammps Python class instead.") + print() + print("WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING-WARNING") self.has_echo = False self.verbose = verbose From b4acfd1e3b4ac78099cd93845973813873e0534f Mon Sep 17 00:00:00 2001 From: Tyler Collins Date: Tue, 10 Dec 2024 21:47:30 -0800 Subject: [PATCH 024/161] updated summary, comments, and name --- examples/stress_vcm/in.stress_vcm.lmp | 113 ++++++++ .../log.19Nov24.stress_vcm.g++.1} | 99 ++++--- .../stress_vcm/log.19Nov24.stress_vcm.g++.4 | 253 ++++++++++++++++++ examples/vcm/in.vcm.lmp | 82 ------ 4 files changed, 431 insertions(+), 116 deletions(-) create mode 100644 examples/stress_vcm/in.stress_vcm.lmp rename examples/{vcm/log.19Nov24.vcm.g++.1 => stress_vcm/log.19Nov24.stress_vcm.g++.1} (73%) create mode 100644 examples/stress_vcm/log.19Nov24.stress_vcm.g++.4 delete mode 100644 examples/vcm/in.vcm.lmp diff --git a/examples/stress_vcm/in.stress_vcm.lmp b/examples/stress_vcm/in.stress_vcm.lmp new file mode 100644 index 0000000000..24dc86f6fd --- /dev/null +++ b/examples/stress_vcm/in.stress_vcm.lmp @@ -0,0 +1,113 @@ +# Removing Binned Velocities of Center of Mass (VCM) from Stress + +# This example shows how to remove rigid body motion from +# binned stress calculations. This uses a combination of commands +# from compute chunk/atom, compute temp/chunk, compute +# stress/atom and fix ave/time. We'll show how these commands +# work in the context of a shockwave experiment on a cube of +# atoms. To shock the cube, a rectangular region of atoms is +# frozen, moved into the cube with a constant velocity along the +# x direction, and then unfrozen. 
As the shockwave begins +# propagating, the body of the cube also moves along the x +# direction. To better understand the stress dynamics of the +# cube we remove the velocity component belonging to the overall +# motion of each bin. + +units metal +boundary p p p +atom_style atomic +lattice fcc 5.3589 +processors 1 * * + +# Defining regions for box and atoms. +# In this experiment an elongated simulation cell is +# defined in the x direction to allow for non-periodic +# motion of the atoms. + +region box1 block -3 24 0 12 0 12 units lattice +region box2 block 0 12 0 12 0 12 units lattice + +# Creating box and atoms + +create_box 1 box1 +create_atoms 1 region box2 + +mass 1 40.00 + +# Adding energy to the system + +velocity all create 600.0 9999 + +pair_style lj/cut 10 +pair_coeff 1 1 0.04 3.405 + +# Begin time integration + +timestep 2e-3 + +fix fix_nve all nve + +thermo 100 + +run 500 + +#--------------------------------------# +# Chunk, Stress, and VCM removal steps # +#--------------------------------------# + +# 1. Create 20 equispaced bins sliced along the x direction. +# -"units reduced" normalizes the distance from 0.0 to 1.0 +variable nbins index 20 +variable fraction equal 1.0/v_nbins +variable volfrac equal 1/(vol*${fraction}) +compute ch_id all chunk/atom bin/1d x lower ${fraction} units reduced + +# 2. Calculate temperature bins with VCM aka COM velocities removed. +compute ch_temp_vcm all temp/chunk ch_id com yes + +# 3. Compute per atom stress with VCM removed via temp-ID. +# -The velocities from specified temp-ID are used to compute stress. +# -Stress/atom units are pressure*volume! Optionally handled next step. +compute atom_stress_vcm all stress/atom ch_temp_vcm + +# 4. Divide out bin volume from xx stress component. +variable stress atom -(c_atom_stress_vcm[1])/(vol*${fraction}) + +# 5. Sum the per atom stresses in each bin. +compute ch_stress_vcm all reduce/chunk ch_id sum v_stress + +# 6. Average and output to file. 
+# -The average output is every 100 steps with samples collected 20 times with 5 step intervals. +# fix ave_stress_vcm all ave/time 5 20 100 c_ch_stress_vcm mode vector file stress_xx.out + +#--------------------------------------# + +# Piston compressing along x direction + +region piston block -1 1 INF INF INF INF units lattice +group piston region piston +fix fix_piston piston move linear 5 0 0 units box # strain rate ~ 8e10 1/s + +thermo_style custom step temp ke pe lx ly lz pxx pyy pzz econserve + +# Atom dump + +# dump atom_dump all atom 50 dump.vcm + +# # Image dumps + +# dump 2 all image 250 image.*.jpg type type & +# axes yes 0.8 0.02 view 60 -30 +# dump_modify 2 pad 1 + +# # Movie dump + +# dump 3 all movie 125 movie.avi type type & +# axes yes 0.8 0.02 view 60 -30 +# dump_modify 3 pad 1 + +run 500 + +unfix fix_piston + +run 1500 diff --git a/examples/vcm/log.19Nov24.vcm.g++.1 b/examples/stress_vcm/log.19Nov24.stress_vcm.g++.1 similarity index 73% rename from examples/vcm/log.19Nov24.vcm.g++.1 rename to examples/stress_vcm/log.19Nov24.stress_vcm.g++.1 index 4b03a771ad..93ae029334 100644 --- a/examples/vcm/log.19Nov24.vcm.g++.1 +++ b/examples/stress_vcm/log.19Nov24.stress_vcm.g++.1 @@ -1,7 +1,20 @@ LAMMPS (19 Nov 2024) OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99) using 1 OpenMP thread(s) per MPI task -# Removing Binned Center-of-Mass Velocities from Stress Compute +# Removing Binned Velocities of Center of Mass (VCM) from Stress + +# This example shows how to remove rigid body motion from +# binned stress calculations. This uses a combination of commands +# from compute chunk/atom, compute temp/chunk, compute +# stress/atom and fix ave/time. We'll show how these commands +# work in the context of a shockwave experiment on a cube of +# atoms. To shock the cube, a rectangular region of atoms is +# frozen, moved into the cube with a constant velocity along the +# x direction, and then unfrozen. 
As the shockwave begins +# propagating, the body of the cube also moves along the x +# direction. To better understand the stress dynamics of the +# cube we remove the velocity component belonging to the overall +# motion of each bin. units metal boundary p p p @@ -10,7 +23,10 @@ lattice fcc 5.3589 Lattice spacing in x,y,z = 5.3589 5.3589 5.3589 processors 1 * * -# Defining regions for box and atoms +# Defining regions for box and atoms. +# In this experiment an elongated simulation cell is +# defined in the x direction to allow for non-periodic +# motion of the atoms. region box1 block -3 24 0 12 0 12 units lattice region box2 block 0 12 0 12 0 12 units lattice @@ -23,7 +39,7 @@ Created orthogonal box = (-16.0767 0 0) to (128.6136 64.3068 64.3068) create_atoms 1 region box2 Created 7200 atoms using lattice units in orthogonal box = (-16.0767 0 0) to (128.6136 64.3068 64.3068) - create_atoms CPU = 0.003 seconds + create_atoms CPU = 0.002 seconds mass 1 40.00 @@ -64,20 +80,20 @@ Per MPI rank memory allocation (min/avg/max) = 5.721 | 5.721 | 5.721 Mbytes 300 304.28012 -1993.2958 0 -1710.1498 1498.3794 400 296.76492 -1985.1364 0 -1708.9836 1259.9474 500 295.00895 -1982.4224 0 -1707.9036 964.9526 -Loop time of 3.07247 on 1 procs for 500 steps with 7200 atoms +Loop time of 3.01696 on 1 procs for 500 steps with 7200 atoms -Performance: 28.121 ns/day, 0.853 hours/ns, 162.736 timesteps/s, 1.172 Matom-step/s -99.7% CPU use with 1 MPI tasks x 1 OpenMP threads +Performance: 28.638 ns/day, 0.838 hours/ns, 165.730 timesteps/s, 1.193 Matom-step/s +99.4% CPU use with 1 MPI tasks x 1 OpenMP threads MPI task timing breakdown: Section | min time | avg time | max time |%varavg| %total --------------------------------------------------------------- -Pair | 2.8976 | 2.8976 | 2.8976 | 0.0 | 94.31 -Neigh | 0.11531 | 0.11531 | 0.11531 | 0.0 | 3.75 -Comm | 0.015167 | 0.015167 | 0.015167 | 0.0 | 0.49 -Output | 0.003809 | 0.003809 | 0.003809 | 0.0 | 0.12 -Modify | 0.025306 | 0.025306 | 
0.025306 | 0.0 | 0.82 -Other | | 0.01525 | | | 0.50 +Pair | 2.8439 | 2.8439 | 2.8439 | 0.0 | 94.26 +Neigh | 0.11212 | 0.11212 | 0.11212 | 0.0 | 3.72 +Comm | 0.015585 | 0.015585 | 0.015585 | 0.0 | 0.52 +Output | 0.003747 | 0.003747 | 0.003747 | 0.0 | 0.12 +Modify | 0.026097 | 0.026097 | 0.026097 | 0.0 | 0.87 +Other | | 0.01551 | | | 0.51 Nlocal: 7200 ave 7200 max 7200 min Histogram: 1 0 0 0 0 0 0 0 0 0 @@ -91,25 +107,40 @@ Ave neighs/atom = 85.429861 Neighbor list builds = 9 Dangerous builds = 0 -# Chunk and stress along x direction +#------------------------------------# +# Chunk, Stress, and VCM removal steps +#------------------------------------# +# 1. Create 20 equispaced bins sliced along the x direction. +# "units reduced" normalizes the distance from 0 to 1 variable nbins index 20 variable fraction equal 1.0/v_nbins variable volfrac equal 1/(vol*${fraction}) variable volfrac equal 1/(vol*0.05) - compute ch_id all chunk/atom bin/1d x lower ${fraction} units reduced compute ch_id all chunk/atom bin/1d x lower 0.05 units reduced + +# 2. Calculate temperature bins with VCM aka COM velocities removed. compute ch_temp_vcm all temp/chunk ch_id com yes + +# 3. Compute per atom stress with VCM removed via temp-ID. +# The velocities from specified temp-ID are used to compute stress +# Stress/atom units are pressure*volume! Optionally handled next step. compute atom_stress_vcm all stress/atom ch_temp_vcm + +# 4. Divide out bin volume from xx stress component. variable stress atom -(c_atom_stress_vcm[1])/(vol*${fraction}) variable stress atom -(c_atom_stress_vcm[1])/(vol*0.05) + +# 5. Sum the per atom stresses in each bin. compute ch_stress_vcm all reduce/chunk ch_id sum v_stress -# Output stress profile in x direction - +# 6. Average and output to file. 
+# The average output is every 100 steps with samples collected 20 times with 5 step intervals # fix ave_stress_vcm all ave/time 5 20 100 c_ch_stress_vcm mode vector file stress_xx.out +#------------------------------------# + # Piston compressing along x direction region piston block -1 1 INF INF INF INF units lattice @@ -144,20 +175,20 @@ Per MPI rank memory allocation (min/avg/max) = 6.975 | 6.975 | 6.975 Mbytes 800 484.96279 451.27911 -1875.379 144.6903 64.3068 64.3068 5362.254 4174.4201 4166.0818 -1424.0999 900 587.78954 546.96391 -1871.217 144.6903 64.3068 64.3068 6481.4714 4875.705 4676.6083 -1324.2531 1000 684.07997 636.56636 -1868.1639 144.6903 64.3068 64.3068 7734.6158 5271.3524 5272.1276 -1231.5975 -Loop time of 3.32746 on 1 procs for 500 steps with 7200 atoms +Loop time of 3.09383 on 1 procs for 500 steps with 7200 atoms -Performance: 25.966 ns/day, 0.924 hours/ns, 150.265 timesteps/s, 1.082 Matom-step/s +Performance: 27.927 ns/day, 0.859 hours/ns, 161.612 timesteps/s, 1.164 Matom-step/s 100.0% CPU use with 1 MPI tasks x 1 OpenMP threads MPI task timing breakdown: Section | min time | avg time | max time |%varavg| %total --------------------------------------------------------------- -Pair | 3.0701 | 3.0701 | 3.0701 | 0.0 | 92.27 -Neigh | 0.20567 | 0.20567 | 0.20567 | 0.0 | 6.18 -Comm | 0.010313 | 0.010313 | 0.010313 | 0.0 | 0.31 -Output | 0.002649 | 0.002649 | 0.002649 | 0.0 | 0.08 -Modify | 0.029567 | 0.029567 | 0.029567 | 0.0 | 0.89 -Other | | 0.009157 | | | 0.28 +Pair | 2.8485 | 2.8485 | 2.8485 | 0.0 | 92.07 +Neigh | 0.18767 | 0.18767 | 0.18767 | 0.0 | 6.07 +Comm | 0.011533 | 0.011533 | 0.011533 | 0.0 | 0.37 +Output | 0.003323 | 0.003323 | 0.003323 | 0.0 | 0.11 +Modify | 0.031777 | 0.031777 | 0.031777 | 0.0 | 1.03 +Other | | 0.01107 | | | 0.36 Nlocal: 7200 ave 7200 max 7200 min Histogram: 1 0 0 0 0 0 0 0 0 0 @@ -193,20 +224,20 @@ Per MPI rank memory allocation (min/avg/max) = 6.6 | 6.6 | 6.6 Mbytes 2300 563.20394 524.08593 -1733.6036 144.6903 
64.3068 64.3068 -220.99189 -810.90513 -774.65084 -1209.5176 2400 540.44236 502.90528 -1711.3384 144.6903 64.3068 64.3068 -358.01508 -962.31635 -977.3253 -1208.4332 2500 523.5718 487.20648 -1694.7088 144.6903 64.3068 64.3068 -521.87444 -1152.8386 -1231.7615 -1207.5023 -Loop time of 9.89185 on 1 procs for 1500 steps with 7200 atoms +Loop time of 9.34327 on 1 procs for 1500 steps with 7200 atoms -Performance: 26.203 ns/day, 0.916 hours/ns, 151.640 timesteps/s, 1.092 Matom-step/s -98.7% CPU use with 1 MPI tasks x 1 OpenMP threads +Performance: 27.742 ns/day, 0.865 hours/ns, 160.543 timesteps/s, 1.156 Matom-step/s +98.5% CPU use with 1 MPI tasks x 1 OpenMP threads MPI task timing breakdown: Section | min time | avg time | max time |%varavg| %total --------------------------------------------------------------- -Pair | 8.9768 | 8.9768 | 8.9768 | 0.0 | 90.75 -Neigh | 0.78114 | 0.78114 | 0.78114 | 0.0 | 7.90 -Comm | 0.035178 | 0.035178 | 0.035178 | 0.0 | 0.36 -Output | 0.009593 | 0.009593 | 0.009593 | 0.0 | 0.10 -Modify | 0.057521 | 0.057521 | 0.057521 | 0.0 | 0.58 -Other | | 0.0316 | | | 0.32 +Pair | 8.4692 | 8.4692 | 8.4692 | 0.0 | 90.65 +Neigh | 0.7512 | 0.7512 | 0.7512 | 0.0 | 8.04 +Comm | 0.031189 | 0.031189 | 0.031189 | 0.0 | 0.33 +Output | 0.010584 | 0.010584 | 0.010584 | 0.0 | 0.11 +Modify | 0.053052 | 0.053052 | 0.053052 | 0.0 | 0.57 +Other | | 0.02803 | | | 0.30 Nlocal: 7200 ave 7200 max 7200 min Histogram: 1 0 0 0 0 0 0 0 0 0 @@ -219,4 +250,4 @@ Total # of neighbors = 515773 Ave neighs/atom = 71.635139 Neighbor list builds = 57 Dangerous builds = 0 -Total wall time: 0:00:16 +Total wall time: 0:00:15 diff --git a/examples/stress_vcm/log.19Nov24.stress_vcm.g++.4 b/examples/stress_vcm/log.19Nov24.stress_vcm.g++.4 new file mode 100644 index 0000000000..d8e7c07536 --- /dev/null +++ b/examples/stress_vcm/log.19Nov24.stress_vcm.g++.4 @@ -0,0 +1,253 @@ +LAMMPS (19 Nov 2024) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. 
(src/comm.cpp:99) + using 1 OpenMP thread(s) per MPI task +# Removing Binned Velocities of Center of Mass (VCM) from Stress + +# This example shows how to remove rigid body motion from +# binned stress calculations. This uses a combination of commands +# from compute chunk/atom, compute temp/chunk, compute +# stress/atom and fix ave/time. We'll show how these commands +# work in the context of a shockwave experiment on a cube of +# atoms. To shock the cube, a rectangular region of atoms is +# frozen, moved into the cube with a constant velocity along the +# x direction, and then unfrozen. As the shockwave begins +# propagating, the body of the cube also moves along the x +# direction. To better understand the stress dynamics of the +# cube we remove the velocity component belonging to the overall +# motion of each bin. + +units metal +boundary p p p +atom_style atomic +lattice fcc 5.3589 +Lattice spacing in x,y,z = 5.3589 5.3589 5.3589 +processors 1 * * + +# Defining regions for box and atoms. +# In this experiment an elongated simulation cell is +# defined in the x direction to allow for non-periodic +# motion of the atoms. + +region box1 block -3 24 0 12 0 12 units lattice +region box2 block 0 12 0 12 0 12 units lattice + +# Creating box and atoms + +create_box 1 box1 +Created orthogonal box = (-16.0767 0 0) to (128.6136 64.3068 64.3068) + 1 by 2 by 2 MPI processor grid +create_atoms 1 region box2 +Created 7200 atoms + using lattice units in orthogonal box = (-16.0767 0 0) to (128.6136 64.3068 64.3068) + create_atoms CPU = 0.001 seconds + +mass 1 40.00 + +# Adding energy to the system + +velocity all create 600.0 9999 + +pair_style lj/cut 10 +pair_coeff 1 1 0.04 3.405 + +# Begin time integration + +timestep 2e-3 + +fix fix_nve all nve + +thermo 100 + +run 500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... 
+ update: every = 1 steps, delay = 0 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 12 + ghost atom cutoff = 12 + binsize = 6, bins = 25 11 11 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/cut/opt, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 3.662 | 3.662 | 3.662 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 600 -2252.7567 0 -1694.4304 -974.62456 + 100 284.1896 -1976.961 0 -1712.5101 2462.6396 + 200 308.58965 -1998.6349 0 -1711.4787 1789.0033 + 300 300.55093 -1989.9838 0 -1710.308 1545.8576 + 400 297.91491 -1986.2519 0 -1709.029 1247.7121 + 500 294.66041 -1982.1097 0 -1707.9153 961.03073 +Loop time of 0.942408 on 4 procs for 500 steps with 7200 atoms + +Performance: 91.680 ns/day, 0.262 hours/ns, 530.556 timesteps/s, 3.820 Matom-step/s +82.1% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.61287 | 0.63781 | 0.65858 | 2.1 | 67.68 +Neigh | 0.030246 | 0.031529 | 0.034546 | 1.0 | 3.35 +Comm | 0.23074 | 0.25145 | 0.27819 | 3.7 | 26.68 +Output | 0.000282 | 0.0003735 | 0.000463 | 0.0 | 0.04 +Modify | 0.005566 | 0.0057635 | 0.005989 | 0.2 | 0.61 +Other | | 0.01548 | | | 1.64 + +Nlocal: 1800 ave 1814 max 1787 min +Histogram: 1 0 1 0 0 0 0 1 0 1 +Nghost: 3713.5 ave 3727 max 3699 min +Histogram: 1 0 1 0 0 0 0 1 0 1 +Neighs: 153532 ave 154995 max 152312 min +Histogram: 1 0 1 0 0 1 0 0 0 1 + +Total # of neighbors = 614128 +Ave neighs/atom = 85.295556 +Neighbor list builds = 9 +Dangerous builds = 0 + +#------------------------------------# +# Chunk, Stress, and VCM removal steps +#------------------------------------# + +# 1. Create 20 equispaced bins sliced along the x direction. 
+# "units reduced" normalizes the distance from 0 to 1 +variable nbins index 20 +variable fraction equal 1.0/v_nbins +variable volfrac equal 1/(vol*${fraction}) +variable volfrac equal 1/(vol*0.05) +compute ch_id all chunk/atom bin/1d x lower ${fraction} units reduced +compute ch_id all chunk/atom bin/1d x lower 0.05 units reduced + +# 2. Calculate temperature bins with VCM aka COM velocities removed. +compute ch_temp_vcm all temp/chunk ch_id com yes + +# 3. Compute per atom stress with VCM removed via temp-ID. +# The velocities from specified temp-ID are used to compute stress +# Stress/atom units are pressure*volume! Optionally handled next step. +compute atom_stress_vcm all stress/atom ch_temp_vcm + +# 4. Divide out bin volume from xx stress component. +variable stress atom -(c_atom_stress_vcm[1])/(vol*${fraction}) +variable stress atom -(c_atom_stress_vcm[1])/(vol*0.05) + +# 5. Sum the per atom stresses in each bin. +compute ch_stress_vcm all reduce/chunk ch_id sum v_stress + +# 6. Average and output to file. 
+# The average output is every 100 steps with samples collected 20 times with 5 step intervals +# fix ave_stress_vcm all ave/time 5 20 100 c_ch_stress_vcm mode vector file stress_xx.out + +#------------------------------------# + +# Piston compressing along x direction + +region piston block -1 1 INF INF INF INF units lattice +group piston region piston +864 atoms in group piston +fix fix_piston piston move linear 5 0 0 units box # strain rate ~ 8e10 1/s + +thermo_style custom step temp ke pe lx ly lz pxx pyy pzz econserve + +# Atom dump + +# dump atom_dump all atom 50 dump.vcm + +# # Image dumps + +# dump 2 all image 250 image.*.jpg type type # axes yes 0.8 0.02 view 60 -30 +# dump_modify 2 pad 1 + +# # Movie dump + +# dump 3 all movie 125 movie.avi type type # axes yes 0.8 0.02 view 60 -30 +# dump_modify 3 pad 1 + +run 500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +WARNING: One or more atoms are time integrated more than once (src/modify.cpp:296) +Per MPI rank memory allocation (min/avg/max) = 4.916 | 4.916 | 4.916 Mbytes + Step Temp KinEng PotEng Lx Ly Lz Pxx Pyy Pzz Econserve + 500 294.66041 274.19441 -1982.1097 144.6903 64.3068 64.3068 645.25795 1119.5337 1118.3006 -1707.9153 + 600 357.88641 333.02897 -1951.8158 144.6903 64.3068 64.3068 2176.0343 1929.2787 1981.8479 -1618.7869 + 700 418.41159 389.3503 -1912.8337 144.6903 64.3068 64.3068 3702.2875 3043.7607 3081.1607 -1523.4834 + 800 483.71102 450.11428 -1875.7955 144.6903 64.3068 64.3068 5254.3875 4190.9789 4158.3561 -1425.6813 + 900 586.0893 545.38176 -1870.9313 144.6903 64.3068 64.3068 6509.1439 4756.2216 4724.7086 -1325.5495 + 1000 686.32946 638.65962 -1874.811 144.6903 64.3068 64.3068 7515.1606 5193.049 5261.8688 -1236.1514 +Loop time of 0.656417 on 4 procs for 500 steps with 7200 atoms + +Performance: 131.624 ns/day, 0.182 hours/ns, 761.711 timesteps/s, 5.484 Matom-step/s +92.8% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg 
time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.51672 | 0.52334 | 0.53259 | 0.8 | 79.73 +Neigh | 0.045091 | 0.045915 | 0.047402 | 0.4 | 6.99 +Comm | 0.060735 | 0.071794 | 0.079302 | 2.6 | 10.94 +Output | 0.000208 | 0.000389 | 0.000926 | 0.0 | 0.06 +Modify | 0.006007 | 0.0061595 | 0.00626 | 0.1 | 0.94 +Other | | 0.008815 | | | 1.34 + +Nlocal: 1800 ave 1811 max 1785 min +Histogram: 1 0 0 1 0 0 0 0 0 2 +Nghost: 3713.25 ave 3727 max 3702 min +Histogram: 2 0 0 0 0 0 0 1 0 1 +Neighs: 161477 ave 162958 max 159732 min +Histogram: 1 0 0 0 1 0 0 1 0 1 + +Total # of neighbors = 645909 +Ave neighs/atom = 89.709583 +Neighbor list builds = 15 +Dangerous builds = 0 + +unfix fix_piston + +run 1500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Per MPI rank memory allocation (min/avg/max) = 4.541 | 4.541 | 4.541 Mbytes + Step Temp KinEng PotEng Lx Ly Lz Pxx Pyy Pzz Econserve + 1000 686.32946 638.65962 -1874.811 144.6903 64.3068 64.3068 7515.1606 5193.049 5261.8688 -1236.1514 + 1100 709.7333 660.43791 -1898.2844 144.6903 64.3068 64.3068 7932.8638 5334.6171 5364.5335 -1237.8465 + 1200 713.27253 663.73132 -1902.4588 144.6903 64.3068 64.3068 7957.2574 5500.6231 5538.0516 -1238.7275 + 1300 705.44796 656.45022 -1895.1575 144.6903 64.3068 64.3068 7996.7265 5584.6233 5538.2494 -1238.7072 + 1400 711.86463 662.42121 -1899.8416 144.6903 64.3068 64.3068 7674.2462 5292.4915 5294.5366 -1237.4204 + 1500 742.18946 690.63979 -1924.9562 144.6903 64.3068 64.3068 6047.915 4056.6156 4014.4446 -1234.3164 + 1600 762.81764 709.83522 -1939.8563 144.6903 64.3068 64.3068 4185.5873 2530.0572 2576.1943 -1230.0211 + 1700 754.40428 702.00621 -1927.7337 144.6903 64.3068 64.3068 2662.7604 1509.1985 1484.7252 -1225.7275 + 1800 721.03504 670.95468 -1893.5556 144.6903 64.3068 64.3068 1765.8783 835.89765 861.9432 -1222.6009 + 1900 689.64162 641.74172 -1861.8886 144.6903 64.3068 64.3068 941.58148 312.93205 409.79901 -1220.1469 + 2000 
650.79664 605.59477 -1823.9889 144.6903 64.3068 64.3068 543.39234 28.48735 80.396505 -1218.3941 + 2100 616.04072 573.25286 -1790.1764 144.6903 64.3068 64.3068 308.16444 -235.20997 -248.22531 -1216.9235 + 2200 587.18712 546.40333 -1761.8878 144.6903 64.3068 64.3068 37.044801 -476.50396 -470.83059 -1215.4845 + 2300 562.84178 523.74892 -1738.2239 144.6903 64.3068 64.3068 -139.28348 -711.17273 -730.80877 -1214.475 + 2400 540.48362 502.94367 -1716.3529 144.6903 64.3068 64.3068 -320.98222 -951.2066 -943.93966 -1213.4093 + 2500 519.80431 483.70067 -1696.1896 144.6903 64.3068 64.3068 -471.61317 -1088.8457 -1131.5396 -1212.4889 +Loop time of 1.97213 on 4 procs for 1500 steps with 7200 atoms + +Performance: 131.431 ns/day, 0.183 hours/ns, 760.598 timesteps/s, 5.476 Matom-step/s +95.3% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 1.5455 | 1.5599 | 1.5723 | 0.8 | 79.10 +Neigh | 0.16844 | 0.1704 | 0.17237 | 0.4 | 8.64 +Comm | 0.19002 | 0.2047 | 0.22068 | 2.4 | 10.38 +Output | 0.000525 | 0.0006785 | 0.001077 | 0.0 | 0.03 +Modify | 0.012434 | 0.012601 | 0.012777 | 0.1 | 0.64 +Other | | 0.02388 | | | 1.21 + +Nlocal: 1800 ave 1833 max 1776 min +Histogram: 1 0 1 0 1 0 0 0 0 1 +Nghost: 3702 ave 3732 max 3674 min +Histogram: 1 0 0 1 0 0 1 0 0 1 +Neighs: 129380 ave 132578 max 127003 min +Histogram: 1 0 0 2 0 0 0 0 0 1 + +Total # of neighbors = 517520 +Ave neighs/atom = 71.877778 +Neighbor list builds = 54 +Dangerous builds = 0 +Total wall time: 0:00:03 diff --git a/examples/vcm/in.vcm.lmp b/examples/vcm/in.vcm.lmp deleted file mode 100644 index 41d7665c49..0000000000 --- a/examples/vcm/in.vcm.lmp +++ /dev/null @@ -1,82 +0,0 @@ -# Removing Binned Center-of-Mass Velocities from Stress Compute - -units metal -boundary p p p -atom_style atomic -lattice fcc 5.3589 -processors 1 * * - -# Defining regions for box and atoms - -region 
box1 block -3 24 0 12 0 12 units lattice -region box2 block 0 12 0 12 0 12 units lattice - -# Creating box and atoms - -create_box 1 box1 -create_atoms 1 region box2 - -mass 1 40.00 - -# Adding energy to the system - -velocity all create 600.0 9999 - -pair_style lj/cut 10 -pair_coeff 1 1 0.04 3.405 - -# Begin time integration - -timestep 2e-3 - -fix fix_nve all nve - -thermo 100 - -run 500 - -# Chunk and stress along x direction - -variable nbins index 20 -variable fraction equal 1.0/v_nbins -variable volfrac equal 1/(vol*${fraction}) - -compute ch_id all chunk/atom bin/1d x lower ${fraction} units reduced -compute ch_temp_vcm all temp/chunk ch_id com yes -compute atom_stress_vcm all stress/atom ch_temp_vcm -variable stress atom -(c_atom_stress_vcm[1])/(vol*${fraction}) -compute ch_stress_vcm all reduce/chunk ch_id sum v_stress - -# Output stress profile in x direction - -# fix ave_stress_vcm all ave/time 5 20 100 c_ch_stress_vcm mode vector file stress_xx.out - -# Piston compressing along x direction - -region piston block -1 1 INF INF INF INF units lattice -group piston region piston -fix fix_piston piston move linear 5 0 0 units box # strain rate ~ 8e10 1/s - -thermo_style custom step temp ke pe lx ly lz pxx pyy pzz econserve - -# Atom dump - -# dump atom_dump all atom 50 dump.vcm - -# # Image dumps - -# dump 2 all image 250 image.*.jpg type type & -# axes yes 0.8 0.02 view 60 -30 -# dump_modify 2 pad 1 - -# # Movie dump - -# dump 3 all movie 125 movie.avi type type & -# axes yes 0.8 0.02 view 60 -30 -# dump_modify 3 pad 1 - -run 500 - -unfix fix_piston - -run 1500 From 8f25a31deea049e417e7d219b4cb706b7027f64f Mon Sep 17 00:00:00 2001 From: farrelljd Date: Thu, 12 Dec 2024 15:38:34 +0800 Subject: [PATCH 025/161] adapted angle_mwlc to work with units other than lj --- doc/src/angle_mwlc.rst | 23 ++++++++++---------- doc/src/fix_adapt.rst | 2 ++ src/EXTRA-MOLECULE/angle_mwlc.cpp | 36 +++++++++++++++++++++---------- src/EXTRA-MOLECULE/angle_mwlc.h | 1 + 4 files 
changed, 40 insertions(+), 22 deletions(-) diff --git a/doc/src/angle_mwlc.rst b/doc/src/angle_mwlc.rst index e70c518bd7..4cdd36a494 100644 --- a/doc/src/angle_mwlc.rst +++ b/doc/src/angle_mwlc.rst @@ -16,7 +16,7 @@ Examples .. code-block:: LAMMPS angle_style mwlc - angle_coeff * 25.0 1.0 10.0 + angle_coeff * 25 1 10 1 Description """"""""""" @@ -26,16 +26,17 @@ a non-melted and a melted state :ref:`(Farrell) `, .. math:: - \beta E = -\log [q + q^{m}], + E = -k_{B}T\,\log [q + q^{m}] + E_{0}, where .. math:: - q = \exp [-l_{p}(1-\cos{\theta})/\sigma], \\ - q^{m} = \exp [-\beta\mu-l_{p}^{m}(1-\cos{\theta})/\sigma], + q = \exp [-k_{1}(1+\cos{\theta})/k_{B}T], \\ + q^{m} = \exp [-(\mu+k_{2}(1+\cos{\theta}))/k_{B}T], \\ + E_{0} = -k_{B}T\,\log [1 + \exp[-\mu/k_{B}T]], -:math:`l_{p}` is the persistence length of the non-melted state, -:math:`l_{p}^{m}` is the persistence length of the melted state, +:math:`k_1` is the bending force constant of the non-melted state, +:math:`k_2` is the bending force constant of the melted state, and :math:`\mu` is the melting energy. This potential is a continuous version of the two-state potential @@ -46,10 +47,10 @@ The following coefficients must be defined for each angle type via the the data file or restart files read by the :doc:`read_data ` or :doc:`read_restart ` commands: -* :math:`l_{p}` (distance) -* :math:`l_{p}^{m}` (distance) +* :math:`k_1` (energy) +* :math:`k_2` (energy) * :math:`\mu` (energy) - +* :math:`T` (temperature) ---------- @@ -74,8 +75,8 @@ none .. _Farrell: -**(Farrell)** Farrell, Dobnikar, Podgornik, Curk, Phys Rev Lett, in production. +**(Farrell)** `Farrell, Dobnikar, Podgornik, Curk, Phys Rev Lett, 133, 148101 (2024). `_ .. _Yan: -**(Yan)** Yan, Marko, Phys Rev Lett, 93, 108108 (2004). +**(Yan)** `Yan, Marko, Phys Rev Lett, 93, 108108 (2004). 
`_ diff --git a/doc/src/fix_adapt.rst b/doc/src/fix_adapt.rst index a44ce8e780..e1f7653f03 100644 --- a/doc/src/fix_adapt.rst +++ b/doc/src/fix_adapt.rst @@ -388,6 +388,8 @@ sub-style name. The angle styles that currently work with fix adapt are: +--------------------------------------------------------------------+-----------------+-------------+ | :doc:`mm3 ` | k,theta0 | type angles | +--------------------------------------------------------------------+-----------------+-------------+ +| :doc:`mwlc ` | k1,k2,mu,temp | type angles | ++--------------------------------------------------------------------+-----------------+-------------+ | :doc:`quartic ` | k2,k3,k4,theta0 | type angles | +--------------------------------------------------------------------+-----------------+-------------+ | :doc:`spica ` | k,theta0 | type angles | diff --git a/src/EXTRA-MOLECULE/angle_mwlc.cpp b/src/EXTRA-MOLECULE/angle_mwlc.cpp index 41b1195bc4..dfd6c64ae3 100644 --- a/src/EXTRA-MOLECULE/angle_mwlc.cpp +++ b/src/EXTRA-MOLECULE/angle_mwlc.cpp @@ -49,6 +49,7 @@ AngleMWLC::~AngleMWLC() memory->destroy(k1); memory->destroy(k2); memory->destroy(mu); + memory->destroy(temp); } } @@ -71,12 +72,15 @@ void AngleMWLC::compute(int eflag, int vflag) int nanglelist = neighbor->nanglelist; int nlocal = atom->nlocal; int newton_bond = force->newton_bond; + double kbt, v_min; for (n = 0; n < nanglelist; n++) { i1 = anglelist[n][0]; i2 = anglelist[n][1]; i3 = anglelist[n][2]; type = anglelist[n][3]; + kbt = temp[type]*force->boltz; + v_min = -kbt * log(1 + exp(-mu[type] / kbt)); // 1st bond @@ -105,11 +109,11 @@ void AngleMWLC::compute(int eflag, int vflag) // force & energy - q = exp(-k1[type] * (1.0 + c)); - qm = exp(-k2[type] * (1.0 + c) - mu[type]); + q = exp(-k1[type] * (1.0 + c) / kbt); + qm = exp((-k2[type] * (1.0 + c) - mu[type]) / kbt); Q = q + qm; - if (eflag) eangle = -log(Q); + if (eflag) eangle = -kbt*log(Q) - v_min; a = (k1[type] * q + k2[type] * qm) / Q; a11 = a * c / rsq1; @@ -159,6 
+163,7 @@ void AngleMWLC::allocate() memory->create(k1, np1, "angle:k1"); memory->create(k2, np1, "angle:k2"); memory->create(mu, np1, "angle:mu"); + memory->create(temp, np1, "angle:temp"); memory->create(setflag, np1, "angle:setflag"); for (int i = 1; i < np1; i++) setflag[i] = 0; } @@ -169,7 +174,7 @@ void AngleMWLC::allocate() void AngleMWLC::coeff(int narg, char **arg) { - if (narg != 4) error->all(FLERR, "Incorrect args for angle coefficients"); + if (narg != 5) error->all(FLERR, "Incorrect args for angle coefficients"); if (!allocated) allocate(); int ilo, ihi; @@ -178,12 +183,14 @@ void AngleMWLC::coeff(int narg, char **arg) double k1_one = utils::numeric(FLERR, arg[1], false, lmp); double k2_one = utils::numeric(FLERR, arg[2], false, lmp); double mu_one = utils::numeric(FLERR, arg[3], false, lmp); + double temp_one = utils::numeric(FLERR, arg[4], false, lmp); int count = 0; for (int i = ilo; i <= ihi; i++) { k1[i] = k1_one; k2[i] = k2_one; mu[i] = mu_one; + temp[i] = temp_one; setflag[i] = 1; count++; } @@ -207,6 +214,7 @@ void AngleMWLC::write_restart(FILE *fp) fwrite(&k1[1], sizeof(double), atom->nangletypes, fp); fwrite(&k2[1], sizeof(double), atom->nangletypes, fp); fwrite(&mu[1], sizeof(double), atom->nangletypes, fp); + fwrite(&temp[1], sizeof(double), atom->nangletypes, fp); } /* ---------------------------------------------------------------------- @@ -221,10 +229,12 @@ void AngleMWLC::read_restart(FILE *fp) utils::sfread(FLERR, &k1[1], sizeof(double), atom->nangletypes, fp, nullptr, error); utils::sfread(FLERR, &k2[1], sizeof(double), atom->nangletypes, fp, nullptr, error); utils::sfread(FLERR, &mu[1], sizeof(double), atom->nangletypes, fp, nullptr, error); + utils::sfread(FLERR, &temp[1], sizeof(double), atom->nangletypes, fp, nullptr, error); } MPI_Bcast(&k1[1], atom->nangletypes, MPI_DOUBLE, 0, world); MPI_Bcast(&k2[1], atom->nangletypes, MPI_DOUBLE, 0, world); MPI_Bcast(&mu[1], atom->nangletypes, MPI_DOUBLE, 0, world); + MPI_Bcast(&temp[1], 
atom->nangletypes, MPI_DOUBLE, 0, world); for (int i = 1; i <= atom->nangletypes; i++) setflag[i] = 1; } @@ -235,7 +245,7 @@ void AngleMWLC::read_restart(FILE *fp) void AngleMWLC::write_data(FILE *fp) { - for (int i = 1; i <= atom->nangletypes; i++) fprintf(fp, "%d %g %g %g\n", i, k1[i], k2[i], mu[i]); + for (int i = 1; i <= atom->nangletypes; i++) fprintf(fp, "%d %g %g %g %g\n", i, k1[i], k2[i], mu[i], temp[i]); } /* ---------------------------------------------------------------------- */ @@ -244,6 +254,8 @@ double AngleMWLC::single(int type, int i1, int i2, int i3) { double **x = atom->x; + double kbt = temp[type]*force->boltz; + double v_min = -kbt * log(1 + exp(-mu[type] / kbt)); double delx1 = x[i1][0] - x[i2][0]; double dely1 = x[i1][1] - x[i2][1]; double delz1 = x[i1][2] - x[i2][2]; @@ -261,9 +273,9 @@ double AngleMWLC::single(int type, int i1, int i2, int i3) if (c > 1.0) c = 1.0; if (c < -1.0) c = -1.0; - double q = exp(-k1[type] * (1.0 + c)); - double qm = exp(-k2[type] * (1.0 + c) - mu[type]); - return -log(q + qm); + double q = exp(-k1[type] * (1.0 + c) / kbt); + double qm = exp((-k2[type] * (1.0 + c) - mu[type]) / kbt); + return -kbt * log(q + qm) - v_min; } /* ---------------------------------------------------------------------- */ @@ -271,6 +283,7 @@ double AngleMWLC::single(int type, int i1, int i2, int i3) void AngleMWLC::born_matrix(int type, int i1, int i2, int i3, double &du, double &du2) { double **x = atom->x; + double kbt = temp[type]*force->boltz; double delx1 = x[i1][0] - x[i2][0]; double dely1 = x[i1][1] - x[i2][1]; @@ -288,13 +301,13 @@ void AngleMWLC::born_matrix(int type, int i1, int i2, int i3, double &du, double if (c > 1.0) c = 1.0; if (c < -1.0) c = -1.0; - const double q = exp(-k1[type] * (1.0 + c)); - const double qm = exp(-k2[type] * (1.0 + c) - mu[type]); + const double q = exp(-k1[type] * (1.0 + c) / kbt); + const double qm = exp((-k2[type] * (1.0 + c) - mu[type]) / kbt); const double Q = q + qm; du = (k1[type] * q + k2[type] 
* qm) / Q; du2 = (k1[type] - k2[type]) / Q; - du2 *= -du2 * q * qm; + du2 *= -du2 * q * qm / kbt; } /* ---------------------------------------------------------------------- @@ -307,5 +320,6 @@ void *AngleMWLC::extract(const char *str, int &dim) if (strcmp(str, "k1") == 0) return (void *) k1; if (strcmp(str, "k2") == 0) return (void *) k2; if (strcmp(str, "mu") == 0) return (void *) mu; + if (strcmp(str, "temp") == 0) return (void *) temp; return nullptr; } diff --git a/src/EXTRA-MOLECULE/angle_mwlc.h b/src/EXTRA-MOLECULE/angle_mwlc.h index 512d1d1534..ec5f0e2c0c 100644 --- a/src/EXTRA-MOLECULE/angle_mwlc.h +++ b/src/EXTRA-MOLECULE/angle_mwlc.h @@ -42,6 +42,7 @@ namespace LAMMPS_NS { double *k1; double *k2; double *mu; + double *temp; virtual void allocate(); }; From d50a62f827479ca24d5f7e6393981a4af72ba373 Mon Sep 17 00:00:00 2001 From: Tyler Collins Date: Thu, 12 Dec 2024 18:32:46 -0800 Subject: [PATCH 026/161] example readme updated --- examples/README | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/README b/examples/README index 90831b49f0..b25dda7d49 100644 --- a/examples/README +++ b/examples/README @@ -107,6 +107,7 @@ reaxff: RDX and TATB and several other models using ReaxFF replicate: use of replicate command rerun: use of rerun and read_dump commands rigid: rigid bodies modeled as independent or coupled +stress_vcm: removing binned rigid body motion from binned stress profile shear: sideways shear applied to 2d solid, with and without a void snap: examples for using several bundled SNAP potentials srd: stochastic rotation dynamics (SRD) particles as solvent From 24a4ff78b6ba5264236aa5f58d978498dd921340 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Mon, 4 Nov 2024 09:19:38 -0700 Subject: [PATCH 027/161] python: update examples and docs --- doc/src/Howto.rst | 1 + doc/src/Howto_pylammps.rst | 564 +----------------- doc/src/Howto_python.rst | 488 +++++++++++++++ doc/src/Library.rst | 7 +- doc/src/Python_atoms.rst | 68 +-- 
doc/src/Python_create.rst | 107 +--- doc/src/Python_execute.rst | 162 +++-- doc/src/Python_module.rst | 55 +- doc/src/Python_overview.rst | 19 +- doc/utils/requirements.txt | 1 + .../examples/{pylammps => ipython}/.gitignore | 0 .../examples/{pylammps => ipython}/README.md | 16 +- .../atoms.ipynb} | 108 ++-- .../dihedrals/data.dihedral | 0 .../dihedrals/dihedral.ipynb | 73 +-- .../{pylammps => ipython}/elastic/Au.data | 0 .../{pylammps => ipython}/elastic/README | 0 .../{pylammps => ipython}/elastic/elastic.py | 3 +- python/examples/ipython/index.ipynb | 61 ++ .../{pylammps => ipython}/montecarlo/mc.ipynb | 93 ++- .../{pylammps => ipython}/mpi4py/hello.py | 0 .../{pylammps => ipython}/mpi4py/in.melt | 0 .../{pylammps => ipython}/mpi4py/melt.py | 6 +- .../{pylammps => ipython}/simple.ipynb | 304 +++++----- python/examples/ipython/thermo.ipynb | 305 ++++++++++ .../examples/pylammps/interface_usage.ipynb | 546 ----------------- python/lammps/core.py | 138 +++++ python/lammps/ipython/__init__.py | 23 + python/lammps/ipython/magics.py | 75 +++ python/lammps/ipython/wrapper.py | 113 ++++ python/setup.py | 2 +- 31 files changed, 1608 insertions(+), 1730 deletions(-) create mode 100644 doc/src/Howto_python.rst rename python/examples/{pylammps => ipython}/.gitignore (100%) rename python/examples/{pylammps => ipython}/README.md (80%) rename python/examples/{pylammps/interface_usage_bonds.ipynb => ipython/atoms.ipynb} (74%) rename python/examples/{pylammps => ipython}/dihedrals/data.dihedral (100%) rename python/examples/{pylammps => ipython}/dihedrals/dihedral.ipynb (75%) rename python/examples/{pylammps => ipython}/elastic/Au.data (100%) rename python/examples/{pylammps => ipython}/elastic/README (100%) rename python/examples/{pylammps => ipython}/elastic/elastic.py (99%) create mode 100644 python/examples/ipython/index.ipynb rename python/examples/{pylammps => ipython}/montecarlo/mc.ipynb (75%) rename python/examples/{pylammps => ipython}/mpi4py/hello.py (100%) rename 
python/examples/{pylammps => ipython}/mpi4py/in.melt (100%) rename python/examples/{pylammps => ipython}/mpi4py/melt.py (61%) rename python/examples/{pylammps => ipython}/simple.ipynb (53%) create mode 100644 python/examples/ipython/thermo.ipynb delete mode 100644 python/examples/pylammps/interface_usage.ipynb create mode 100644 python/lammps/ipython/__init__.py create mode 100644 python/lammps/ipython/magics.py create mode 100644 python/lammps/ipython/wrapper.py diff --git a/doc/src/Howto.rst b/doc/src/Howto.rst index 5a63e2b1c4..16620bf47a 100644 --- a/doc/src/Howto.rst +++ b/doc/src/Howto.rst @@ -104,5 +104,6 @@ Tutorials howto Howto_lammps_gui Howto_moltemplate Howto_pylammps + Howto_python Howto_wsl diff --git a/doc/src/Howto_pylammps.rst b/doc/src/Howto_pylammps.rst index 645434bbab..a8371f1366 100644 --- a/doc/src/Howto_pylammps.rst +++ b/doc/src/Howto_pylammps.rst @@ -1,564 +1,6 @@ PyLammps Tutorial ================= -.. contents:: - -Overview --------- - -:py:class:`PyLammps ` is a Python wrapper class for -LAMMPS which can be created on its own or use an existing -:py:class:`lammps Python ` object. It creates a simpler, -more "pythonic" interface to common LAMMPS functionality, in contrast to -the :py:class:`lammps ` wrapper for the LAMMPS :ref:`C -language library interface API ` which is written using -`Python ctypes `_. The :py:class:`lammps ` -wrapper is discussed on the :doc:`Python_head` doc page. - -Unlike the flat `ctypes `_ interface, PyLammps exposes a -discoverable API. It no longer requires knowledge of the underlying C++ -code implementation. Finally, the :py:class:`IPyLammps -` wrapper builds on top of :py:class:`PyLammps -` and adds some additional features for `IPython -integration `_ into `Jupyter notebooks `_, e.g. for -embedded visualization output from :doc:`dump style image `. - -.. _ctypes: https://docs.python.org/3/library/ctypes.html -.. _ipython: https://ipython.org/ -.. 
_jupyter: https://jupyter.org/ - -Comparison of lammps and PyLammps interfaces -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -lammps.lammps -""""""""""""" - -* uses `ctypes `_ -* direct memory access to native C++ data with optional support for NumPy arrays -* provides functions to send and receive data to LAMMPS -* interface modeled after the LAMMPS :ref:`C language library interface API ` -* requires knowledge of how LAMMPS internally works (C pointers, etc) -* full support for running Python with MPI using `mpi4py `_ -* no overhead from creating a more Python-like interface - -lammps.PyLammps -""""""""""""""" - -* higher-level abstraction built on *top* of the original :py:class:`ctypes based interface ` -* manipulation of Python objects -* communication with LAMMPS is hidden from API user -* shorter, more concise Python -* better IPython integration, designed for quick prototyping -* designed for serial execution -* additional overhead from capturing and parsing the LAMMPS screen output - -Quick Start ------------ - -System-wide Installation -^^^^^^^^^^^^^^^^^^^^^^^^ - -Step 1: Building LAMMPS as a shared library -""""""""""""""""""""""""""""""""""""""""""" - -To use LAMMPS inside of Python it has to be compiled as shared -library. This library is then loaded by the Python interface. In this -example we enable the MOLECULE package and compile LAMMPS with PNG, JPEG -and FFMPEG output support enabled. - -Step 1a: For the CMake based build system, the steps are: - -.. code-block:: bash - - mkdir $LAMMPS_DIR/build-shared - cd $LAMMPS_DIR/build-shared - - # MPI, PNG, Jpeg, FFMPEG are auto-detected - cmake ../cmake -DPKG_MOLECULE=yes -DBUILD_LIB=yes -DBUILD_SHARED_LIBS=yes - make - -Step 1b: For the legacy, make based build system, the steps are: - -.. 
code-block:: bash - - cd $LAMMPS_DIR/src - - # add packages if necessary - make yes-MOLECULE - - # compile shared library using Makefile - make mpi mode=shlib LMP_INC="-DLAMMPS_PNG -DLAMMPS_JPEG -DLAMMPS_FFMPEG" JPG_LIB="-lpng -ljpeg" - -Step 2: Installing the LAMMPS Python package -"""""""""""""""""""""""""""""""""""""""""""" - -PyLammps is part of the lammps Python package. To install it simply install -that package into your current Python installation with: - -.. code-block:: bash - - make install-python - -.. note:: - - Recompiling the shared library requires re-installing the Python package - -Installation inside of a virtualenv -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can use virtualenv to create a custom Python environment specifically tuned -for your workflow. - -Benefits of using a virtualenv -"""""""""""""""""""""""""""""" - -* isolation of your system Python installation from your development installation -* installation can happen in your user directory without root access (useful for HPC clusters) -* installing packages through pip allows you to get newer versions of packages than e.g., through apt-get or yum package managers (and without root access) -* you can even install specific old versions of a package if necessary - -**Prerequisite (e.g. on Ubuntu)** - -.. code-block:: bash - - apt-get install python-virtualenv - -Creating a virtualenv with lammps installed -""""""""""""""""""""""""""""""""""""""""""" - -.. code-block:: bash - - # create virtualenv named 'testing' - virtualenv $HOME/python/testing - - # activate 'testing' environment - source $HOME/python/testing/bin/activate - -Now configure and compile the LAMMPS shared library as outlined above. -When using CMake and the shared library has already been build, you -need to re-run CMake to update the location of the python executable -to the location in the virtual environment with: - -.. code-block:: bash - - cmake . 
-DPython_EXECUTABLE=$(which python) - - # install LAMMPS package in virtualenv - (testing) make install-python - - # install other useful packages - (testing) pip install matplotlib jupyter mpi4py - - ... - - # return to original shell - (testing) deactivate - -Creating a new instance of PyLammps ------------------------------------ - -To create a PyLammps object you need to first import the class from the lammps -module. By using the default constructor, a new *lammps* instance is created. - -.. code-block:: python - - from lammps import PyLammps - L = PyLammps() - -You can also initialize PyLammps on top of this existing *lammps* object: - -.. code-block:: python - - from lammps import lammps, PyLammps - lmp = lammps() - L = PyLammps(ptr=lmp) - -Commands --------- - -Sending a LAMMPS command with the existing library interfaces is done using -the command method of the lammps object instance. - -For instance, let's take the following LAMMPS command: - -.. code-block:: LAMMPS - - region box block 0 10 0 5 -0.5 0.5 - -In the original interface this command can be executed with the following -Python code if *L* was a lammps instance: - -.. code-block:: python - - L.command("region box block 0 10 0 5 -0.5 0.5") - -With the PyLammps interface, any command can be split up into arbitrary parts -separated by white-space, passed as individual arguments to a region method. - -.. code-block:: python - - L.region("box block", 0, 10, 0, 5, -0.5, 0.5) - -Note that each parameter is set as Python literal floating-point number. In the -PyLammps interface, each command takes an arbitrary parameter list and transparently -merges it to a single command string, separating individual parameters by white-space. - -The benefit of this approach is avoiding redundant command calls and easier -parameterization. In the original interface parameterization needed to be done -manually by creating formatted strings. - -.. 
code-block:: python - - L.command("region box block %f %f %f %f %f %f" % (xlo, xhi, ylo, yhi, zlo, zhi)) - -In contrast, methods of PyLammps accept parameters directly and will convert -them automatically to a final command string. - -.. code-block:: python - - L.region("box block", xlo, xhi, ylo, yhi, zlo, zhi) - -System state ------------- - -In addition to dispatching commands directly through the PyLammps object, it -also provides several properties which allow you to query the system state. - -L.system - Is a dictionary describing the system such as the bounding box or number of atoms - -L.system.xlo, L.system.xhi - bounding box limits along x-axis - -L.system.ylo, L.system.yhi - bounding box limits along y-axis - -L.system.zlo, L.system.zhi - bounding box limits along z-axis - -L.communication - configuration of communication subsystem, such as the number of threads or processors - -L.communication.nthreads - number of threads used by each LAMMPS process - -L.communication.nprocs - number of MPI processes used by LAMMPS - -L.fixes - List of fixes in the current system - -L.computes - List of active computes in the current system - -L.dump - List of active dumps in the current system - -L.groups - List of groups present in the current system - -Working with LAMMPS variables ------------------------------ - -LAMMPS variables can be both defined and accessed via the PyLammps interface. - -To define a variable you can use the :doc:`variable ` command: - -.. code-block:: python - - L.variable("a index 2") - -A dictionary of all variables is returned by L.variables - -you can access an individual variable by retrieving a variable object from the -L.variables dictionary by name - -.. code-block:: python - - a = L.variables['a'] - -The variable value can then be easily read and written by accessing the value -property of this object. - -.. 
code-block:: python - - print(a.value) - a.value = 4 - -Retrieving the value of an arbitrary LAMMPS expressions -------------------------------------------------------- - -LAMMPS expressions can be immediately evaluated by using the eval method. The -passed string parameter can be any expression containing global thermo values, -variables, compute or fix data. - -.. code-block:: python - - result = L.eval("ke") # kinetic energy - result = L.eval("pe") # potential energy - - result = L.eval("v_t/2.0") - -Accessing atom data -------------------- - -All atoms in the current simulation can be accessed by using the L.atoms list. -Each element of this list is an object which exposes its properties (id, type, -position, velocity, force, etc.). - -.. code-block:: python - - # access first atom - L.atoms[0].id - L.atoms[0].type - - # access second atom - L.atoms[1].position - L.atoms[1].velocity - L.atoms[1].force - -Some properties can also be used to set: - -.. code-block:: python - - # set position in 2D simulation - L.atoms[0].position = (1.0, 0.0) - - # set position in 3D simulation - L.atoms[0].position = (1.0, 0.0, 1.) - -Evaluating thermo data ----------------------- - -Each simulation run usually produces thermo output based on system state, -computes, fixes or variables. The trajectories of these values can be queried -after a run via the L.runs list. This list contains a growing list of run data. -The first element is the output of the first run, the second element that of -the second run. - -.. code-block:: python - - L.run(1000) - L.runs[0] # data of first 1000 time steps - - L.run(1000) - L.runs[1] # data of second 1000 time steps - -Each run contains a dictionary of all trajectories. Each trajectory is -accessible through its thermo name: - -.. 
code-block:: python - - L.runs[0].thermo.Step # list of time steps in first run - L.runs[0].thermo.Ke # list of kinetic energy values in first run - -Together with matplotlib plotting data out of LAMMPS becomes simple: - -.. code-block:: python - - import matplotlib.plot as plt - steps = L.runs[0].thermo.Step - ke = L.runs[0].thermo.Ke - plt.plot(steps, ke) - -Error handling with PyLammps ----------------------------- - -Using C++ exceptions in LAMMPS for errors allows capturing them on the -C++ side and rethrowing them on the Python side. This way you can handle -LAMMPS errors through the Python exception handling mechanism. - -.. warning:: - - Capturing a LAMMPS exception in Python can still mean that the - current LAMMPS process is in an illegal state and must be - terminated. It is advised to save your data and terminate the Python - instance as quickly as possible. - -Using PyLammps in IPython notebooks and Jupyter ------------------------------------------------ - -If the LAMMPS Python package is installed for the same Python interpreter as -IPython, you can use PyLammps directly inside of an IPython notebook inside of -Jupyter. Jupyter is a powerful integrated development environment (IDE) for -many dynamic languages like Python, Julia and others, which operates inside of -any web browser. Besides auto-completion and syntax highlighting it allows you -to create formatted documents using Markup, mathematical formulas, graphics and -animations intermixed with executable Python code. It is a great format for -tutorials and showcasing your latest research. - -To launch an instance of Jupyter simply run the following command inside your -Python environment (this assumes you followed the Quick Start instructions): - -.. code-block:: bash - - jupyter notebook - -IPyLammps Examples ------------------- - -Examples of IPython notebooks can be found in the python/examples/pylammps -subdirectory. 
To open these notebooks launch *jupyter notebook* inside this -directory and navigate to one of them. If you compiled and installed -a LAMMPS shared library with exceptions, PNG, JPEG and FFMPEG support -you should be able to rerun all of these notebooks. - -Validating a dihedral potential -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This example showcases how an IPython Notebook can be used to compare a simple -LAMMPS simulation of a harmonic dihedral potential to its analytical solution. -Four atoms are placed in the simulation and the dihedral potential is applied on -them using a datafile. Then one of the atoms is rotated along the central axis by -setting its position from Python, which changes the dihedral angle. - -.. code-block:: python - - phi = [d \* math.pi / 180 for d in range(360)] - - pos = [(1.0, math.cos(p), math.sin(p)) for p in phi] - - pe = [] - for p in pos: - L.atoms[3].position = p - L.run(0) - pe.append(L.eval("pe")) - -By evaluating the potential energy for each position we can verify that -trajectory with the analytical formula. To compare both solutions, we plot -both trajectories over each other using matplotlib, which embeds the generated -plot inside the IPython notebook. - -.. image:: JPG/pylammps_dihedral.jpg - :align: center - -Running a Monte Carlo relaxation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This second example shows how to use PyLammps to create a 2D Monte Carlo Relaxation -simulation, computing and plotting energy terms and even embedding video output. - -Initially, a 2D system is created in a state with minimal energy. - -.. image:: JPG/pylammps_mc_minimum.jpg - :align: center - -It is then disordered by moving each atom by a random delta. - -.. code-block:: python - - random.seed(27848) - deltaperturb = 0.2 - - for i in range(L.system.natoms): - x, y = L.atoms[i].position - dx = deltaperturb \* random.uniform(-1, 1) - dy = deltaperturb \* random.uniform(-1, 1) - L.atoms[i].position = (x+dx, y+dy) - - L.run(0) - -.. 
image:: JPG/pylammps_mc_disordered.jpg - :align: center - -Finally, the Monte Carlo algorithm is implemented in Python. It continuously -moves random atoms by a random delta and only accepts certain moves. - -.. code-block:: python - - estart = L.eval("pe") - elast = estart - - naccept = 0 - energies = [estart] - - niterations = 3000 - deltamove = 0.1 - kT = 0.05 - - natoms = L.system.natoms - - for i in range(niterations): - iatom = random.randrange(0, natoms) - current_atom = L.atoms[iatom] - - x0, y0 = current_atom.position - - dx = deltamove \* random.uniform(-1, 1) - dy = deltamove \* random.uniform(-1, 1) - - current_atom.position = (x0+dx, y0+dy) - - L.run(1, "pre no post no") - - e = L.eval("pe") - energies.append(e) - - if e <= elast: - naccept += 1 - elast = e - elif random.random() <= math.exp(natoms\*(elast-e)/kT): - naccept += 1 - elast = e - else: - current_atom.position = (x0, y0) - -The energies of each iteration are collected in a Python list and finally plotted using matplotlib. - -.. image:: JPG/pylammps_mc_energies_plot.jpg - :align: center - -The IPython notebook also shows how to use dump commands and embed video files -inside of the IPython notebook. - -Using PyLammps and mpi4py (Experimental) ----------------------------------------- - -PyLammps can be run in parallel using `mpi4py -`_. This python package can be installed -using - -.. code-block:: bash - - pip install mpi4py - -.. warning:: - - Usually, any :py:class:`PyLammps ` command must be - executed by *all* MPI processes. However, evaluations and querying - the system state is only available on MPI rank 0. Using these - functions from other MPI ranks will raise an exception. - -The following is a short example which reads in an existing LAMMPS input -file and executes it in parallel. You can find in.melt in the -examples/melt folder. Please take note that the -:py:meth:`PyLammps.eval() ` is called only from -MPI rank 0. - -.. 
code-block:: python - - from mpi4py import MPI - from lammps import PyLammps - - L = PyLammps() - L.file("in.melt") - - if MPI.COMM_WORLD.rank == 0: - print("Potential energy: ", L.eval("pe")) - - MPI.Finalize() - -To run this script (melt.py) in parallel using 4 MPI processes we invoke the -following mpirun command: - -.. code-block:: bash - - mpirun -np 4 python melt.py - -Feedback and Contributing -------------------------- - -If you find this Python interface useful, please feel free to provide feedback -and ideas on how to improve it to Richard Berger (richard.berger@outlook.com). We also -want to encourage people to write tutorial style IPython notebooks showcasing LAMMPS usage -and maybe their latest research results. +The PyLammps interface is deprecated and will be removed in a future release of +LAMMPS. As such, the PyLammps version of this tutorial has been removed and is +replaced by the :doc:`Howto_python`. diff --git a/doc/src/Howto_python.rst b/doc/src/Howto_python.rst new file mode 100644 index 0000000000..f668532f44 --- /dev/null +++ b/doc/src/Howto_python.rst @@ -0,0 +1,488 @@ +LAMMPS Python Tutorial +====================== + +.. contents:: + +Overview +-------- + +:py:class:`lammps ` is a Python wrapper class for the +LAMMPS :ref:`C language library interface API ` which is written using +`Python ctypes `_. + +In addition to the flat `ctypes `_ interface, this class exposes a +discoverable API that doesn't require knowledge of the underlying C++ +code implementation. + +Finally, the API exposes some additional features for `IPython integration +`_ into `Jupyter notebooks `_, e.g. for embedded +visualization output from :doc:`dump style image `. + +.. _ctypes: https://docs.python.org/3/library/ctypes.html +.. _ipython: https://ipython.org/ +.. 
_jupyter: https://jupyter.org/ + +Quick Start +----------- + +System-wide Installation +^^^^^^^^^^^^^^^^^^^^^^^^ + +Step 1: Building LAMMPS as a shared library +""""""""""""""""""""""""""""""""""""""""""" + +To use LAMMPS inside of Python it has to be compiled as shared +library. This library is then loaded by the Python interface. In this +example we enable the MOLECULE package and compile LAMMPS with PNG, JPEG +and FFMPEG output support enabled. + +Step 1a: For the CMake based build system, the steps are: + +.. code-block:: bash + + mkdir $LAMMPS_DIR/build-shared + cd $LAMMPS_DIR/build-shared + + # MPI, PNG, Jpeg, FFMPEG are auto-detected + cmake ../cmake -DPKG_MOLECULE=yes -DPKG_PYTHON=on -DBUILD_SHARED_LIBS=yes + make + +Step 1b: For the legacy, make based build system, the steps are: + +.. code-block:: bash + + cd $LAMMPS_DIR/src + + # add packages if necessary + make yes-MOLECULE + make yes-PYTHON + + # compile shared library using Makefile + make mpi mode=shlib LMP_INC="-DLAMMPS_PNG -DLAMMPS_JPEG -DLAMMPS_FFMPEG" JPG_LIB="-lpng -ljpeg" + +Step 2: Installing the LAMMPS Python package +"""""""""""""""""""""""""""""""""""""""""""" + +Next install the LAMMPS Python package into your current Python installation with: + +.. code-block:: bash + + make install-python + +.. note:: + + Recompiling the shared library requires re-installing the Python package + +Installation inside of a virtual environment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can use virtual environments to create a custom Python environment +specifically tuned for your workflow. 
+ +Benefits of using a virtualenv +"""""""""""""""""""""""""""""" + +* isolation of your system Python installation from your development installation +* installation can happen in your user directory without root access (useful for HPC clusters) +* installing packages through pip allows you to get newer versions of packages than e.g., through apt-get or yum package managers (and without root access) +* you can even install specific old versions of a package if necessary + +**Prerequisite (e.g. on Ubuntu)** + +.. code-block:: bash + + apt-get install python3-venv + +Creating a virtualenv with lammps installed +""""""""""""""""""""""""""""""""""""""""""" + +.. code-block:: bash + + # create virtual environment named 'testing' + python3 -m venv $HOME/python/testing + + # activate 'testing' environment + source $HOME/python/testing/bin/activate + +Now configure and compile the LAMMPS shared library as outlined above. +When using CMake and the shared library has already been built, you +need to re-run CMake to update the location of the python executable +to the location in the virtual environment with: + +.. code-block:: bash + + cmake . -DPython_EXECUTABLE=$(which python) + + # install LAMMPS package in virtualenv + (testing) make install-python + + # install other useful packages + (testing) pip install matplotlib jupyter mpi4py + + ... + + # return to original shell + (testing) deactivate + +Creating a new lammps instance +------------------------------ + +To create a lammps object you need to first import the class from the lammps +module. By using the default constructor, a new :py:class:`lammps +` instance is created. + +.. code-block:: python + + from lammps import lammps + L = lammps() + +Commands +-------- + +Sending a LAMMPS command with the library interface is done using +the ``command`` method of the lammps object. + +For instance, let's take the following LAMMPS command: + +.. 
code-block:: LAMMPS + + region box block 0 10 0 5 -0.5 0.5 + +This command can be executed with the following Python code if ``L`` is a ``lammps`` +instance: + +.. code-block:: python + + L.command("region box block 0 10 0 5 -0.5 0.5") + +For convenience, the ``lammps`` class also provides a command wrapper ``cmd`` +that turns any LAMMPS command into a regular function call: + +.. code-block:: python + + L.cmd.region("box block", 0, 10, 0, 5, -0.5, 0.5) + +Note that each parameter is set as Python number literal. With +the wrapper each command takes an arbitrary parameter list and transparently +merges it to a single command string, separating individual parameters by +white-space. + +The benefit of this approach is avoiding redundant command calls and easier +parameterization. With the ``command`` function each call needs to be assembled +manually using formatted strings. + +.. code-block:: python + + L.command(f"region box block {xlo} {xhi} {ylo} {yhi} {zlo} {zhi}") + +The wrapper accepts parameters directly and will convert +them automatically to a final command string. + +.. code-block:: python + + L.cmd.region("box block", xlo, xhi, ylo, yhi, zlo, zhi) + +.. note:: + + When running in IPython you can use Tab-completion after ``L.cmd.`` to see + all available LAMMPS commands. + +System state +------------ + +In addition to dispatching commands directly through the PyLammps object, it +also provides several properties which allow you to query the system state. 
+ +L.system + Is a dictionary describing the system such as the bounding box or number of atoms + +L.system.xlo, L.system.xhi + bounding box limits along x-axis + +L.system.ylo, L.system.yhi + bounding box limits along y-axis + +L.system.zlo, L.system.zhi + bounding box limits along z-axis + +L.communication + configuration of communication subsystem, such as the number of threads or processors + +L.communication.nthreads + number of threads used by each LAMMPS process + +L.communication.nprocs + number of MPI processes used by LAMMPS + +L.fixes + List of fixes in the current system + +L.computes + List of active computes in the current system + +L.dump + List of active dumps in the current system + +L.groups + List of groups present in the current system + +Working with LAMMPS variables +----------------------------- + +LAMMPS variables can be both defined and accessed via the PyLammps interface. + +To define a variable you can use the :doc:`variable ` command: + +.. code-block:: python + + L.variable("a index 2") + +A dictionary of all variables is returned by L.variables + +you can access an individual variable by retrieving a variable object from the +``L.variables`` dictionary by name + +.. code-block:: python + + a = L.variables['a'] + +The variable value can then be easily read and written by accessing the value +property of this object. + +.. code-block:: python + + print(a.value) + a.value = 4 + +Retrieving the value of an arbitrary LAMMPS expressions +------------------------------------------------------- + +LAMMPS expressions can be immediately evaluated by using the eval method. The +passed string parameter can be any expression containing global thermo values, +variables, compute or fix data. + +.. 
code-block:: python + + result = L.get_thermo("ke") # kinetic energy + result = L.get_thermo("pe") # potential energy + + result = L.extract_variable("t") / 2.0 + +Accessing atom data +------------------- + +All atoms in the current simulation can be accessed by using the L.atoms list. +Each element of this list is an object which exposes its properties (id, type, +position, velocity, force, etc.). + +.. code-block:: python + + # access first atom + atom_id = L.numpy.extract_atom("id") + atom_type = L.numpy.extract_atom("type") + + x = L.numpy.extract_atom("x") + v = L.numpy.extract_atom("v") + f = L.numpy.extract_atom("f") + +Some properties can also be used to set: + +.. code-block:: python + + # set position in 2D simulation + x[0] = (1.0, 0.0) + + # set position in 3D simulation + x[0] = (1.0, 0.0, 1.) + +Evaluating thermo data +---------------------- + +Each simulation run usually produces thermo output based on system state, +computes, fixes or variables. The trajectories of these values can be queried +after a run via the L.runs list. This list contains a growing list of run data. +The first element is the output of the first run, the second element that of +the second run. + +.. code-block:: python + + L.run(1000) + L.runs[0] # data of first 1000 time steps + + L.run(1000) + L.runs[1] # data of second 1000 time steps + +Each run contains a dictionary of all trajectories. Each trajectory is +accessible through its thermo name: + +.. code-block:: python + + L.runs[0].thermo.Step # list of time steps in first run + L.runs[0].thermo.Ke # list of kinetic energy values in first run + +Together with matplotlib plotting data out of LAMMPS becomes simple: + +.. 
code-block:: python + + import matplotlib.pyplot as plt + steps = L.runs[0].thermo.Step + ke = L.runs[0].thermo.Ke + plt.plot(steps, ke) + +Error handling with PyLammps +---------------------------- + +Using C++ exceptions in LAMMPS for errors allows capturing them on the +C++ side and rethrowing them on the Python side. This way you can handle +LAMMPS errors through the Python exception handling mechanism. + +.. warning:: + + Capturing a LAMMPS exception in Python can still mean that the + current LAMMPS process is in an illegal state and must be + terminated. It is advised to save your data and terminate the Python + instance as quickly as possible. + +Using LAMMPS in IPython notebooks and Jupyter +--------------------------------------------- + +If the LAMMPS Python package is installed for the same Python interpreter as +IPython, you can use LAMMPS directly inside of an IPython notebook inside of +Jupyter. Jupyter is a powerful integrated development environment (IDE) for +many dynamic languages like Python, Julia and others, which operates inside of +any web browser. Besides auto-completion and syntax highlighting it allows you +to create formatted documents using Markdown, mathematical formulas, graphics and +animations intermixed with executable Python code. It is a great format for +tutorials and showcasing your latest research. + +To launch an instance of Jupyter simply run the following command inside your +Python environment (this assumes you followed the Quick Start instructions): + +.. code-block:: bash + + jupyter notebook + +Interactive Python Examples +--------------------------- + +Examples of IPython notebooks can be found in the ``python/examples/ipython`` +subdirectory. To open these notebooks launch ``jupyter notebook`` inside this +directory and navigate to one of them. If you compiled and installed +a LAMMPS shared library with PNG, JPEG and FFMPEG support +you should be able to rerun all of these notebooks. 
+ +Validating a dihedral potential +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This example showcases how an IPython Notebook can be used to compare a simple +LAMMPS simulation of a harmonic dihedral potential to its analytical solution. +Four atoms are placed in the simulation and the dihedral potential is applied on +them using a datafile. Then one of the atoms is rotated along the central axis by +setting its position from Python, which changes the dihedral angle. + +.. code-block:: python + + phi = [d \* math.pi / 180 for d in range(360)] + + pos = [(1.0, math.cos(p), math.sin(p)) for p in phi] + + x = L.numpy.extract_atom("x") + + pe = [] + for p in pos: + x[3] = p + L.cmd.run(0) + pe.append(L.get_thermo("pe")) + +By evaluating the potential energy for each position we can verify that +trajectory with the analytical formula. To compare both solutions, we plot +both trajectories over each other using matplotlib, which embeds the generated +plot inside the IPython notebook. + +.. image:: JPG/pylammps_dihedral.jpg + :align: center + +Running a Monte Carlo relaxation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This second example shows how to use the `lammps` Python interface to create a +2D Monte Carlo Relaxation simulation, computing and plotting energy terms and +even embedding video output. + +Initially, a 2D system is created in a state with minimal energy. + +.. image:: JPG/pylammps_mc_minimum.jpg + :align: center + +It is then disordered by moving each atom by a random delta. + +.. code-block:: python + + random.seed(27848) + deltaperturb = 0.2 + x = L.numpy.extract_atom("x") + natoms = x.shape[0] + + for i in range(natoms): + dx = deltaperturb \* random.uniform(-1, 1) + dy = deltaperturb \* random.uniform(-1, 1) + x[i][0] += dx + x[i][1] += dy + + L.cmd.run(0) + +.. image:: JPG/pylammps_mc_disordered.jpg + :align: center + +Finally, the Monte Carlo algorithm is implemented in Python. It continuously +moves random atoms by a random delta and only accepts certain moves. 
+ +.. code-block:: python + + estart = L.get_thermo("pe") + elast = estart + + naccept = 0 + energies = [estart] + + niterations = 3000 + deltamove = 0.1 + kT = 0.05 + + for i in range(niterations): + x = L.numpy.extract_atom("x") + natoms = x.shape[0] + iatom = random.randrange(0, natoms) + current_atom = x[iatom] + + x0 = current_atom[0] + y0 = current_atom[1] + + dx = deltamove \* random.uniform(-1, 1) + dy = deltamove \* random.uniform(-1, 1) + + current_atom[0] = x0 + dx + current_atom[1] = y0 + dy + + L.cmd.run(1, "pre no post no") + + e = L.get_thermo("pe") + energies.append(e) + + if e <= elast: + naccept += 1 + elast = e + elif random.random() <= math.exp(natoms\*(elast-e)/kT): + naccept += 1 + elast = e + else: + current_atom[0] = x0 + current_atom[1] = y0 + +The energies of each iteration are collected in a Python list and finally plotted using matplotlib. + +.. image:: JPG/pylammps_mc_energies_plot.jpg + :align: center + +The IPython notebook also shows how to use dump commands and embed video files +inside of the IPython notebook. diff --git a/doc/src/Library.rst b/doc/src/Library.rst index 50c28b7fcd..e2021187c2 100644 --- a/doc/src/Library.rst +++ b/doc/src/Library.rst @@ -131,16 +131,15 @@ run LAMMPS in serial mode. .. _lammps_python_api: -LAMMPS Python APIs -================== +LAMMPS Python API +================= The LAMMPS Python module enables calling the LAMMPS C library API from Python by dynamically loading functions in the LAMMPS shared library through the `Python ctypes module `_. Because of the dynamic loading, it is **required** that LAMMPS is compiled in :ref:`"shared" mode `. The Python interface is object-oriented, but -otherwise tries to be very similar to the C library API. Three different -Python classes to run LAMMPS are available and they build on each other. +otherwise tries to be very similar to the C library API. More information on this is in the :doc:`Python_head` section of the manual. 
Use of the LAMMPS Python module is described in :doc:`Python_module`. diff --git a/doc/src/Python_atoms.rst b/doc/src/Python_atoms.rst index f01559a524..0a445f9b6b 100644 --- a/doc/src/Python_atoms.rst +++ b/doc/src/Python_atoms.rst @@ -2,14 +2,8 @@ Per-atom properties =================== Similar to what is described in :doc:`Library_atoms`, the instances of -:py:class:`lammps `, :py:class:`PyLammps `, or -:py:class:`IPyLammps ` can be used to extract atom quantities -and modify some of them. The main difference between the interfaces is how the information -is exposed. - -While the :py:class:`lammps ` is just a thin layer that wraps C API calls, -:py:class:`PyLammps ` and :py:class:`IPyLammps ` expose -information as objects and properties. +:py:class:`lammps ` can be used to extract atom quantities +and modify some of them. In some cases the data returned is a direct reference to the original data inside LAMMPS cast to ``ctypes`` pointers. Where possible, the wrappers will @@ -25,57 +19,25 @@ against invalid accesses. accordingly. These arrays can change sizes and order at every neighbor list rebuild and atom sort event as atoms are migrating between subdomains. -.. tabs:: +.. code-block:: python - .. tab:: lammps API + from lammps import lammps - .. code-block:: python + lmp = lammps() + lmp.file("in.sysinit") - from lammps import lammps + nlocal = lmp.extract_global("nlocal") + x = lmp.extract_atom("x") - lmp = lammps() - lmp.file("in.sysinit") + for i in range(nlocal): + print("(x,y,z) = (", x[i][0], x[i][1], x[i][2], ")") - nlocal = lmp.extract_global("nlocal") - x = lmp.extract_atom("x") + lmp.close() - for i in range(nlocal): - print("(x,y,z) = (", x[i][0], x[i][1], x[i][2], ")") +**Methods**: - lmp.close() +* :py:meth:`extract_atom() `: extract a per-atom quantity - **Methods**: - - * :py:meth:`extract_atom() `: extract a per-atom quantity - - **Numpy Methods**: - - * :py:meth:`numpy.extract_atom() `: extract a per-atom quantity as numpy array - - .. 
tab:: PyLammps/IPyLammps API - - All atoms in the current simulation can be accessed by using the :py:attr:`PyLammps.atoms ` property. - Each element of this list is a :py:class:`Atom ` or :py:class:`Atom2D ` object. The attributes of - these objects provide access to their data (id, type, position, velocity, force, etc.): - - .. code-block:: python - - # access first atom - L.atoms[0].id - L.atoms[0].type - - # access second atom - L.atoms[1].position - L.atoms[1].velocity - L.atoms[1].force - - Some attributes can be changed: - - .. code-block:: python - - # set position in 2D simulation - L.atoms[0].position = (1.0, 0.0) - - # set position in 3D simulation - L.atoms[0].position = (1.0, 0.0, 1.0) +**Numpy Methods**: +* :py:meth:`numpy.extract_atom() `: extract a per-atom quantity as numpy array diff --git a/doc/src/Python_create.rst b/doc/src/Python_create.rst index 939aad2f32..c1444c400e 100644 --- a/doc/src/Python_create.rst +++ b/doc/src/Python_create.rst @@ -26,108 +26,25 @@ to run the Python module like the library interface on a subset of the MPI ranks after splitting the communicator. -Here are simple examples using all three Python interfaces: +Here is a simple example using the LAMMPS Python interface: -.. tabs:: +.. code-block:: python - .. tab:: lammps API + from lammps import lammps - .. code-block:: python + # NOTE: argv[0] is set by the lammps class constructor + args = ["-log", "none"] - from lammps import lammps + # create LAMMPS instance + lmp = lammps(cmdargs=args) - # NOTE: argv[0] is set by the lammps class constructor - args = ["-log", "none"] + # get and print numerical version code + print("LAMMPS Version: ", lmp.version()) - # create LAMMPS instance - lmp = lammps(cmdargs=args) + # explicitly close and delete LAMMPS instance (optional) + lmp.close() - # get and print numerical version code - print("LAMMPS Version: ", lmp.version()) - - # explicitly close and delete LAMMPS instance (optional) - lmp.close() - - .. 
tab:: PyLammps API - - The :py:class:`PyLammps ` class is a wrapper around the - :py:class:`lammps ` class and all of its lower level functions. - By default, it will create a new instance of :py:class:`lammps ` passing - along all arguments to the constructor of :py:class:`lammps `. - - .. code-block:: python - - from lammps import PyLammps - - # NOTE: argv[0] is set by the lammps class constructor - args = ["-log", "none"] - - # create LAMMPS instance - L = PyLammps(cmdargs=args) - - # get and print numerical version code - print("LAMMPS Version: ", L.version()) - - # explicitly close and delete LAMMPS instance (optional) - L.close() - - :py:class:`PyLammps ` objects can also be created on top of an existing - :py:class:`lammps ` object: - - .. code-block:: python - - from lammps import lammps, PyLammps - ... - # create LAMMPS instance - lmp = lammps(cmdargs=args) - - # create PyLammps instance using previously created LAMMPS instance - L = PyLammps(ptr=lmp) - - This is useful if you have to create the :py:class:`lammps ` - instance is a specific way, but want to take advantage of the - :py:class:`PyLammps ` interface. - - .. tab:: IPyLammps API - - The :py:class:`IPyLammps ` class is an extension of the - :py:class:`PyLammps ` class. It has the same construction behavior. By - default, it will create a new instance of :py:class:`lammps` passing - along all arguments to the constructor of :py:class:`lammps`. - - .. code-block:: python - - from lammps import IPyLammps - - # NOTE: argv[0] is set by the lammps class constructor - args = ["-log", "none"] - - # create LAMMPS instance - L = IPyLammps(cmdargs=args) - - # get and print numerical version code - print("LAMMPS Version: ", L.version()) - - # explicitly close and delete LAMMPS instance (optional) - L.close() - - You can also initialize IPyLammps on top of an existing :py:class:`lammps` or :py:class:`PyLammps` object: - - .. code-block:: python - - from lammps import lammps, IPyLammps - ... 
- # create LAMMPS instance - lmp = lammps(cmdargs=args) - - # create PyLammps instance using previously created LAMMPS instance - L = PyLammps(ptr=lmp) - - This is useful if you have to create the :py:class:`lammps ` - instance is a specific way, but want to take advantage of the - :py:class:`IPyLammps ` interface. - -In all of the above cases, same as with the :ref:`C library API `, this will use the +Same as with the :ref:`C library API `, this will use the ``MPI_COMM_WORLD`` communicator for the MPI library that LAMMPS was compiled with. diff --git a/doc/src/Python_execute.rst b/doc/src/Python_execute.rst index 01cf0e920f..28c3ff5575 100644 --- a/doc/src/Python_execute.rst +++ b/doc/src/Python_execute.rst @@ -1,127 +1,119 @@ Executing commands ================== -Once an instance of the :py:class:`lammps `, -:py:class:`PyLammps `, or -:py:class:`IPyLammps ` class is created, there are +Once an instance of the :py:class:`lammps ` class is created, there are multiple ways to "feed" it commands. In a way that is not very different from running a LAMMPS input script, except that Python has many more facilities for structured programming than the LAMMPS input script syntax. Furthermore it is possible to "compute" what the next LAMMPS command should be. -.. tabs:: +Same as in the equivalent :doc:`C library functions `, +commands can be read from a file, a single string, a list of strings and a +block of commands in a single multi-line string. They are processed under the +same boundary conditions as the C library counterparts. The example below +demonstrates the use of :py:func:`lammps.file()`, :py:func:`lammps.command()`, +:py:func:`lammps.commands_list()`, and :py:func:`lammps.commands_string()`: - .. tab:: lammps API +.. code-block:: python - Same as in the equivalent - :doc:`C library functions `, commands can be read from a file, a - single string, a list of strings and a block of commands in a single - multi-line string. 
They are processed under the same boundary conditions - as the C library counterparts. The example below demonstrates the use - of :py:func:`lammps.file()`, :py:func:`lammps.command()`, - :py:func:`lammps.commands_list()`, and :py:func:`lammps.commands_string()`: + from lammps import lammps + lmp = lammps() - .. code-block:: python + # read commands from file 'in.melt' + lmp.file('in.melt') - from lammps import lammps - lmp = lammps() + # issue a single command + lmp.command('variable zpos index 1.0') - # read commands from file 'in.melt' - lmp.file('in.melt') + # create 10 groups with 10 atoms each + cmds = ["group g{} id {}:{}".format(i,10*i+1,10*(i+1)) for i in range(10)] + lmp.commands_list(cmds) - # issue a single command - lmp.command('variable zpos index 1.0') + # run commands from a multi-line string + block = """ + clear + region box block 0 2 0 2 0 2 + create_box 1 box + create_atoms 1 single 1.0 1.0 ${zpos} + """ + lmp.commands_string(block) - # create 10 groups with 10 atoms each - cmds = ["group g{} id {}:{}".format(i,10*i+1,10*(i+1)) for i in range(10)] - lmp.commands_list(cmds) + +Unlike the lammps API, the PyLammps/IPyLammps APIs allow running LAMMPS +commands by calling equivalent member functions of :py:class:`PyLammps ` +and :py:class:`IPyLammps ` instances. - # run commands from a multi-line string - block = """ - clear - region box block 0 2 0 2 0 2 - create_box 1 box - create_atoms 1 single 1.0 1.0 ${zpos} - """ - lmp.commands_string(block) +For instance, the following LAMMPS command - .. tab:: PyLammps/IPyLammps API +.. code-block:: LAMMPS - Unlike the lammps API, the PyLammps/IPyLammps APIs allow running LAMMPS - commands by calling equivalent member functions of :py:class:`PyLammps ` - and :py:class:`IPyLammps ` instances. + region box block 0 10 0 5 -0.5 0.5 - For instance, the following LAMMPS command +can be executed using with the lammps API with the following Python code if ``lmp`` is an +instance of :py:class:`lammps `: - .. 
code-block:: LAMMPS +.. code-block:: python - region box block 0 10 0 5 -0.5 0.5 + from lammps import lammps - can be executed using with the lammps API with the following Python code if ``lmp`` is an - instance of :py:class:`lammps `: + lmp = lammps() + lmp.command("region box block 0 10 0 5 -0.5 0.5") - .. code-block:: python +With the PyLammps interface, any LAMMPS command can be split up into arbitrary parts. +These parts are then passed to a member function with the name of the :doc:`command `. +For the :doc:`region ` command that means the :code:`region()` method can be called. +The arguments of the command can be passed as one string, or +individually. - from lammps import lammps +.. code-block:: python - lmp = lammps() - lmp.command("region box block 0 10 0 5 -0.5 0.5") + from lammps import lammps - With the PyLammps interface, any LAMMPS command can be split up into arbitrary parts. - These parts are then passed to a member function with the name of the :doc:`command `. - For the :doc:`region ` command that means the :code:`region()` method can be called. - The arguments of the command can be passed as one string, or - individually. + L = lammps() - .. code-block:: python + # pass command parameters as one string + L.cmd.region("box block 0 10 0 5 -0.5 0.5") - from lammps import PyLammps + # OR pass them individually + L.cmd.region("box block", 0, 10, 0, 5, -0.5, 0.5) - L = PyLammps() +In the latter example, all parameters except the first are Python floating-point literals. The +member function takes the entire parameter list and transparently merges it to a single command +string. - # pass command parameters as one string - L.region("box block 0 10 0 5 -0.5 0.5") +The benefit of this approach is avoiding redundant command calls and easier +parameterization. In the lammps API parameterization needed to be done +manually by creating formatted command strings. - # OR pass them individually - L.region("box block", 0, 10, 0, 5, -0.5, 0.5) +.. 
code-block:: python - In the latter example, all parameters except the first are Python floating-point literals. The - member function takes the entire parameter list and transparently merges it to a single command - string. + lmp.command("region box block %f %f %f %f %f %f" % (xlo, xhi, ylo, yhi, zlo, zhi)) - The benefit of this approach is avoiding redundant command calls and easier - parameterization. In the lammps API parameterization needed to be done - manually by creating formatted command strings. +In contrast, methods of PyLammps accept parameters directly and will convert +them automatically to a final command string. - .. code-block:: python +.. code-block:: python - lmp.command("region box block %f %f %f %f %f %f" % (xlo, xhi, ylo, yhi, zlo, zhi)) + L.cmd.region("box block", xlo, xhi, ylo, yhi, zlo, zhi) - In contrast, methods of PyLammps accept parameters directly and will convert - them automatically to a final command string. +Using these facilities, the previous example shown above can be rewritten as follows: - .. code-block:: python +.. code-block:: python - L.region("box block", xlo, xhi, ylo, yhi, zlo, zhi) + from lammps import lammps + L = lammps() - Using these facilities, the example shown for the lammps API can be rewritten as follows: + # read commands from file 'in.melt' + L.file('in.melt') - .. 
code-block:: python + # issue a single command + L.cmd.variable('zpos', 'index', 1.0) - from lammps import PyLammps - L = PyLammps() + # create 10 groups with 10 atoms each + for i in range(10): + L.cmd.group(f"g{i}", "id", f"{10*i+1}:{10*(i+1)}") - # read commands from file 'in.melt' - L.file('in.melt') - - # issue a single command - L.variable('zpos', 'index', 1.0) - - # create 10 groups with 10 atoms each - for i in range(10): - L.group(f"g{i}", "id", f"{10*i+1}:{10*(i+1)}") - - L.clear() - L.region("box block", 0, 2, 0, 2, 0, 2) - L.create_box(1, "box") - L.create_atoms(1, "single", 1.0, 1.0, "${zpos}") + L.cmd.clear() + L.cmd.region("box block", 0, 2, 0, 2, 0, 2) + L.cmd.create_box(1, "box") + L.cmd.create_atoms(1, "single", 1.0, 1.0, "${zpos}") diff --git a/doc/src/Python_module.rst b/doc/src/Python_module.rst index c19d4b0345..9c60982e1b 100644 --- a/doc/src/Python_module.rst +++ b/doc/src/Python_module.rst @@ -10,19 +10,13 @@ be installed into a Python system folder or a user folder with ``make install-python``. Components of the module can then loaded into a Python session with the ``import`` command. -There are multiple Python interface classes in the :py:mod:`lammps` module: +.. warning:: -- the :py:class:`lammps ` class. This is a wrapper around - the C-library interface and its member functions try to replicate the - :ref:`C-library API ` closely. This is the most - feature-complete Python API. -- the :py:class:`PyLammps ` class. This is a more high-level - and more Python style class implemented on top of the - :py:class:`lammps ` class. -- the :py:class:`IPyLammps ` class is derived from - :py:class:`PyLammps ` and adds embedded graphics - features to conveniently include LAMMPS into `Jupyter - `_ notebooks. + Alternative interfaces such as :py:class:`PyLammps ` and + :py:class:`IPyLammps ` classes have been deprecated and + will be removed in a future version of LAMMPS. 
The :doc:`Howto_pylammps` has + also been replaced by a reworked :doc:`Howto_python` that showcases how to + use the modern Python API facilities instead. .. _mpi4py_url: https://mpi4py.readthedocs.io @@ -49,7 +43,7 @@ The ``lammps`` class API ======================== The :py:class:`lammps ` class is the core of the LAMMPS -Python interfaces. It is a wrapper around the :ref:`LAMMPS C library +Python interface. It is a wrapper around the :ref:`LAMMPS C library API ` using the `Python ctypes module `_ and a shared library compiled from the LAMMPS sources code. The individual methods in this @@ -64,40 +58,7 @@ functions. Below is a detailed documentation of the API. .. autoclass:: lammps.numpy_wrapper::numpy_wrapper :members: ----------- - -The ``PyLammps`` class API -========================== - -The :py:class:`PyLammps ` class is a wrapper that creates a -simpler, more "Pythonic" interface to common LAMMPS functionality. LAMMPS -data structures are exposed through objects and properties. This makes Python -scripts shorter and more concise. See the :doc:`PyLammps Tutorial -` for an introduction on how to use this interface. - -.. autoclass:: lammps.PyLammps - :members: - -.. autoclass:: lammps.AtomList - :members: - -.. autoclass:: lammps.Atom - :members: - -.. autoclass:: lammps.Atom2D - :members: - ----------- - -The ``IPyLammps`` class API -=========================== - -The :py:class:`IPyLammps ` class is an extension of -:py:class:`PyLammps `, adding additional functions to -quickly display visualizations such as images and videos inside of IPython. -See the :doc:`PyLammps Tutorial ` for examples. - -.. autoclass:: lammps.IPyLammps +.. autoclass:: lammps.ipython::wrapper :members: ---------- diff --git a/doc/src/Python_overview.rst b/doc/src/Python_overview.rst index a13da0d512..85bc0d3bfa 100644 --- a/doc/src/Python_overview.rst +++ b/doc/src/Python_overview.rst @@ -56,7 +56,7 @@ Below is an example output for Python version 3.8.5. 
--------- -LAMMPS can work together with Python in three ways. First, Python can +LAMMPS can work together with Python in two ways. First, Python can wrap LAMMPS through the its :doc:`library interface `, so that a Python script can create one or more instances of LAMMPS and launch one or more simulations. In Python terms, this is referred to as @@ -67,22 +67,7 @@ launch one or more simulations. In Python terms, this is referred to as Launching LAMMPS via Python - -Second, the lower-level Python interface in the :py:class:`lammps Python -class ` can be used indirectly through the provided -:py:class:`PyLammps ` and :py:class:`IPyLammps -` wrapper classes, also written in Python. These -wrappers try to simplify the usage of LAMMPS in Python by providing a -more object-based interface to common LAMMPS functionality. They also -reduce the amount of code necessary to parameterize LAMMPS scripts -through Python and make variables and computes directly accessible. - -.. figure:: JPG/pylammps-invoke-lammps.png - :figclass: align-center - - Using the PyLammps / IPyLammps wrappers - -Third, LAMMPS can use the Python interpreter, so that a LAMMPS input +Second, LAMMPS can use the Python interpreter, so that a LAMMPS input script or styles can invoke Python code directly, and pass information back-and-forth between the input script and Python functions you write. 
This Python code can also call back to LAMMPS to query or change its diff --git a/doc/utils/requirements.txt b/doc/utils/requirements.txt index acf575fe58..d842f47c11 100644 --- a/doc/utils/requirements.txt +++ b/doc/utils/requirements.txt @@ -9,3 +9,4 @@ Pygments six pyyaml linkchecker +ipython diff --git a/python/examples/pylammps/.gitignore b/python/examples/ipython/.gitignore similarity index 100% rename from python/examples/pylammps/.gitignore rename to python/examples/ipython/.gitignore diff --git a/python/examples/pylammps/README.md b/python/examples/ipython/README.md similarity index 80% rename from python/examples/pylammps/README.md rename to python/examples/ipython/README.md index e66f5a2a8e..ef8356fc6a 100644 --- a/python/examples/pylammps/README.md +++ b/python/examples/ipython/README.md @@ -1,10 +1,10 @@ -# PyLammps and Jupyter Notebooks +# IPython and Jupyter Notebooks -This folder contains examples showcasing the usage of the PyLammps Python +This folder contains examples showcasing the usage of the LAMMPS Python interface and Jupyter notebooks. To use this you will need LAMMPS compiled as a shared library and the LAMMPS Python package installed. -An extensive guide on how to achieve this is documented in the [LAMMPS manual](https://docs.lammps.org/Python_install.html). There is also a [PyLammps tutorial](https://docs.lammps.org/Howto_pylammps.html). +An extensive guide on how to achieve this is documented in the [LAMMPS manual](https://docs.lammps.org/Python_install.html). There is also a [LAMMPS Python tutorial](https://docs.lammps.org/Howto_python.html). 
The following will show one way of creating a Python virtual environment which has both LAMMPS and its Python package installed: @@ -53,7 +53,7 @@ which has both LAMMPS and its Python package installed: ```shell (myenv)$ cmake -C ../cmake/presets/basic.cmake \ -D BUILD_SHARED_LIBS=on \ - -D LAMMPS_EXCEPTIONS=on -D PKG_PYTHON=on \ + -D PKG_PYTHON=on \ -D CMAKE_INSTALL_PREFIX=$VIRTUAL_ENV \ ../cmake ``` @@ -67,19 +67,19 @@ which has both LAMMPS and its Python package installed: 8. Install LAMMPS and Python package into virtual environment ```shell - (myenv)$ cmake --install . + (myenv)$ make install-python ``` 9. Install other Python packages into virtual environment ```shell - (myenv)$ pip install jupyter matplotlib mpi4py + (myenv)$ pip install jupyter matplotlib pandas mpi4py ``` -10. Navigate to pylammps examples folder +10. Navigate to ipython examples folder ```shell - (myenv)$ cd ../python/examples/pylammmps + (myenv)$ cd ../python/examples/ipython ``` 11. Launch Jupyter and work inside browser diff --git a/python/examples/pylammps/interface_usage_bonds.ipynb b/python/examples/ipython/atoms.ipynb similarity index 74% rename from python/examples/pylammps/interface_usage_bonds.ipynb rename to python/examples/ipython/atoms.ipynb index 0203ceb5c4..14b60d4e28 100644 --- a/python/examples/pylammps/interface_usage_bonds.ipynb +++ b/python/examples/ipython/atoms.ipynb @@ -4,16 +4,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Example 3: 2D circle of particles inside of box with LJ walls" + "# Example 3: Using Atom Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Prerequisites\n", - "\n", - "Before running this example, make sure your Python environment can find the LAMMPS shared library (`liblammps.so`) and the LAMMPS Python package is installed. If you followed the [README](README.md) in this folder, this should already be the case.
You can also find more information about how to compile LAMMPS and install the LAMMPS Python package in the [LAMMPS manual](https://docs.lammps.org/Python_install.html). There is also a dedicated [PyLammps HowTo](https://docs.lammps.org/Howto_pylammps.html)." + "Author: [Richard Berger](mailto:richard.berger@outlook.com)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2D circle of particles inside of box with LJ walls" ] }, { @@ -29,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "from lammps import IPyLammps" + "from lammps import lammps" ] }, { @@ -38,7 +43,8 @@ "metadata": {}, "outputs": [], "source": [ - "L = IPyLammps()" + "L = lammps()\n", + "cmd = L.cmd" ] }, { @@ -60,36 +66,36 @@ "v = 0.3\n", "w = 0.08\n", " \n", - "L.units(\"lj\")\n", - "L.dimension(2)\n", - "L.atom_style(\"bond\")\n", - "L.boundary(\"f f p\")\n", + "cmd.units(\"lj\")\n", + "cmd.dimension(2)\n", + "cmd.atom_style(\"bond\")\n", + "cmd.boundary(\"f f p\")\n", "\n", - "L.lattice(\"hex\", 0.85)\n", - "L.region(\"box\", \"block\", 0, x, 0, y, -0.5, 0.5)\n", - "L.create_box(1, \"box\", \"bond/types\", 1, \"extra/bond/per/atom\", 6)\n", - "L.region(\"circle\", \"sphere\", d/2.0+1.0, d/2.0/math.sqrt(3.0)+1, 0.0, d/2.0)\n", - "L.create_atoms(1, \"region\", \"circle\")\n", - "L.mass(1, 1.0)\n", + "cmd.lattice(\"hex\", 0.85)\n", + "cmd.region(\"box\", \"block\", 0, x, 0, y, -0.5, 0.5)\n", + "cmd.create_box(1, \"box\", \"bond/types\", 1, \"extra/bond/per/atom\", 6)\n", + "cmd.region(\"circle\", \"sphere\", d/2.0+1.0, d/2.0/math.sqrt(3.0)+1, 0.0, d/2.0)\n", + "cmd.create_atoms(1, \"region\", \"circle\")\n", + "cmd.mass(1, 1.0)\n", "\n", - "L.velocity(\"all create 0.5 87287 loop geom\")\n", - "L.velocity(\"all set\", v, w, 0, \"sum yes\")\n", + "cmd.velocity(\"all create 0.5 87287 loop geom\")\n", + "cmd.velocity(\"all set\", v, w, 0, \"sum yes\")\n", "\n", - "L.pair_style(\"lj/cut\", 2.5)\n", - "L.pair_coeff(1, 1, 10.0, 1.0, 2.5)\n", + "cmd.pair_style(\"lj/cut\", 
2.5)\n", + "cmd.pair_coeff(1, 1, 10.0, 1.0, 2.5)\n", "\n", - "L.bond_style(\"harmonic\")\n", - "L.bond_coeff(1, 10.0, 1.2)\n", + "cmd.bond_style(\"harmonic\")\n", + "cmd.bond_coeff(1, 10.0, 1.2)\n", "\n", - "L.create_bonds(\"many\", \"all\", \"all\", 1, 1.0, 1.5)\n", + "cmd.create_bonds(\"many\", \"all\", \"all\", 1, 1.0, 1.5)\n", "\n", - "L.neighbor(0.3, \"bin\")\n", - "L.neigh_modify(\"delay\", 0, \"every\", 1, \"check yes\")\n", + "cmd.neighbor(0.3, \"bin\")\n", + "cmd.neigh_modify(\"delay\", 0, \"every\", 1, \"check yes\")\n", "\n", - "L.fix(1, \"all\", \"nve\")\n", + "cmd.fix(1, \"all\", \"nve\")\n", "\n", - "L.fix(2, \"all wall/lj93 xlo 0.0 1 1 2.5 xhi\", x, \"1 1 2.5\")\n", - "L.fix(3, \"all wall/lj93 ylo 0.0 1 1 2.5 yhi\", y, \"1 1 2.5\")" + "cmd.fix(2, \"all wall/lj93 xlo 0.0 1 1 2.5 xhi\", x, \"1 1 2.5\")\n", + "cmd.fix(3, \"all wall/lj93 ylo 0.0 1 1 2.5 yhi\", y, \"1 1 2.5\")" ] }, { @@ -105,7 +111,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.image(zoom=1.8)" + "L.ipython.image(zoom=1.8)" ] }, { @@ -121,10 +127,10 @@ "metadata": {}, "outputs": [], "source": [ - "L.thermo_style(\"custom step temp epair press\")\n", - "L.thermo(100)\n", - "output = L.run(40000)\n", - "L.image(zoom=1.8)" + "cmd.thermo_style(\"custom step temp epair press\")\n", + "cmd.thermo(100)\n", + "output = cmd.run(40000)\n", + "L.ipython.image(zoom=1.8)" ] }, { @@ -366,7 +372,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.eval(\"ke\")" + "L.expand(\"ke\")" ] }, { @@ -382,7 +388,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.atoms[0]" + "L.numpy.extract_atom(\"x\")" ] }, { @@ -391,7 +397,7 @@ "metadata": {}, "outputs": [], "source": [ - "dir(L.atoms[0])" + "L.numpy.extract_atom(\"id\")" ] }, { @@ -400,7 +406,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.atoms[0].position" + "L.numpy.extract_atom(\"v\")" ] }, { @@ -409,7 +415,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.atoms[0].id" + "L.numpy.extract_atom(\"f\")" ] }, { @@ -418,25 +424,7 @@ 
"metadata": {}, "outputs": [], "source": [ - "L.atoms[0].velocity" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.atoms[0].force" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.atoms[0].type" + "L.numpy.extract_atom(\"type\")" ] }, { @@ -449,7 +437,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -463,9 +451,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.6" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/python/examples/pylammps/dihedrals/data.dihedral b/python/examples/ipython/dihedrals/data.dihedral similarity index 100% rename from python/examples/pylammps/dihedrals/data.dihedral rename to python/examples/ipython/dihedrals/data.dihedral diff --git a/python/examples/pylammps/dihedrals/dihedral.ipynb b/python/examples/ipython/dihedrals/dihedral.ipynb similarity index 75% rename from python/examples/pylammps/dihedrals/dihedral.ipynb rename to python/examples/ipython/dihedrals/dihedral.ipynb index 240e3e8bd6..4fece8aa58 100644 --- a/python/examples/pylammps/dihedrals/dihedral.ipynb +++ b/python/examples/ipython/dihedrals/dihedral.ipynb @@ -13,7 +13,8 @@ "metadata": {}, "outputs": [], "source": [ - "%matplotlib notebook" + "import matplotlib.pyplot as plt\n", + "from lammps import lammps" ] }, { @@ -22,25 +23,8 @@ "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from lammps import IPyLammps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L = IPyLammps()" + "L = lammps()\n", + "cmd = L.cmd" ] }, { @@ -51,13 
+35,13 @@ "source": [ "import math\n", "\n", - "L.units(\"real\")\n", - "L.atom_style(\"molecular\")\n", + "cmd.units(\"real\")\n", + "cmd.atom_style(\"molecular\")\n", "\n", - "L.boundary(\"f f f\")\n", - "L.neighbor(0.3, \"bin\")\n", + "cmd.boundary(\"f f f\")\n", + "cmd.neighbor(0.3, \"bin\")\n", "\n", - "L.dihedral_style(\"harmonic\")" + "cmd.dihedral_style(\"harmonic\")" ] }, { @@ -66,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.read_data(\"data.dihedral\")" + "cmd.read_data(\"data.dihedral\")" ] }, { @@ -75,8 +59,8 @@ "metadata": {}, "outputs": [], "source": [ - "L.pair_style(\"zero\", 5)\n", - "L.pair_coeff(\"*\", \"*\")" + "cmd.pair_style(\"zero\", 5)\n", + "cmd.pair_coeff(\"*\", \"*\")" ] }, { @@ -85,7 +69,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.mass(1, 1.0)" + "cmd.mass(1, 1.0)" ] }, { @@ -94,7 +78,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.velocity(\"all\", \"set\", 0.0, 0.0, 0.0)" + "cmd.velocity(\"all\", \"set\", 0.0, 0.0, 0.0)" ] }, { @@ -103,7 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.run(0);" + "cmd.run(0);" ] }, { @@ -112,7 +96,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.image(zoom=1.0)" + "L.ipython.image(zoom=1.0,size=[320,320])" ] }, { @@ -121,7 +105,8 @@ "metadata": {}, "outputs": [], "source": [ - "L.atoms[3].position" + "x = L.numpy.extract_atom(\"x\")\n", + "print(x[3])" ] }, { @@ -130,7 +115,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.atoms[3].position = (1.0, 0.0, 1.0)" + "x[3] = (1.0, 0.0, 1.0)" ] }, { @@ -139,7 +124,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.image(zoom=1.0)" + "L.ipython.image(zoom=1.0,size=[320,320])" ] }, { @@ -148,7 +133,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.eval(\"pe\")" + "L.get_thermo(\"pe\")" ] }, { @@ -157,7 +142,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.atoms[3].position = (1.0, 0.0, -1.0)" + "x[3] = (1.0, 0.0, -1.0)" ] }, { @@ -166,7 +151,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.run(0);" + 
"cmd.run(0)" ] }, { @@ -207,9 +192,9 @@ "source": [ "pe = []\n", "for p in pos:\n", - " L.atoms[3].position = p\n", - " L.run(0);\n", - " pe.append(L.eval(\"pe\"))" + " x[3] = p\n", + " cmd.run(0);\n", + " pe.append(L.get_thermo(\"pe\"))" ] }, { @@ -233,7 +218,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -247,9 +232,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.6" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/python/examples/pylammps/elastic/Au.data b/python/examples/ipython/elastic/Au.data similarity index 100% rename from python/examples/pylammps/elastic/Au.data rename to python/examples/ipython/elastic/Au.data diff --git a/python/examples/pylammps/elastic/README b/python/examples/ipython/elastic/README similarity index 100% rename from python/examples/pylammps/elastic/README rename to python/examples/ipython/elastic/README diff --git a/python/examples/pylammps/elastic/elastic.py b/python/examples/ipython/elastic/elastic.py similarity index 99% rename from python/examples/pylammps/elastic/elastic.py rename to python/examples/ipython/elastic/elastic.py index 48f97925da..e69b5394bf 100644 --- a/python/examples/pylammps/elastic/elastic.py +++ b/python/examples/ipython/elastic/elastic.py @@ -158,7 +158,8 @@ def elastic(): parser.add_argument("--up", type=float, default=1.0e-6, help="the deformation magnitude (in strain units)") args = parser.parse_args() - L = PyLammps() + lmp = lammps() + L = lmp.cmd L.units("metal") diff --git a/python/examples/ipython/index.ipynb b/python/examples/ipython/index.ipynb new file mode 100644 index 0000000000..9fdac385fd --- /dev/null +++ b/python/examples/ipython/index.ipynb @@ -0,0 +1,61 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "666d3036-47d5-44d2-bc1a-ca4b00a9e9b8", + "metadata": {}, + "source": [ 
+ "# LAMMPS IPython Tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "f1422a43-f76b-456b-bf76-61ad92bd4ff0", + "metadata": {}, + "source": [ + "Author: [Richard Berger](mailto:richard.berger@outlook.com)" + ] + }, + { + "cell_type": "markdown", + "id": "8f2ea92d-8cc3-4999-81a0-79aa55bb66ab", + "metadata": {}, + "source": [ + "## Contents\n", + "\n", + "- [Example 1: Using LAMMPS with Python](simple.ipynb)\n", + "- [Example 2: Analyzing LAMMPS thermodynamic data](thermo.ipynb)\n", + "- [Example 3: Using Atom Data](atoms.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b41dc533-be6d-4450-8ad7-7345e9f44ea3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/examples/pylammps/montecarlo/mc.ipynb b/python/examples/ipython/montecarlo/mc.ipynb similarity index 75% rename from python/examples/pylammps/montecarlo/mc.ipynb rename to python/examples/ipython/montecarlo/mc.ipynb index 8058a9eb41..b1cfa488eb 100644 --- a/python/examples/pylammps/montecarlo/mc.ipynb +++ b/python/examples/ipython/montecarlo/mc.ipynb @@ -13,16 +13,6 @@ "metadata": {}, "outputs": [], "source": [ - "from __future__ import print_function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", "import matplotlib.pyplot as plt" ] }, @@ -48,7 +38,7 @@ "metadata": {}, "outputs": [], "source": [ - "from lammps import IPyLammps" + "from lammps import lammps" ] }, { @@ -57,7 +47,8 @@ "metadata": {}, "outputs": [], "source": [ - "L = 
IPyLammps()" + "L = lammps()\n", + "cmd = L.cmd" ] }, { @@ -66,25 +57,25 @@ "metadata": {}, "outputs": [], "source": [ - "L.units(\"lj\")\n", - "L.atom_style(\"atomic\")\n", - "L.atom_modify(\"map array sort\", 0, 0.0)\n", + "cmd.units(\"lj\")\n", + "cmd.atom_style(\"atomic\")\n", + "cmd.atom_modify(\"map array sort\", 0, 0.0)\n", "\n", - "L.dimension(2)\n", + "cmd.dimension(2)\n", "\n", - "L.lattice(\"hex\", 1.0)\n", - "L.region(\"box block\", 0, 10, 0, 5, -0.5, 0.5)\n", + "cmd.lattice(\"hex\", 1.0)\n", + "cmd.region(\"box block\", 0, 10, 0, 5, -0.5, 0.5)\n", "\n", - "L.create_box(1, \"box\")\n", - "L.create_atoms(1, \"box\")\n", - "L.mass(1, 1.0)\n", + "cmd.create_box(1, \"box\")\n", + "cmd.create_atoms(1, \"box\")\n", + "cmd.mass(1, 1.0)\n", "\n", - "L.pair_style(\"lj/cut\", 2.5)\n", - "L.pair_coeff(1, 1, 1.0, 1.0, 2.5)\n", - "L.pair_modify(\"shift\", \"yes\")\n", + "cmd.pair_style(\"lj/cut\", 2.5)\n", + "cmd.pair_coeff(1, 1, 1.0, 1.0, 2.5)\n", + "cmd.pair_modify(\"shift\", \"yes\")\n", "\n", - "L.neighbor(0.3, \"bin\")\n", - "L.neigh_modify(\"delay\", 0, \"every\", 1, \"check\", \"yes\")" + "cmd.neighbor(0.3, \"bin\")\n", + "cmd.neigh_modify(\"delay\", 0, \"every\", 1, \"check\", \"yes\")" ] }, { @@ -93,7 +84,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.image(zoom=1.6)" + "L.ipython.image(zoom=1.6,size=[320,320])" ] }, { @@ -102,7 +93,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.run(0);" + "cmd.run(0)" ] }, { @@ -111,7 +102,7 @@ "metadata": {}, "outputs": [], "source": [ - "emin = L.eval(\"pe\")" + "emin = L.get_thermo(\"pe\")" ] }, { @@ -120,7 +111,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.dump(\"3 all movie 25 movie.mp4 type type zoom 1.6 adiam 1.0\")" + "cmd.dump(\"3 all movie 25 movie.mp4 type type zoom 1.6 adiam 1.0\")" ] }, { @@ -146,11 +137,12 @@ "metadata": {}, "outputs": [], "source": [ - "for i in range(L.system.natoms):\n", - " x, y = L.atoms[i].position\n", + "pos = L.numpy.extract_atom(\"x\")\n", + "for i in 
range(len(pos)):\n", + " x, y = pos[i][0], pos[i][1]\n", " dx = deltaperturb * random.uniform(-1, 1)\n", " dy = deltaperturb * random.uniform(-1, 1)\n", - " L.atoms[i].position = (x+dx, y+dy)" + " pos[i] = (x+dx, y+dy, 0)" ] }, { @@ -159,7 +151,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.run(0);" + "cmd.run(0)" ] }, { @@ -168,7 +160,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.image(zoom=1.6)" + "L.ipython.image(zoom=1.6,size=[320,320])" ] }, { @@ -184,7 +176,7 @@ "metadata": {}, "outputs": [], "source": [ - "estart = L.eval(\"pe\")\n", + "estart = L.get_thermo(\"pe\")\n", "elast = estart" ] }, @@ -223,22 +215,23 @@ "metadata": {}, "outputs": [], "source": [ - "natoms = L.system.natoms\n", + "natoms = L.extract_global(\"natoms\")\n", "\n", "for i in range(niterations):\n", + " pos = L.numpy.extract_atom(\"x\")\n", " iatom = random.randrange(0, natoms)\n", - " current_atom = L.atoms[iatom]\n", + " current_atom = pos[iatom]\n", " \n", - " x0, y0 = current_atom.position\n", + " x0, y0 = current_atom[0], current_atom[1]\n", " \n", " dx = deltamove * random.uniform(-1, 1)\n", " dy = deltamove * random.uniform(-1, 1)\n", " \n", - " current_atom.position = (x0+dx, y0+dy)\n", + " pos[iatom] = (x0+dx, y0+dy, 0)\n", " \n", - " L.run(1, \"pre no post no\")\n", + " cmd.run(1, \"pre no post no\")\n", " \n", - " e = L.eval(\"pe\")\n", + " e = L.get_thermo(\"pe\")\n", " energies.append(e)\n", " \n", " if e <= elast:\n", @@ -248,7 +241,7 @@ " naccept += 1\n", " elast = e\n", " else:\n", - " current_atom.position = (x0, y0)" + " pos[iatom] = (x0, y0, 0)" ] }, { @@ -268,7 +261,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.eval(\"pe\")" + "L.get_thermo(\"pe\")" ] }, { @@ -304,7 +297,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.image(zoom=1.6)" + "L.ipython.image(zoom=1.6, size=[320,320])" ] }, { @@ -314,7 +307,7 @@ "outputs": [], "source": [ "# close dump file to access it\n", - "L.undump(3)" + "cmd.undump(3)" ] }, { @@ -323,7 +316,7 @@ "metadata": {}, 
"outputs": [], "source": [ - "L.video(\"movie.mp4\")" + "L.ipython.video(\"movie.mp4\")" ] }, { @@ -336,7 +329,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -350,9 +343,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.6" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/python/examples/pylammps/mpi4py/hello.py b/python/examples/ipython/mpi4py/hello.py similarity index 100% rename from python/examples/pylammps/mpi4py/hello.py rename to python/examples/ipython/mpi4py/hello.py diff --git a/python/examples/pylammps/mpi4py/in.melt b/python/examples/ipython/mpi4py/in.melt similarity index 100% rename from python/examples/pylammps/mpi4py/in.melt rename to python/examples/ipython/mpi4py/in.melt diff --git a/python/examples/pylammps/mpi4py/melt.py b/python/examples/ipython/mpi4py/melt.py similarity index 61% rename from python/examples/pylammps/mpi4py/melt.py rename to python/examples/ipython/mpi4py/melt.py index ad9c54c0b5..e51a914775 100644 --- a/python/examples/pylammps/mpi4py/melt.py +++ b/python/examples/ipython/mpi4py/melt.py @@ -1,10 +1,10 @@ from mpi4py import MPI -from lammps import PyLammps +from lammps import lammps -L = PyLammps() +L = lammps() L.file('in.melt') if MPI.COMM_WORLD.rank == 0: - pe = L.eval("pe") + pe = L.get_thermo("pe") print("Potential Energy:", pe) diff --git a/python/examples/pylammps/simple.ipynb b/python/examples/ipython/simple.ipynb similarity index 53% rename from python/examples/pylammps/simple.ipynb rename to python/examples/ipython/simple.ipynb index 170a33ebbc..d45c56db18 100644 --- a/python/examples/pylammps/simple.ipynb +++ b/python/examples/ipython/simple.ipynb @@ -4,14 +4,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Example 1: Using LAMMPS with PyLammps" + "" ] }, { "cell_type": "markdown", "metadata": 
{}, "source": [ - "The LAMMPS Python package provides multiple interfaces. The `PyLammps` interface is a high-level abstration of the low-level `lammps` interface. `IPyLammps` further extends this interface with functions that are useful for Jupyter notebooks to enable embedding generated graphics and videos." + "# Example 1: Using LAMMPS with Python" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Author: [Richard Berger](mailto:richard.berger@outlook.com)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The LAMMPS Python package enables calling the LAMMPS C library API." ] }, { @@ -20,7 +34,7 @@ "source": [ "## Prerequisites\n", "\n", - "Before Running this example, make sure your Python environment can find the LAMMPS shared library (`liblammps.so`) and the LAMMPS Python package is installed. If you followed the [README](README.md) in this folder, this should already be the case. You can also find more information about how to compile LAMMPS and install the LAMMPS Python package in the [LAMMPS manual](https://docs.lammps.org/Python_install.html). There is also a dedicated [PyLammps HowTo](https://docs.lammps.org/Howto_pylammps.html)." + "Before running this example, make sure your Python environment can find the LAMMPS shared library (`liblammps.so`) and the LAMMPS Python package is installed. If you followed the [README](README.md) in this folder, this should already be the case. You can also find more information about how to compile LAMMPS and install the LAMMPS Python package in the [LAMMPS manual](https://docs.lammps.org/Python_install.html). There is also a dedicated [LAMMPS Python HowTo](https://docs.lammps.org/Howto_python.html)." 
] }, { @@ -38,17 +52,17 @@ "metadata": {}, "outputs": [], "source": [ - "from lammps import IPyLammps\n", - "L = IPyLammps()" + "from lammps import lammps\n", + "L = lammps()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "With `PyLammps`/`IPyLammps` you can write LAMMPS simulations similar to the input script language. Take the following LAMMPS input script:\n", + "With the `lammps` class you can write LAMMPS simulations similar to the input script language. Take the following LAMMPS input script:\n", "\n", - "```bash\n", + "```lammps\n", "# 3d Lennard-Jones melt\n", "\n", "units lj\n", @@ -72,7 +86,7 @@ "\n", "thermo 50\n", "```\n", - "The equivalent can be written with `PyLammps`/`IPyLammps`:" + "The equivalent can be written in Python:" ] }, { @@ -83,35 +97,33 @@ "source": [ "# 3d Lennard-Jones melt\n", "\n", - "L.units(\"lj\")\n", - "L.atom_style(\"atomic\")\n", + "L.cmd.units(\"lj\")\n", + "L.cmd.atom_style(\"atomic\")\n", "\n", - "L.lattice(\"fcc\", 0.8442)\n", - "L.region(\"box\", \"block\", 0, 4, 0, 4, 0, 4)\n", - "L.create_box(1, \"box\")\n", - "L.create_atoms(1, \"box\")\n", - "L.mass(1, 1.0)\n", + "L.cmd.lattice(\"fcc\", 0.8442)\n", + "L.cmd.region(\"box\", \"block\", 0, 4, 0, 4, 0, 4)\n", + "L.cmd.create_box(1, \"box\")\n", + "L.cmd.create_atoms(1, \"box\")\n", + "L.cmd.mass(1, 1.0)\n", "\n", - "L.velocity(\"all\", \"create\", 1.44, 87287, \"loop geom\")\n", + "L.cmd.velocity(\"all\", \"create\", 1.44, 87287, \"loop geom\")\n", "\n", - "L.pair_style(\"lj/cut\", 2.5)\n", - "L.pair_coeff(1, 1, 1.0, 1.0, 2.5)\n", + "L.cmd.pair_style(\"lj/cut\", 2.5)\n", + "L.cmd.pair_coeff(1, 1, 1.0, 1.0, 2.5)\n", "\n", - "L.neighbor(0.3, \"bin\")\n", - "L.neigh_modify(\"delay\", 0, \"every\", 20, \"check no\")\n", + "L.cmd.neighbor(0.3, \"bin\")\n", + "L.cmd.neigh_modify(\"delay\", 0, \"every\", 20, \"check no\")\n", "\n", - "L.fix(\"1\", \"all\", \"nve\")\n", + "L.cmd.fix(\"1\", \"all\", \"nve\")\n", "\n", - "L.thermo(50)" + "L.cmd.thermo(50)" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ - "## Visualizing the initial state\n", - "\n", - "`IPyLammps` allows you to visualize the current simulation state with the [image](https://docs.lammps.org/Python_module.html#lammps.IPyLammps.image) command. Here we use it to create an image of the initial state of the system." + "Some LAMMPS commands will produce output that will be visible in the notebook. However, due to buffering, it might not be shown right away. Use the `flush_buffers` method to see all the output that has been written so far." ] }, { @@ -120,7 +132,109 @@ "metadata": {}, "outputs": [], "source": [ - "L.image(zoom=1.0)" + "L.flush_buffers()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An alternative to this is to enable auto flushing after each command by setting `cmd.auto_flush` to `True`. Each command will then call `flush_buffers()` automatically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "L.cmd.auto_flush = True\n", + "L.cmd.info(\"system\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In many cases the LAMMPS output will become excessive, which is why you may want to suppress it. For this purpose we provide a IPython extension in the `lammps.ipython` package. To load the extension, add a code cell with the following content:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext lammps.ipython" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the extension is loaded you have access to the `%%capture_lammps_output` magic. 
In its simplest form it can be used to suppress LAMMPS output:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture_lammps_output\n", + "L.cmd.info(\"system\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also use the same `%%capture_lammps_output` magic to store the output in a variable by providing a variable name:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture_lammps_output out\n", + "L.cmd.info(\"system\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case we are storing the output in a `out` variable. Note the output is only available after the cell has been executed, not within the same cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(out)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizing the initial state\n", + "\n", + "The `lammps` class also has an `ipython` attribute which provides some basic visualization capabilities in IPython Jupyter notebooks. E.g., you can visualize the current simulation state with the [image](https://docs.lammps.org/Python_module.html#lammps.ipython_wrapper.image) command. Here we use it to create an image of the initial state of the system." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "L.ipython.image()" ] }, { @@ -129,7 +243,7 @@ "source": [ "## Running simulations\n", "\n", - "Use the `run` command to start the simulation. In Jupyter the return value of the last command will be displayed. The `run` command will return the output of the simulation." + "Use the `run` command to start the simulation. It will print the output of the simulation."
] }, { @@ -138,23 +252,7 @@ "metadata": {}, "outputs": [], "source": [ - "L.run(150)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can suppress it by adding a semicolon `;`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.run(100);" + "L.cmd.run(250)" ] }, { @@ -170,127 +268,23 @@ "metadata": {}, "outputs": [], "source": [ - "L.image(zoom=1.0)" + "L.ipython.image(zoom=1.0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Post-processing thermo output\n", + "## Conclusion\n", + "This covered the basics of creating an instance of LAMMPS from Python, passing commands to LAMMPS and potentially suppressing or capturing its output, and visualizing the system. In the [following tutorial](thermo.ipynb) we will look at how to process thermodynamic output from LAMMPS.\n", "\n", - "Independent of whether or not you suppress or show the output of the `run` command, `PyLammps` will record the output. Each `run` command creates a new entry in the `L.runs` list. So far our PyLammps instance `L` executed two `run` commands:" + "
Next" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(L.runs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Each entry contains information about the simulation run, including the thermo output for the printed out time steps.\n", - "\n", - "```bash\n", - "# thermo output of a LAMMPS simulation run\n", - "Step Temp E_pair E_mol TotEng Press\n", - " 0 1.44 -6.7733681 0 -4.6218056 -5.0244179\n", - " 50 0.70303849 -5.6796164 0 -4.629178 0.50453907\n", - " 100 0.72628044 -5.7150774 0 -4.6299123 0.29765862\n", - " 150 0.78441711 -5.805142 0 -4.6331125 -0.086709661\n", - "```\n", - "\n", - "`PyLammps` already parses this information and makes it available as dictionaries and arrays." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.runs[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.runs[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For example, the first run was 150 time steps, with printing out a line every 50 steps. You can access the list of time steps using `{entry}.thermo.Step`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.runs[0].thermo.Step" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The corresponding values of each thermo quantity are also accessed this way:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.runs[0].thermo.TotEng" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Together you can use this information to run post-processing on these values or even plot it using `matplotlib`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "\n", - "plt.xlabel('time step')\n", - "plt.ylabel('Total Energy')\n", - "plt.plot(L.runs[0].thermo.Step, L.runs[0].thermo.TotEng)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -304,9 +298,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.6" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/python/examples/ipython/thermo.ipynb b/python/examples/ipython/thermo.ipynb new file mode 100644 index 0000000000..ea465f5f79 --- /dev/null +++ b/python/examples/ipython/thermo.ipynb @@ -0,0 +1,305 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example 2: Analyzing LAMMPS thermodynamic data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Author: [Richard Berger](mailto:richard.berger@outlook.com)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial assumes you've completed 
the [first example](simple.ipynb) and understand the basics of running LAMMPS through Python. In this tutorial we will build on top of that example and look at how to extract thermodynamic data produced by LAMMPS into Python and visualize it. Let's first start by recreating our simple melt example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext lammps.ipython" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lammps import lammps\n", + "L = lammps()\n", + "L.cmd.auto_flush = True\n", + "\n", + "def init_melt_system(L):\n", + " # 3d Lennard-Jones melt\n", + " L.cmd.clear()\n", + " L.cmd.units(\"lj\")\n", + " L.cmd.atom_style(\"atomic\")\n", + " \n", + " L.cmd.lattice(\"fcc\", 0.8442)\n", + " L.cmd.region(\"box\", \"block\", 0, 4, 0, 4, 0, 4)\n", + " L.cmd.create_box(1, \"box\")\n", + " L.cmd.create_atoms(1, \"box\")\n", + " L.cmd.mass(1, 1.0)\n", + " \n", + " L.cmd.velocity(\"all\", \"create\", 1.44, 87287, \"loop geom\")\n", + " \n", + " L.cmd.pair_style(\"lj/cut\", 2.5)\n", + " L.cmd.pair_coeff(1, 1, 1.0, 1.0, 2.5)\n", + " \n", + " L.cmd.neighbor(0.3, \"bin\")\n", + " L.cmd.neigh_modify(\"delay\", 0, \"every\", 20, \"check no\")\n", + " \n", + " L.cmd.fix(\"1\", \"all\", \"nve\")\n", + " \n", + " L.cmd.thermo(50)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we take advantage of the fact that we can write regular Python functions to organize our LAMMPS simulation. This allows us to clear and initialize a new system by calling the `init_melt_system()` function. With this we can now go ahead and run this simulation for 100 steps."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "init_melt_system(L)\n", + "L.cmd.run(100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Extracting thermodynamic data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the above output we see that LAMMPS prints out thermodynamic data for steps 0, 50 and 100.\n", + "\n", + "```\n", + " Step Temp E_pair E_mol TotEng Press \n", + " 0 1.44 -6.7733681 0 -4.6218056 -5.0244179 \n", + " 50 0.70303849 -5.6796164 0 -4.629178 0.50453907 \n", + " 100 0.72628044 -5.7150774 0 -4.6299123 0.29765862\n", + "```\n", + "\n", + "We could parse the text output and extract the necessary information, but this has proven to be error-prone and clunky, especially in cases where other output gets interleaved with thermo output lines. Instead, we can make use of the Python integration within LAMMPS to execute arbitrary Python code during time steps using `fix python/invoke`. We can extract the thermodynamic data directly using the LAMMPS Python interface and process it in any way we want.\n", + "\n", + "For this we first define the data structure we want to use to store the data. For each column of the thermodynamic data we want to store a list of values for each time step. Let's use a Python `dict` with the following structure:\n", + "\n", + "```python\n", + "{'Step': [0, 50, 100, ...], 'Temp': [...], 'E_pair': [...], 'E_mol': [...], 'TotEng': [...], 'Press': [...]}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To start, let's define an empty `dict` and call it `current_run`. 
As the simulation progresses, we append new data into this dictionary:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_run = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's define a function that should be executed every time step a thermodynamic output line would be written. This function takes a `lammps` class instance and through it can access LAMMPS state and data. We can use the [`last_thermo()`](https://docs.lammps.org/Python_module.html#lammps.lammps.last_thermo) function of the `lammps` class to get the latest thermodynamic data as a dictionary. This data is all we need to populate our `current_run` data structure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def append_thermo_data(lmp):\n", + " for k, v in lmp.last_thermo().items():\n", + " current_run.setdefault(k, []).append(v)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With these two pieces in place, it is now time to tell LAMMPS about how we want to call this function.\n", + "\n", + "First, let's suppress any LAMMPS output via `%%capture_lammps_output` and reinitialize our system with `init_melt_system()` so our system is back in its initial state and the time step is back to 0.\n", + "\n", + "Next, we add a new fix `python/invoke` that should execute every 50 time steps, the same as our `thermo 50` command above. At the end of every 50 time steps (including the first one), it should call the `append_thermo_data` function we just defined. Notice we can just pass the function as parameter. Finally, we tell LAMMPS to run for 250 steps." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture_lammps_output\n", + "init_melt_system(L)\n", + "L.cmd.fix(\"myfix\", \"all\", \"python/invoke\", 50, \"end_of_step\", append_thermo_data)\n", + "L.cmd.run(250)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's inspect our `current_run` dictionary after the run has completed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_run" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, the time steps 0, 50, 100, 150, and 200 were added to the dictionary. However, the last time step 250 is still missing. For this we need to manually add a final call to our `append_thermo_data()` helper function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "append_thermo_data(L)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With this our `current_run` dictionary now has all the data of the completed run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_run" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting thermodynamic data with matplotlib\n", + "\n", + "Now that we have our data available as Python variables, we can easily use other libraries for visualization."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.xlabel('time step')\n", + "plt.ylabel('Total Energy')\n", + "plt.plot(current_run['Step'], current_run['TotEng'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Pandas library\n", + "\n", + "Since we can call any Python code from LAMMPS, the above example can also be rewritten using the Pandas library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture_lammps_output\n", + "import pandas as pd\n", + "\n", + "current_run = pd.DataFrame()\n", + "\n", + "def append_thermo_data(lmp):\n", + " global current_run\n", + " current_time_step = pd.DataFrame.from_records([lmp.last_thermo()])\n", + " current_run = pd.concat([current_run, current_time_step], ignore_index=True)\n", + "\n", + "init_melt_system(L)\n", + "L.cmd.fix(\"myfix\", \"all\", \"python/invoke\", 50, \"end_of_step\", append_thermo_data)\n", + "L.cmd.run(250)\n", + "append_thermo_data(L)\n", + "current_run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_run.plot(x='Step', y='TotEng')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/examples/pylammps/interface_usage.ipynb b/python/examples/pylammps/interface_usage.ipynb deleted file mode 100644 index 18902caec9..0000000000 --- a/python/examples/pylammps/interface_usage.ipynb +++ 
/dev/null @@ -1,546 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Example 2: Using the PyLammps interface" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "\n", - "Before running this example, make sure your Python environment can find the LAMMPS shared library (`liblammps.so`) and the LAMMPS Python package is installed. If you followed the [README](README.md) in this folder, this should already be the case. You can also find more information about how to compile LAMMPS and install the LAMMPS Python package in the [LAMMPS manual](https://docs.lammps.org/Python_install.html). There is also a dedicated [PyLammps HowTo](https://docs.lammps.org/Howto_pylammps.html)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup system" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from lammps import IPyLammps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L = IPyLammps()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 3d Lennard-Jones melt\n", - "L.units(\"lj\")\n", - "L.atom_style(\"atomic\")\n", - "L.atom_modify(\"map array\")\n", - "\n", - "L.lattice(\"fcc\", 0.8442)\n", - "L.region(\"box block\", 0, 4, 0, 4, 0, 4)\n", - "L.create_box(1, \"box\")\n", - "L.create_atoms(1, \"box\")\n", - "L.mass(1, 1.0)\n", - "\n", - "L.velocity(\"all create\", 1.44, 87287, \"loop geom\")\n", - "\n", - "L.pair_style(\"lj/cut\", 2.5)\n", - "L.pair_coeff(1, 1, 1.0, 1.0, 2.5)\n", - "\n", - "L.neighbor(0.3, \"bin\")\n", - "L.neigh_modify(\"delay 0 every 20 check no\")\n", - "\n", - "L.fix(\"1 all nve\")\n", - "\n", - "L.variable(\"fx atom fx\")\n", - "\n", - "L.run(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Visualize the initial 
state" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.image(zoom=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Queries about LAMMPS simulation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.system" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.system.natoms" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.communication" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.fixes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.computes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.dumps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.groups" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Working with LAMMPS Variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variable(\"a index 2\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variable(\"t equal temp\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if sys.version_info < (3, 0):\n", - " # In Python 2 'print' is a 
restricted keyword, which is why you have to use the lmp_print function instead.\n", - " x = float(L.lmp_print('\"${a}\"'))\n", - "else:\n", - " # In Python 3 the print function can be redefined.\n", - " # x = float(L.print('\"${a}\"')\")\n", - " \n", - " # To avoid a syntax error in Python 2 executions of this notebook, this line is packed into an eval statement\n", - " x = float(eval(\"L.print('\\\"${a}\\\"')\"))\n", - "x" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables['t'].value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.eval(\"v_t/2.0\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variable(\"b index a b c\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables['b'].value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.eval(\"v_b\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables['b'].definition" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.lmp.command('variable i loop 10')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variable(\"i loop 10\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables['i'].value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.next(\"i\")\n", - "L.variables['i'].value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.eval(\"ke\")" - ] - }, - 
{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Accessing Atom data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.atoms[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dir(L.atoms[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.atoms[0].position" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.atoms[0].id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.atoms[0].velocity" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.atoms[0].force" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.atoms[0].type" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables['fx'].value" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Accessing thermo data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.runs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.runs[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.runs[0].thermo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.runs[0].thermo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dir(L.runs[0].thermo)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Saving 
session to as LAMMPS input file" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "PyLammps can keep track of all LAMMPS commands that are executed. This allows you to prototype a script and then later on save it as a regular input script:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L = IPyLammps()\n", - "\n", - "# enable command history\n", - "L.enable_cmd_history = True\n", - "\n", - "# 3d Lennard-Jones melt\n", - "L.units(\"lj\")\n", - "L.atom_style(\"atomic\")\n", - "L.atom_modify(\"map array\")\n", - "\n", - "L.lattice(\"fcc\", 0.8442)\n", - "L.region(\"box block\", 0, 4, 0, 4, 0, 4)\n", - "L.create_box(1, \"box\")\n", - "L.create_atoms(1, \"box\")\n", - "L.mass(1, 1.0)\n", - "\n", - "L.velocity(\"all create\", 1.44, 87287, \"loop geom\")\n", - "\n", - "L.pair_style(\"lj/cut\", 2.5)\n", - "L.pair_coeff(1, 1, 1.0, 1.0, 2.5)\n", - "\n", - "L.neighbor(0.3, \"bin\")\n", - "L.neigh_modify(\"delay 0 every 20 check no\")\n", - "\n", - "L.fix(\"1 all nve\")\n", - "\n", - "L.run(10)\n", - "\n", - "# write LAMMPS input script with all commands executed so far (including implicit ones)\n", - "L.write_script(\"in.output\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!cat in.output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.2" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/python/lammps/core.py b/python/lammps/core.py index 30df44f050..510385c175 100644 --- 
a/python/lammps/core.py +++ b/python/lammps/core.py @@ -59,6 +59,87 @@ class ExceptionCheck: # ------------------------------------------------------------------------- +class command_wrapper(object): + def __init__(self, lmp): + self.lmp = lmp + self.auto_flush = False + + def lmp_print(self, s): + """ needed for Python2 compatibility, since print is a reserved keyword """ + return self.__getattr__("print")(s) + + def __dir__(self): + return sorted(set(['angle_coeff', 'angle_style', 'atom_modify', 'atom_style', 'atom_style', + 'bond_coeff', 'bond_style', 'boundary', 'change_box', 'communicate', 'compute', + 'create_atoms', 'create_box', 'delete_atoms', 'delete_bonds', 'dielectric', + 'dihedral_coeff', 'dihedral_style', 'dimension', 'dump', 'fix', 'fix_modify', + 'group', 'improper_coeff', 'improper_style', 'include', 'kspace_modify', + 'kspace_style', 'lattice', 'mass', 'minimize', 'min_style', 'neighbor', + 'neigh_modify', 'newton', 'nthreads', 'pair_coeff', 'pair_modify', + 'pair_style', 'processors', 'read', 'read_data', 'read_restart', 'region', + 'replicate', 'reset_timestep', 'restart', 'run', 'run_style', 'thermo', + 'thermo_modify', 'thermo_style', 'timestep', 'undump', 'unfix', 'units', + 'variable', 'velocity', 'write_restart'] + self.lmp.available_styles("command"))) + + def _wrap_args(self, x): + if callable(x): + if sys.version_info < (3,): + raise Exception("Passing functions or lambdas directly as arguments is only supported in Python 3 or newer") + import hashlib + import __main__ + sha = hashlib.sha256() + sha.update(str(x).encode()) + func_name = f"_lmp_cb_{sha.hexdigest()}" + def handler(*args, **kwargs): + args = list(args) + args[0] = lammps(ptr=args[0]) + x(*args) + setattr(__main__, func_name, handler) + return func_name + return x + + def __getattr__(self, name): + """ + This method is where the Python 'magic' happens. If a method is not + defined by the class command_wrapper, it assumes it is a LAMMPS command. 
It takes + all the arguments, concatenates them to a single string, and executes it using + :py:meth:`lammps.command()`. + + Starting with Python 3.6 it also supports keyword arguments. key=value is + transformed into 'key value'. Note, since these have to come last in the + parameter list, only a subset of LAMMPS commands can be used with this + syntax. + + LAMMPS commands that accept callback functions (such as fix python/invoke) + can be passed functions and lambdas directly. The first argument of such + callbacks will be a lammps object constructed from the passed LAMMPS + pointer. + + :return: line or list of lines of output, None if no output + :rtype: list or string + """ + def handler(*args, **kwargs): + cmd_args = [name] + [str(self._wrap_args(x)) for x in args] + + if len(kwargs) > 0 and sys.version_info < (3,6): + raise Exception("Keyword arguments are only supported in Python 3.6 or newer") + + # Python 3.6+ maintains ordering of kwarg keys + for k in kwargs.keys(): + cmd_args.append(k) + if type(kwargs[k]) == bool: + cmd_args.append("true" if kwargs[k] else "false") + else: + cmd_args.append(str(self._wrap_args(kwargs[k]))) + + cmd = ' '.join(cmd_args) + self.lmp.command(cmd) + if self.auto_flush: + self.lmp.flush_buffers() + return handler + +# ------------------------------------------------------------------------- + class lammps(object): """Create an instance of the LAMMPS Python class.
@@ -103,6 +184,8 @@ class lammps(object): winpath = os.environ.get("LAMMPSDLLPATH") self.lib = None self.lmp = None + self._cmd = None + self._ipython = None # if a pointer to a LAMMPS object is handed in # when being called from a Python interpreter @@ -509,6 +592,61 @@ class lammps(object): # ------------------------------------------------------------------------- + @property + def cmd(self): + """ Return object that acts as LAMMPS command wrapper + + It provides alternative to :py:meth:`lammps.command` to call LAMMPS + commands as if they were regular Python functions and enables auto-complete + in interactive Python sessions. + + .. code-block:: python + + from lammps import lammps + + # melt example + L = lammps() + L.cmd.units("lj") + L.cmd.atom_style("atomic") + L.cmd.lattice("fcc", 0.8442) + L.cmd.region("box block", 0, 10, 0, 10, 0, 10) + L.cmd.create_box(1, "box") + L.cmd.create_atoms(1, "box") + L.cmd.mass(1, 1.0) + L.cmd.velocity("all create", 3.0, 87287, "loop geom") + L.cmd.pair_style("lj/cut", 2.5) + L.cmd.pair_coeff(1, 1, 1.0, 1.0, 2.5) + L.cmd.neighbor(0.3, "bin") + L.cmd.neigh_modify(every=20, delay=0, check=False) + L.cmd.fix(1, "all nve") + L.cmd.thermo(50) + L.cmd.run(250) + + :return: instance of command_wrapper object + :rtype: command_wrapper + """ + if not self._cmd: + self._cmd = command_wrapper(self) + return self._cmd + + # ------------------------------------------------------------------------- + + @property + def ipython(self): + """ Return object to access ipython extensions + + Adds commands for visualization in IPython and Jupyter Notebooks. + + :return: instance of ipython wrapper object + :rtype: ipython.wrapper + """ + if not self._ipython: + from .ipython import wrapper + self._ipython = wrapper(self) + return self._ipython + + # ------------------------------------------------------------------------- + def close(self): """Explicitly delete a LAMMPS instance through the C-library interface. 
diff --git a/python/lammps/ipython/__init__.py b/python/lammps/ipython/__init__.py new file mode 100644 index 0000000000..c07a5ff5e5 --- /dev/null +++ b/python/lammps/ipython/__init__.py @@ -0,0 +1,23 @@ +# ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# https://www.lammps.org/ Sandia National Laboratories +# LAMMPS Development team: developers@lammps.org +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- + +################################################################################ +# IPython/Jupyter Notebook additions +# Written by Richard Berger +################################################################################ + +from .wrapper import wrapper +from .magics import LammpsMagics + +def load_ipython_extension(ipython): + ipython.register_magics(LammpsMagics) diff --git a/python/lammps/ipython/magics.py b/python/lammps/ipython/magics.py new file mode 100644 index 0000000000..5ad2ae2d6f --- /dev/null +++ b/python/lammps/ipython/magics.py @@ -0,0 +1,75 @@ +# ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# https://www.lammps.org/ Sandia National Laboratories +# LAMMPS Development team: developers@lammps.org +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. 
+# ------------------------------------------------------------------------- + +################################################################################ +# IPython/Jupyter Notebook additions +# Written by Richard Berger +################################################################################ + +import io +import os +import sys +import tempfile +from IPython.core.magic import (Magics, magics_class, cell_magic) +import IPython.core.magic_arguments as magic_arguments + +class OutputCapture(object): + """ Utility class to capture LAMMPS library output """ + def __init__(self): + self.stdout_fd = 1 + self.captured_output = "" + + def __enter__(self): + self.tmpfile = tempfile.TemporaryFile(mode='w+b') + + sys.stdout.flush() + + # make copy of original stdout + self.stdout_orig = os.dup(self.stdout_fd) + + # replace stdout and redirect to temp file + os.dup2(self.tmpfile.fileno(), self.stdout_fd) + return self + + def __exit__(self, exc_type, exc_value, traceback): + os.dup2(self.stdout_orig, self.stdout_fd) + os.close(self.stdout_orig) + self.tmpfile.close() + + @property + def output(self): + sys.stdout.flush() + self.tmpfile.flush() + self.tmpfile.seek(0, io.SEEK_SET) + self.captured_output = self.tmpfile.read().decode('utf-8') + return self.captured_output + +# ------------------------------------------------------------------------- + +@magics_class +class LammpsMagics(Magics): + @magic_arguments.magic_arguments() + @magic_arguments.argument('output', type=str, default='', nargs='?', + help="""The name of the variable in which to store output. + + If unspecified, captured output is discarded. 
+ """ + ) + @cell_magic + def capture_lammps_output(self, line, cell): + """run the cell, capturing LAMMPS stdout and stderr.""" + args = magic_arguments.parse_argstring(self.capture_lammps_output, line) + with OutputCapture() as capture: + self.shell.run_cell(cell) + if args.output: + self.shell.user_ns[args.output] = capture.output diff --git a/python/lammps/ipython/wrapper.py b/python/lammps/ipython/wrapper.py new file mode 100644 index 0000000000..729c0d62bf --- /dev/null +++ b/python/lammps/ipython/wrapper.py @@ -0,0 +1,113 @@ +# ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# https://www.lammps.org/ Sandia National Laboratories +# LAMMPS Development team: developers@lammps.org +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- + +################################################################################ +# IPython/Jupyter Notebook additions +# Written by Richard Berger +################################################################################ + +class wrapper(object): + """lammps API IPython Wrapper + + This is a wrapper class that provides additional methods on top of an + existing :py:class:`lammps` instance. It provides additional methods + that allow creating and/or embedding visualizations created by native LAMMPS + commands. + + There is no need to explicitly instantiate this class. Each instance + of :py:class:`lammps` has a :py:attr:`ipython <lammps.ipython>` property + that returns an instance.
+ + :param lmp: instance of the :py:class:`lammps` class + :type lmp: lammps + """ + def __init__(self, lmp): + self.lmp = lmp + + def image(self, filename="snapshot.png", group="all", color="type", diameter="type", + size=None, view=None, center=None, up=None, zoom=1.0, background_color="white"): + """ Generate image using write_dump command and display it + + See :doc:`dump image ` for more information. + + :param filename: Name of the image file that should be generated. The extension determines whether it is PNG or JPEG + :type filename: string + :param group: the group of atoms write_image should use + :type group: string + :param color: name of property used to determine color + :type color: string + :param diameter: name of property used to determine atom diameter + :type diameter: string + :param size: dimensions of image + :type size: tuple (width, height) + :param view: view parameters + :type view: tuple (theta, phi) + :param center: center parameters + :type center: tuple (flag, center_x, center_y, center_z) + :param up: vector pointing to up direction + :type up: tuple (up_x, up_y, up_z) + :param zoom: zoom factor + :type zoom: float + :param background_color: background color of scene + :type background_color: string + + :return: Image instance used to display image in notebook + :rtype: :py:class:`IPython.core.display.Image` + """ + cmd_args = [group, "image", filename, color, diameter] + + if size is not None: + width = size[0] + height = size[1] + cmd_args += ["size", width, height] + + if view is not None: + theta = view[0] + phi = view[1] + cmd_args += ["view", theta, phi] + + if center is not None: + flag = center[0] + Cx = center[1] + Cy = center[2] + Cz = center[3] + cmd_args += ["center", flag, Cx, Cy, Cz] + + if up is not None: + Ux = up[0] + Uy = up[1] + Uz = up[2] + cmd_args += ["up", Ux, Uy, Uz] + + if zoom is not None: + cmd_args += ["zoom", zoom] + + cmd_args.append("modify backcolor " + background_color) + + 
self.lmp.cmd.write_dump(*cmd_args) + from IPython.core.display import Image + return Image(filename) + + def video(self, filename): + """ + Load video from file + + Can be used to visualize videos from :doc:`dump movie `. + + :param filename: Path to video file + :type filename: string + :return: HTML Video Tag used by notebook to embed a video + :rtype: :py:class:`IPython.display.HTML` + """ + from IPython.display import HTML + return HTML("") diff --git a/python/setup.py b/python/setup.py index cf4f1e7c4a..3bcab8faa1 100644 --- a/python/setup.py +++ b/python/setup.py @@ -26,7 +26,7 @@ class BinaryDistribution(Distribution): return True if version_info.major >= 3: - pkgs = ['lammps', 'lammps.mliap'] + pkgs = ['lammps', 'lammps.mliap', 'lammps.ipython'] else: pkgs = ['lammps'] From e45ef5adc03629bad2d07583f1e1fee49272022e Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Sun, 24 Nov 2024 00:03:09 -0700 Subject: [PATCH 028/161] unittest: add Python command_wrapper test --- unittest/python/CMakeLists.txt | 5 ++ unittest/python/python-cmdwrapper.py | 97 ++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 unittest/python/python-cmdwrapper.py diff --git a/unittest/python/CMakeLists.txt b/unittest/python/CMakeLists.txt index f3b851620c..881c18423d 100644 --- a/unittest/python/CMakeLists.txt +++ b/unittest/python/CMakeLists.txt @@ -105,6 +105,11 @@ if(Python_EXECUTABLE) set_tests_properties(PythonPyLammps PROPERTIES ENVIRONMENT "${PYTHON_TEST_ENVIRONMENT}") endif() + add_test(NAME PythonCmdWrapper + COMMAND ${PYTHON_TEST_RUNNER} ${CMAKE_CURRENT_SOURCE_DIR}/python-cmdwrapper.py -v + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + set_tests_properties(PythonCmdWrapper PROPERTIES ENVIRONMENT "${PYTHON_TEST_ENVIRONMENT}") + add_test(NAME PythonFormats COMMAND ${PYTHON_TEST_RUNNER} ${CMAKE_CURRENT_SOURCE_DIR}/python-formats.py -v WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) diff --git a/unittest/python/python-cmdwrapper.py 
b/unittest/python/python-cmdwrapper.py new file mode 100644 index 0000000000..50d2092e95 --- /dev/null +++ b/unittest/python/python-cmdwrapper.py @@ -0,0 +1,97 @@ +import os,unittest +from lammps import lammps + +try: + import numpy + NUMPY_INSTALLED = True +except ImportError: + NUMPY_INSTALLED = False + +@unittest.skipIf(not NUMPY_INSTALLED, "numpy is not available") +class PythonCmdWrapper(unittest.TestCase): + def setUp(self): + machine = None + if 'LAMMPS_MACHINE_NAME' in os.environ: + machine=os.environ['LAMMPS_MACHINE_NAME'] + self.lmp = lammps(name=machine, cmdargs=['-nocite', '-log','none', '-echo', 'screen']) + self.lmp.cmd.units("lj") + self.lmp.cmd.atom_style("atomic") + self.lmp.cmd.atom_modify("map array") + + if 'LAMMPS_CMAKE_CACHE' in os.environ: + self.cmake_cache = {} + + with open(os.environ['LAMMPS_CMAKE_CACHE'], 'r') as f: + for line in f: + line = line.strip() + if not line or line.startswith('#') or line.startswith('//'): continue + parts = line.split('=') + key, value_type = parts[0].split(':') + if len(parts) > 1: + value = parts[1] + if value_type == "BOOL": + value = (value.upper() == "ON") + else: + value = None + self.cmake_cache[key] = value + + def tearDown(self): + self.lmp.close() + del self.lmp + + def test_version(self): + self.assertGreaterEqual(self.lmp.version(), 20200824) + + def test_create_atoms(self): + self.lmp.cmd.region("box block", 0, 2, 0, 2, 0, 2) + self.lmp.cmd.create_box(1, "box") + + x = [ + 1.0, 1.0, 1.0, + 1.0, 1.0, 1.5 + ] + + types = [1, 1] + + self.assertEqual(self.lmp.create_atoms(2, id=None, type=types, x=x), 2) + self.assertEqual(self.lmp.extract_global("natoms"), 2) + pos = self.lmp.numpy.extract_atom("x") + self.assertEqual(pos.shape[0], 2) + numpy.testing.assert_array_equal(pos[0], tuple(x[0:3])) + numpy.testing.assert_array_equal(pos[1], tuple(x[3:6])) + + def test_thermo_capture(self): + self.lmp.cmd.lattice("fcc", 0.8442), + self.lmp.cmd.region("box block", 0, 4, 0, 4, 0, 4) + 
self.lmp.cmd.create_box(1, "box") + self.lmp.cmd.create_atoms(1, "box") + self.lmp.cmd.mass(1, 1.0) + self.lmp.cmd.velocity("all create", 1.44, 87287, "loop geom") + self.lmp.cmd.pair_style("lj/cut", 2.5) + self.lmp.cmd.pair_coeff(1, 1, 1.0, 1.0, 2.5) + self.lmp.cmd.neighbor(0.3, "bin") + self.lmp.cmd.neigh_modify("delay 0 every 20 check no") + self.lmp.cmd.fix("1 all nve") + + current_run = {} + + def append_thermo_data(lmp): + for k, v in lmp.last_thermo().items(): + current_run.setdefault(k, []).append(v) + + # thermo data is only captured during a run if PYTHON package is enabled + # without it, it will only capture the final thermo at completion + nvalues = 1 + if self.lmp.has_package("PYTHON"): + self.lmp.cmd.fix("myfix", "all", "python/invoke", 10, "end_of_step", append_thermo_data) + nvalues = 2 + + self.lmp.cmd.run(10) + append_thermo_data(self.lmp) + + for k in ('Step', 'Temp', 'E_pair', 'E_mol', 'TotEng', 'Press'): + self.assertIn(k, current_run) + self.assertEqual(len(current_run[k]), nvalues) + +if __name__ == "__main__": + unittest.main() From 754aa1c73f306729abe5866cf22a29d1732d197d Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Sun, 24 Nov 2024 01:36:01 -0700 Subject: [PATCH 029/161] python: doc and example updates --- doc/src/Howto.rst | 1 - doc/src/Howto_pylammps.rst | 2 +- doc/src/Python_atoms.rst | 16 ++ doc/src/Python_create.rst | 9 +- doc/src/Python_execute.rst | 28 +- doc/src/Python_head.rst | 1 + doc/src/Python_jupyter.rst | 48 ++++ doc/src/Python_module.rst | 4 +- doc/src/Python_objects.rst | 123 +++------ doc/src/Python_properties.rst | 135 ++-------- python/examples/ipython/atoms.ipynb | 253 +----------------- .../examples/ipython/dihedrals/dihedral.ipynb | 11 +- python/examples/ipython/index.ipynb | 8 +- python/examples/ipython/montecarlo/mc.ipynb | 11 +- python/examples/ipython/simple.ipynb | 2 +- python/examples/ipython/thermo.ipynb | 20 +- 16 files changed, 206 insertions(+), 466 deletions(-) create mode 100644 
doc/src/Python_jupyter.rst diff --git a/doc/src/Howto.rst b/doc/src/Howto.rst index 16620bf47a..5a63e2b1c4 100644 --- a/doc/src/Howto.rst +++ b/doc/src/Howto.rst @@ -104,6 +104,5 @@ Tutorials howto Howto_lammps_gui Howto_moltemplate Howto_pylammps - Howto_python Howto_wsl diff --git a/doc/src/Howto_pylammps.rst b/doc/src/Howto_pylammps.rst index a8371f1366..4a9b985bf6 100644 --- a/doc/src/Howto_pylammps.rst +++ b/doc/src/Howto_pylammps.rst @@ -3,4 +3,4 @@ PyLammps Tutorial The PyLammps interface is deprecated and will be removed in a future release of LAMMPS. As such, the PyLammps version of this tutorial has been removed and is -replaced by the :doc:`Howto_python`. +replaced by the :doc:`Python_head`. diff --git a/doc/src/Python_atoms.rst b/doc/src/Python_atoms.rst index 0a445f9b6b..2d07cc2326 100644 --- a/doc/src/Python_atoms.rst +++ b/doc/src/Python_atoms.rst @@ -26,14 +26,30 @@ against invalid accesses. lmp = lammps() lmp.file("in.sysinit") + + # Read/Write access via ctypes nlocal = lmp.extract_global("nlocal") x = lmp.extract_atom("x") for i in range(nlocal): print("(x,y,z) = (", x[i][0], x[i][1], x[i][2], ")") + # Read/Write access via NumPy arrays + atom_id = lmp.numpy.extract_atom("id") + atom_type = lmp.numpy.extract_atom("type") + x = lmp.numpy.extract_atom("x") + v = lmp.numpy.extract_atom("v") + f = lmp.numpy.extract_atom("f") + + # set position in 2D simulation + x[0] = (1.0, 0.0) + + # set position in 3D simulation + x[0] = (1.0, 0.0, 1.)
+ lmp.close() + **Methods**: * :py:meth:`extract_atom() `: extract a per-atom quantity diff --git a/doc/src/Python_create.rst b/doc/src/Python_create.rst index c1444c400e..9301829ea9 100644 --- a/doc/src/Python_create.rst +++ b/doc/src/Python_create.rst @@ -6,11 +6,10 @@ Creating or deleting a LAMMPS object ==================================== With the Python interface the creation of a :cpp:class:`LAMMPS -` instance is included in the constructors for the -:py:class:`lammps `, :py:class:`PyLammps `, -and :py:class:`IPyLammps ` classes. -Internally it will call either :cpp:func:`lammps_open` or :cpp:func:`lammps_open_no_mpi` from the C -library API to create the class instance. +` instance is included in the constructor for the +:py:class:`lammps ` class. Internally it will call either +:cpp:func:`lammps_open` or :cpp:func:`lammps_open_no_mpi` from the C library +API to create the class instance. All arguments are optional. The *name* argument allows loading a LAMMPS shared library that is named ``liblammps_machine.so`` instead of diff --git a/doc/src/Python_execute.rst b/doc/src/Python_execute.rst index 28c3ff5575..a9d65133db 100644 --- a/doc/src/Python_execute.rst +++ b/doc/src/Python_execute.rst @@ -26,7 +26,7 @@ demonstrates the use of :py:func:`lammps.file()`, :py:func:`lammps.command()`, lmp.command('variable zpos index 1.0') # create 10 groups with 10 atoms each - cmds = ["group g{} id {}:{}".format(i,10*i+1,10*(i+1)) for i in range(10)] + cmds = [f"group g{i} id {10*i+1}:{10*(i+1)}" for i in range(10)] lmp.commands_list(cmds) # run commands from a multi-line string @@ -38,10 +38,9 @@ demonstrates the use of :py:func:`lammps.file()`, :py:func:`lammps.command()`, """ lmp.commands_string(block) - -Unlike the lammps API, the PyLammps/IPyLammps APIs allow running LAMMPS -commands by calling equivalent member functions of :py:class:`PyLammps ` -and :py:class:`IPyLammps ` instances. 
+For convenience, the :py:class:`lammps ` class also provides a +command wrapper ``cmd`` that turns any LAMMPS command into a regular function +call. For instance, the following LAMMPS command @@ -49,8 +48,7 @@ For instance, the following LAMMPS command region box block 0 10 0 5 -0.5 0.5 -can be executed using with the lammps API with the following Python code if ``lmp`` is an -instance of :py:class:`lammps `: +would normally be executed with the following Python code: .. code-block:: python @@ -59,7 +57,7 @@ instance of :py:class:`lammps `: lmp = lammps() lmp.command("region box block 0 10 0 5 -0.5 0.5") -With the PyLammps interface, any LAMMPS command can be split up into arbitrary parts. +With the ``cmd`` wrapper, any LAMMPS command can be split up into arbitrary parts. These parts are then passed to a member function with the name of the :doc:`command `. For the :doc:`region ` command that means the :code:`region()` method can be called. The arguments of the command can be passed as one string, or @@ -82,25 +80,31 @@ member function takes the entire parameter list and transparently merges it to a string. The benefit of this approach is avoiding redundant command calls and easier -parameterization. In the lammps API parameterization needed to be done -manually by creating formatted command strings. +parameterization. With ``command``, ``commands_list``, and ``commands_string`` the +parameterization needs to be done manually by creating formatted command +strings. .. code-block:: python lmp.command("region box block %f %f %f %f %f %f" % (xlo, xhi, ylo, yhi, zlo, zhi)) -In contrast, methods of PyLammps accept parameters directly and will convert +In contrast, methods of the ``cmd`` wrapper accept parameters directly and will convert them automatically to a final command string. .. code-block:: python L.cmd.region("box block", xlo, xhi, ylo, yhi, zlo, zhi) +..
note:: + + When running in IPython you can use Tab-completion after ``L.cmd.`` to see + all available LAMMPS commands. + Using these facilities, the previous example shown above can be rewritten as follows: .. code-block:: python - from lammps import PyLammps + from lammps import lammps L = lammps() # read commands from file 'in.melt' diff --git a/doc/src/Python_head.rst b/doc/src/Python_head.rst index 3aab3a0d4b..28b6f3d1d4 100644 --- a/doc/src/Python_head.rst +++ b/doc/src/Python_head.rst @@ -15,6 +15,7 @@ together. Python_call Python_formats Python_examples + Python_jupyter Python_error Python_trouble diff --git a/doc/src/Python_jupyter.rst b/doc/src/Python_jupyter.rst new file mode 100644 index 0000000000..df24bf7506 --- /dev/null +++ b/doc/src/Python_jupyter.rst @@ -0,0 +1,48 @@ +Using LAMMPS in IPython notebooks and Jupyter +============================================= + +If the LAMMPS Python package is installed for the same Python interpreter as +`IPython `_, you can use LAMMPS directly inside of an IPython notebook inside of +Jupyter. `Jupyter `_ is a powerful integrated development environment (IDE) for +many dynamic languages like Python, Julia and others, which operates inside of +any web browser. Besides auto-completion and syntax highlighting it allows you +to create formatted documents using Markdown, mathematical formulas, graphics and +animations intermixed with executable Python code. It is a great format for +tutorials and showcasing your latest research. + +The easiest way to install it is via ``pip``: + +.. code-block:: bash + + pip install jupyter + +To launch an instance of Jupyter simply run the following command inside your +Python environment: + +.. code-block:: bash + + jupyter notebook + +.. _ipython: https://ipython.org/ +.. _jupyter: https://jupyter.org/ + +Interactive Python Examples +--------------------------- + +Examples of IPython notebooks can be found in the ``python/examples/ipython`` +subdirectory.
They require LAMMPS to be compiled as shared library with PYTHON, +PNG, JPEG and FFMPEG support. + +To open these notebooks launch ``jupyter notebook index.ipynb`` inside this +directory. The opened file provides an overview of the available examples. + +- Example 1: Using LAMMPS with Python (``simple.ipynb``) +- Example 2: Analyzing LAMMPS thermodynamic data (``thermo.ipynb``) +- Example 3: Working with Per-Atom Data (``atoms.ipynb``) +- Example 4: Working with LAMMPS variables (``variables.ipynb``) +- Example 5: Validating a dihedral potential (``dihedrals/dihedral.ipynb``) +- Example 6: Running a Monte Carlo relaxation (``montecarlo/mc.ipynb``) + +.. note:: + + Typically clicking a link in Jupyter will open a new tab, which might be blocked by your pop-up blocker. diff --git a/doc/src/Python_module.rst b/doc/src/Python_module.rst index 9c60982e1b..30e585d143 100644 --- a/doc/src/Python_module.rst +++ b/doc/src/Python_module.rst @@ -14,9 +14,7 @@ session with the ``import`` command. Alternative interfaces such as :py:class:`PyLammps ` and :py:class:`IPyLammps ` classes have been deprecated and - will be removed in a future version of LAMMPS. The :doc:`Howto_pylammps` has - also been replaced by a reworked :doc:`Howto_python` that showcases how to - use the modern Python API facilities instead. + will be removed in a future version of LAMMPS. .. _mpi4py_url: https://mpi4py.readthedocs.io diff --git a/doc/src/Python_objects.rst b/doc/src/Python_objects.rst index 6e3a329a27..c3002ec5e6 100644 --- a/doc/src/Python_objects.rst +++ b/doc/src/Python_objects.rst @@ -4,95 +4,52 @@ Compute, fixes, variables This section documents accessing or modifying data from objects like computes, fixes, or variables in LAMMPS using the :py:mod:`lammps` module. -.. tabs:: +For :py:meth:`lammps.extract_compute() ` and +:py:meth:`lammps.extract_fix() `, the global, per-atom, +or local data calculated by the compute or fix can be accessed. 
What is returned +depends on whether the compute or fix calculates a scalar or vector or array. +For a scalar, a single double value is returned. If the compute or fix calculates +a vector or array, a pointer to the internal LAMMPS data is returned, which you can +use via normal Python subscripting. - .. tab:: lammps API +The one exception is that for a fix that calculates a +global vector or array, a single double value from the vector or array +is returned, indexed by I (vector) or I and J (array). I,J are +zero-based indices. +See the :doc:`Howto output ` page for a discussion of +global, per-atom, and local data, and of scalar, vector, and array +data types. See the doc pages for individual :doc:`computes ` +and :doc:`fixes ` for a description of what they calculate and +store. - For :py:meth:`lammps.extract_compute() ` and - :py:meth:`lammps.extract_fix() `, the global, per-atom, - or local data calculated by the compute or fix can be accessed. What is returned - depends on whether the compute or fix calculates a scalar or vector or array. - For a scalar, a single double value is returned. If the compute or fix calculates - a vector or array, a pointer to the internal LAMMPS data is returned, which you can - use via normal Python subscripting. +For :py:meth:`lammps.extract_variable() `, +an :doc:`equal-style or atom-style variable ` is evaluated and +its result returned. - The one exception is that for a fix that calculates a - global vector or array, a single double value from the vector or array - is returned, indexed by I (vector) or I and J (array). I,J are - zero-based indices. - See the :doc:`Howto output ` page for a discussion of - global, per-atom, and local data, and of scalar, vector, and array - data types. See the doc pages for individual :doc:`computes ` - and :doc:`fixes ` for a description of what they calculate and - store. +For equal-style variables a single ``c_double`` value is returned and the +group argument is ignored. 
For atom-style variables, a vector of +``c_double`` is returned, one value per atom, which you can use via normal +Python subscripting. The values will be zero for atoms not in the +specified group. - For :py:meth:`lammps.extract_variable() `, - an :doc:`equal-style or atom-style variable ` is evaluated and - its result returned. +:py:meth:`lammps.numpy.extract_compute() `, +:py:meth:`lammps.numpy.extract_fix() `, and +:py:meth:`lammps.numpy.extract_variable() ` are +equivalent NumPy implementations that return NumPy arrays instead of ``ctypes`` pointers. - For equal-style variables a single ``c_double`` value is returned and the - group argument is ignored. For atom-style variables, a vector of - ``c_double`` is returned, one value per atom, which you can use via normal - Python subscripting. The values will be zero for atoms not in the - specified group. +The :py:meth:`lammps.set_variable() ` method sets an +existing string-style variable to a new string value, so that subsequent LAMMPS +commands can access the variable. - :py:meth:`lammps.numpy.extract_compute() `, - :py:meth:`lammps.numpy.extract_fix() `, and - :py:meth:`lammps.numpy.extract_variable() ` are - equivalent NumPy implementations that return NumPy arrays instead of ``ctypes`` pointers. +**Methods**: - The :py:meth:`lammps.set_variable() ` method sets an - existing string-style variable to a new string value, so that subsequent LAMMPS - commands can access the variable. 
+* :py:meth:`lammps.extract_compute() `: extract value(s) from a compute +* :py:meth:`lammps.extract_fix() `: extract value(s) from a fix +* :py:meth:`lammps.extract_variable() `: extract value(s) from a variable +* :py:meth:`lammps.set_variable() `: set existing named string-style variable to value - **Methods**: +**NumPy Methods**: - * :py:meth:`lammps.extract_compute() `: extract value(s) from a compute - * :py:meth:`lammps.extract_fix() `: extract value(s) from a fix - * :py:meth:`lammps.extract_variable() `: extract value(s) from a variable - * :py:meth:`lammps.set_variable() `: set existing named string-style variable to value - - **NumPy Methods**: - - * :py:meth:`lammps.numpy.extract_compute() `: extract value(s) from a compute, return arrays as numpy arrays - * :py:meth:`lammps.numpy.extract_fix() `: extract value(s) from a fix, return arrays as numpy arrays - * :py:meth:`lammps.numpy.extract_variable() `: extract value(s) from a variable, return arrays as numpy arrays - - - .. tab:: PyLammps/IPyLammps API - - PyLammps and IPyLammps classes currently do not add any additional ways of - retrieving information out of computes and fixes. This information can still be accessed by using the lammps API: - - .. code-block:: python - - L.lmp.extract_compute(...) - L.lmp.extract_fix(...) - # OR - L.lmp.numpy.extract_compute(...) - L.lmp.numpy.extract_fix(...) - - LAMMPS variables can be both defined and accessed via the :py:class:`PyLammps ` interface. - - To define a variable you can use the :doc:`variable ` command: - - .. code-block:: python - - L.variable("a index 2") - - A dictionary of all variables is returned by the :py:attr:`PyLammps.variables ` property: - - you can access an individual variable by retrieving a variable object from the - ``L.variables`` dictionary by name - - .. code-block:: python - - a = L.variables['a'] - - The variable value can then be easily read and written by accessing the value - property of this object. - - .. 
code-block:: python - - print(a.value) - a.value = 4 +* :py:meth:`lammps.numpy.extract_compute() `: extract value(s) from a compute, return arrays as numpy arrays +* :py:meth:`lammps.numpy.extract_fix() `: extract value(s) from a fix, return arrays as numpy arrays +* :py:meth:`lammps.numpy.extract_variable() `: extract value(s) from a variable, return arrays as numpy arrays diff --git a/doc/src/Python_properties.rst b/doc/src/Python_properties.rst index 031461660a..25576e90be 100644 --- a/doc/src/Python_properties.rst +++ b/doc/src/Python_properties.rst @@ -2,14 +2,8 @@ System properties ================= Similar to what is described in :doc:`Library_properties`, the instances of -:py:class:`lammps `, :py:class:`PyLammps `, or -:py:class:`IPyLammps ` can be used to extract different kinds -of information about the active LAMMPS instance and also to modify some of it. The -main difference between the interfaces is how the information is exposed. - -While the :py:class:`lammps ` is just a thin layer that wraps C API calls, -:py:class:`PyLammps ` and :py:class:`IPyLammps ` expose -information as objects and properties. +:py:class:`lammps ` can be used to extract different kinds +of information about the active LAMMPS instance and also to modify some of it. In some cases the data returned is a direct reference to the original data inside LAMMPS cast to ``ctypes`` pointers. Where possible, the wrappers will @@ -25,113 +19,38 @@ against invalid accesses. accordingly. These arrays can change sizes and order at every neighbor list rebuild and atom sort event as atoms are migrating between subdomains. -.. tabs:: +.. code-block:: python - .. tab:: lammps API + from lammps import lammps - .. 
code-block:: python + lmp = lammps() + lmp.file("in.sysinit") - from lammps import lammps + natoms = lmp.get_natoms() + print(f"running simulation with {natoms} atoms") - lmp = lammps() - lmp.file("in.sysinit") + lmp.command("run 1000 post no"); - natoms = lmp.get_natoms() - print(f"running simulation with {natoms} atoms") + for i in range(10): + lmp.command("run 100 pre no post no") + pe = lmp.get_thermo("pe") + ke = lmp.get_thermo("ke") + print(f"PE = {pe}\nKE = {ke}") - lmp.command("run 1000 post no"); + lmp.close() - for i in range(10): - lmp.command("run 100 pre no post no") - pe = lmp.get_thermo("pe") - ke = lmp.get_thermo("ke") - print(f"PE = {pe}\nKE = {ke}") +**Methods**: - lmp.close() +* :py:meth:`version() `: return the numerical version id, e.g. LAMMPS 2 Sep 2015 -> 20150902 +* :py:meth:`get_thermo() `: return current value of a thermo keyword +* :py:meth:`last_thermo() `: return a dictionary of the last thermodynamic output +* :py:meth:`get_natoms() `: total # of atoms as int +* :py:meth:`reset_box() `: reset the simulation box size +* :py:meth:`extract_setting() `: return a global setting +* :py:meth:`extract_global() `: extract a global quantity +* :py:meth:`extract_box() `: extract box info +* :py:meth:`create_atoms() `: create N atoms with IDs, types, x, v, and image flags - **Methods**: +**Properties**: - * :py:meth:`version() `: return the numerical version id, e.g. 
LAMMPS 2 Sep 2015 -> 20150902 - * :py:meth:`get_thermo() `: return current value of a thermo keyword - * :py:meth:`last_thermo() `: return a dictionary of the last thermodynamic output - * :py:meth:`get_natoms() `: total # of atoms as int - * :py:meth:`reset_box() `: reset the simulation box size - * :py:meth:`extract_setting() `: return a global setting - * :py:meth:`extract_global() `: extract a global quantity - * :py:meth:`extract_box() `: extract box info - * :py:meth:`create_atoms() `: create N atoms with IDs, types, x, v, and image flags - - **Properties**: - - * :py:attr:`last_thermo_step `: the last timestep thermodynamic output was computed - - .. tab:: PyLammps/IPyLammps API - - In addition to the functions provided by :py:class:`lammps `, :py:class:`PyLammps ` objects - have several properties which allow you to query the system state: - - L.system - Is a dictionary describing the system such as the bounding box or number of atoms - - L.system.xlo, L.system.xhi - bounding box limits along x-axis - - L.system.ylo, L.system.yhi - bounding box limits along y-axis - - L.system.zlo, L.system.zhi - bounding box limits along z-axis - - L.communication - configuration of communication subsystem, such as the number of threads or processors - - L.communication.nthreads - number of threads used by each LAMMPS process - - L.communication.nprocs - number of MPI processes used by LAMMPS - - L.fixes - List of fixes in the current system - - L.computes - List of active computes in the current system - - L.dump - List of active dumps in the current system - - L.groups - List of groups present in the current system - - **Retrieving the value of an arbitrary LAMMPS expressions** - - LAMMPS expressions can be immediately evaluated by using the ``eval`` method. The - passed string parameter can be any expression containing global :doc:`thermo` values, - variables, compute or fix data (see :doc:`Howto_output`): - - - .. 
code-block:: python - - result = L.eval("ke") # kinetic energy - result = L.eval("pe") # potential energy - - result = L.eval("v_t/2.0") - - **Example** - - .. code-block:: python - - from lammps import PyLammps - - L = PyLammps() - L.file("in.sysinit") - - print(f"running simulation with {L.system.natoms} atoms") - - L.run(1000, "post no"); - - for i in range(10): - L.run(100, "pre no post no") - pe = L.eval("pe") - ke = L.eval("ke") - print(f"PE = {pe}\nKE = {ke}") +* :py:attr:`last_thermo_step `: the last timestep thermodynamic output was computed diff --git a/python/examples/ipython/atoms.ipynb b/python/examples/ipython/atoms.ipynb index 14b60d4e28..a18d6addaa 100644 --- a/python/examples/ipython/atoms.ipynb +++ b/python/examples/ipython/atoms.ipynb @@ -4,7 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Example 3: Example 3: Using Atom Data" + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example 3: Working with Per-Atom Data" ] }, { @@ -133,248 +140,6 @@ "L.ipython.image(zoom=1.8)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Queries about LAMMPS simulation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.system" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.system.natoms" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.system.nbonds" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.system.nbondtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.communication" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.fixes" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.computes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.dumps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.groups" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Working with LAMMPS Variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variable(\"a index 2\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variable(\"t equal temp\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if sys.version_info < (3, 0):\n", - " # In Python 2 'print' is a restricted keyword, which is why you have to use the lmp_print function instead.\n", - " x = float(L.lmp_print('\"${a}\"'))\n", - "else:\n", - " # In Python 3 the print function can be redefined.\n", - " # x = float(L.print('\"${a}\"')\")\n", - " \n", - " # To avoid a syntax error in Python 2 executions of this notebook, this line is packed into an eval statement\n", - " x = float(eval(\"L.print('\\\"${a}\\\"')\"))\n", - "x" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables['t'].value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.eval(\"v_t/2.0\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "L.variable(\"b index a b c\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables['b'].value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.eval(\"v_b\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables['b'].definition" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variable(\"i loop 10\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.variables['i'].value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.next(\"i\")\n", - "L.variables['i'].value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "L.expand(\"ke\")" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -451,7 +216,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/python/examples/ipython/dihedrals/dihedral.ipynb b/python/examples/ipython/dihedrals/dihedral.ipynb index 4fece8aa58..ed8faeaa86 100644 --- a/python/examples/ipython/dihedrals/dihedral.ipynb +++ b/python/examples/ipython/dihedrals/dihedral.ipynb @@ -4,7 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Validating a dihedral potential" + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example 4: Validating a dihedral potential" ] }, { @@ -232,7 +239,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/python/examples/ipython/index.ipynb b/python/examples/ipython/index.ipynb 
index 9fdac385fd..47d1ec63ce 100644 --- a/python/examples/ipython/index.ipynb +++ b/python/examples/ipython/index.ipynb @@ -5,7 +5,7 @@ "id": "666d3036-47d5-44d2-bc1a-ca4b00a9e9b8", "metadata": {}, "source": [ - "# LAMMPS IPython Tutorial" + "# LAMMPS Python Tutorials" ] }, { @@ -25,7 +25,9 @@ "\n", "- [Example 1: Using LAMMPS with Python](simple.ipynb)\n", "- [Example 2: Analyzing LAMMPS thermodynamic data](thermo.ipynb)\n", - "- [Example 3: Using Atom Data](atom.ipynb)" + "- [Example 3: Using Atom Data](atoms.ipynb)\n", + "- [Example 4: Validating a dihedral potential](dihedrals/dihedral.ipynb)\n", + "- [Example 5: Running a Monte Carlo relaxation](montecarlo/mc.ipynb)" ] }, { @@ -53,7 +55,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/python/examples/ipython/montecarlo/mc.ipynb b/python/examples/ipython/montecarlo/mc.ipynb index b1cfa488eb..d6c1a03e54 100644 --- a/python/examples/ipython/montecarlo/mc.ipynb +++ b/python/examples/ipython/montecarlo/mc.ipynb @@ -4,7 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Monte Carlo Relaxation" + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example 5: Monte Carlo Relaxation" ] }, { @@ -343,7 +350,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/python/examples/ipython/simple.ipynb b/python/examples/ipython/simple.ipynb index d45c56db18..77b0844a59 100644 --- a/python/examples/ipython/simple.ipynb +++ b/python/examples/ipython/simple.ipynb @@ -298,7 +298,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/python/examples/ipython/thermo.ipynb b/python/examples/ipython/thermo.ipynb index ea465f5f79..02a7c49e8c 100644 --- 
a/python/examples/ipython/thermo.ipynb +++ b/python/examples/ipython/thermo.ipynb @@ -279,6 +279,24 @@ "source": [ "current_run.plot(x='Step', y='TotEng')" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "The Python interface gives you a powerful way of invoking and extracting simulation data while the simulation is running. Next we'll look at how to extract information about the atoms in your system.\n", + "\n", + "
Next" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -297,7 +315,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.12.7" } }, "nbformat": 4, From 7f68aeb6d52ba756e6d1b3cd6857e17012cab32e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 13 Dec 2024 09:23:03 -0700 Subject: [PATCH 030/161] Update Kokkos library in LAMMPS to v4.5.0 --- lib/kokkos/CHANGELOG.md | 101 +- lib/kokkos/CMakeLists.txt | 397 ++- lib/kokkos/CONTRIBUTING.md | 2 + lib/kokkos/HOW_TO_SNAPSHOT | 73 - lib/kokkos/Makefile.kokkos | 280 ++- lib/kokkos/Makefile.targets | 16 +- lib/kokkos/README.md | 6 +- lib/kokkos/algorithms/CMakeLists.txt | 12 +- lib/kokkos/algorithms/src/CMakeLists.txt | 33 +- lib/kokkos/algorithms/src/Kokkos_Random.hpp | 11 +- .../src/sorting/Kokkos_BinOpsPublicAPI.hpp | 22 +- .../src/sorting/Kokkos_BinSortPublicAPI.hpp | 3 +- .../src/sorting/Kokkos_SortPublicAPI.hpp | 20 +- .../src/sorting/impl/Kokkos_SortByKeyImpl.hpp | 22 +- .../src/sorting/impl/Kokkos_SortImpl.hpp | 44 +- .../src/std_algorithms/Kokkos_Reduce.hpp | 24 +- .../std_algorithms/Kokkos_TransformReduce.hpp | 24 +- .../impl/Kokkos_Constraints.hpp | 21 +- .../impl/Kokkos_MoveBackward.hpp | 2 +- .../impl/Kokkos_RandomAccessIterator.hpp | 20 +- .../std_algorithms/impl/Kokkos_Reverse.hpp | 2 +- .../impl/Kokkos_ReverseCopy.hpp | 2 +- .../algorithms/unit_tests/CMakeLists.txt | 400 ++- .../algorithms/unit_tests/TestBinSortA.hpp | 49 +- .../algorithms/unit_tests/TestBinSortB.hpp | 4 + .../algorithms/unit_tests/TestNestedSort.hpp | 10 + .../algorithms/unit_tests/TestRandom.hpp | 11 +- .../unit_tests/TestRandomAccessIterator.cpp | 33 +- lib/kokkos/algorithms/unit_tests/TestSort.hpp | 15 +- .../algorithms/unit_tests/TestSortByKey.hpp | 20 +- .../TestStdAlgorithmsAdjacentDifference.cpp | 2 +- .../TestStdAlgorithmsAdjacentFind.cpp | 4 +- 
.../unit_tests/TestStdAlgorithmsCommon.hpp | 28 +- .../TestStdAlgorithmsConstraints.cpp | 19 +- .../unit_tests/TestStdAlgorithmsCopyIf.cpp | 8 +- .../TestStdAlgorithmsExclusiveScan.cpp | 2 +- .../unit_tests/TestStdAlgorithmsForEach.cpp | 2 - .../TestStdAlgorithmsHelperFunctors.hpp | 2 +- .../TestStdAlgorithmsInclusiveScan.cpp | 2 +- .../unit_tests/TestStdAlgorithmsIsSorted.cpp | 7 +- .../TestStdAlgorithmsIsSortedUntil.cpp | 5 +- .../unit_tests/TestStdAlgorithmsMismatch.cpp | 2 +- .../unit_tests/TestStdAlgorithmsModOps.cpp | 2 +- .../unit_tests/TestStdAlgorithmsModSeqOps.cpp | 2 +- .../TestStdAlgorithmsMoveBackward.cpp | 2 +- .../TestStdAlgorithmsPartitionCopy.cpp | 6 +- .../unit_tests/TestStdAlgorithmsRemove.cpp | 4 +- .../TestStdAlgorithmsRemoveCopy.cpp | 2 +- .../TestStdAlgorithmsRemoveCopyIf.cpp | 2 +- .../unit_tests/TestStdAlgorithmsRemoveIf.cpp | 6 +- .../unit_tests/TestStdAlgorithmsReplace.cpp | 4 +- .../TestStdAlgorithmsReplaceCopy.cpp | 4 +- .../TestStdAlgorithmsReplaceCopyIf.cpp | 4 +- .../unit_tests/TestStdAlgorithmsReplaceIf.cpp | 2 +- .../unit_tests/TestStdAlgorithmsReverse.cpp | 2 +- .../unit_tests/TestStdAlgorithmsRotate.cpp | 2 +- .../TestStdAlgorithmsRotateCopy.cpp | 4 +- .../unit_tests/TestStdAlgorithmsSearch.cpp | 2 +- .../unit_tests/TestStdAlgorithmsSearch_n.cpp | 4 +- .../unit_tests/TestStdAlgorithmsShiftLeft.cpp | 2 +- .../TestStdAlgorithmsShiftRight.cpp | 4 +- ...estStdAlgorithmsTeamAdjacentDifference.cpp | 8 +- .../unit_tests/TestStdAlgorithmsTeamCopy.cpp | 2 +- .../TestStdAlgorithmsTeamCopyIf.cpp | 2 +- .../TestStdAlgorithmsTeamCopy_n.cpp | 2 +- .../unit_tests/TestStdAlgorithmsTeamCount.cpp | 2 +- .../TestStdAlgorithmsTeamExclusiveScan.cpp | 4 +- .../unit_tests/TestStdAlgorithmsTeamFind.cpp | 2 +- .../TestStdAlgorithmsTeamFindEnd.cpp | 8 +- .../TestStdAlgorithmsTeamFindIf.cpp | 2 +- .../TestStdAlgorithmsTeamFindIfNot.cpp | 2 +- .../TestStdAlgorithmsTeamGenerate_n.cpp | 2 +- .../TestStdAlgorithmsTeamIsSorted.cpp | 2 +- 
.../TestStdAlgorithmsTeamIsSortedUntil.cpp | 10 +- .../TestStdAlgorithmsTeamMaxElement.cpp | 4 +- .../TestStdAlgorithmsTeamMinElement.cpp | 4 +- .../TestStdAlgorithmsTeamMinMaxElement.cpp | 4 +- .../unit_tests/TestStdAlgorithmsTeamMove.cpp | 2 +- .../TestStdAlgorithmsTeamRemove.cpp | 2 +- .../TestStdAlgorithmsTeamRemoveCopy.cpp | 4 +- .../TestStdAlgorithmsTeamRemoveCopyIf.cpp | 4 +- .../TestStdAlgorithmsTeamReplaceCopy.cpp | 4 +- .../TestStdAlgorithmsTeamReplaceCopyIf.cpp | 4 +- .../TestStdAlgorithmsTeamRotateCopy.cpp | 2 +- .../TestStdAlgorithmsTeamShiftRight.cpp | 2 +- .../TestStdAlgorithmsTeamSwapRanges.cpp | 2 +- ...tdAlgorithmsTeamTransformInclusiveScan.cpp | 4 +- .../TestStdAlgorithmsTeamUnique.cpp | 4 +- .../TestStdAlgorithmsTeamUniqueCopy.cpp | 10 +- ...estStdAlgorithmsTransformExclusiveScan.cpp | 4 +- ...estStdAlgorithmsTransformInclusiveScan.cpp | 4 +- .../unit_tests/TestStdAlgorithmsUnique.cpp | 2 +- .../TestStdAlgorithmsUniqueCopy.cpp | 4 +- .../algorithms/unit_tests/TestStdReducers.cpp | 6 +- lib/kokkos/appveyor.yml | 10 - lib/kokkos/benchmarks/CMakeLists.txt | 20 +- lib/kokkos/benchmarks/atomic/CMakeLists.txt | 5 +- .../benchmarks/bytes_and_flops/CMakeLists.txt | 9 +- .../bytes_and_flops/bench_unroll_stride.hpp | 6 +- lib/kokkos/benchmarks/gather/CMakeLists.txt | 5 +- lib/kokkos/benchmarks/gups/CMakeLists.txt | 5 +- lib/kokkos/benchmarks/gups/gups.cpp | 2 +- .../benchmarks/launch_latency/CMakeLists.txt | 5 +- .../launch_latency/launch_latency.cpp | 4 +- .../policy_performance/CMakeLists.txt | 5 +- lib/kokkos/benchmarks/stream/CMakeLists.txt | 5 +- .../view_copy_constructor/CMakeLists.txt | 5 +- lib/kokkos/bin/kokkos_launch_compiler | 4 +- lib/kokkos/cmake/Dependencies.cmake | 6 +- lib/kokkos/cmake/KokkosCore_config.h.in | 10 +- .../cmake/KokkosTrilinosConfig.cmake.in | 17 - lib/kokkos/cmake/Modules/CudaToolkit.cmake | 196 +- lib/kokkos/cmake/Modules/FindTPLCUDA.cmake | 68 +- lib/kokkos/cmake/Modules/FindTPLHPX.cmake | 11 +- 
lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake | 2 +- lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake | 2 +- .../cmake/Modules/FindTPLLIBQUADMATH.cmake | 20 +- lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake | 66 +- lib/kokkos/cmake/Modules/FindTPLROCM.cmake | 22 +- .../cmake/Modules/FindTPLROCTHRUST.cmake | 10 +- lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake | 23 +- lib/kokkos/cmake/README.md | 14 - lib/kokkos/cmake/build_env_info.cmake | 99 +- .../compile_tests/amd_apu.cc} | 24 +- lib/kokkos/cmake/cray.cmake | 11 +- lib/kokkos/cmake/deps/CUDA.cmake | 30 +- lib/kokkos/cmake/deps/HWLOC.cmake | 6 +- lib/kokkos/cmake/deps/Pthread.cmake | 38 +- lib/kokkos/cmake/deps/quadmath.cmake | 5 +- lib/kokkos/cmake/fake_tribits.cmake | 469 ++-- lib/kokkos/cmake/gnu.cmake | 38 +- lib/kokkos/cmake/intel.cmake | 29 +- lib/kokkos/cmake/kokkos_arch.cmake | 2153 +++++++++-------- lib/kokkos/cmake/kokkos_check_env.cmake | 27 +- lib/kokkos/cmake/kokkos_compiler_id.cmake | 437 ++-- .../cmake/kokkos_configure_trilinos.cmake | 38 + lib/kokkos/cmake/kokkos_corner_cases.cmake | 12 +- lib/kokkos/cmake/kokkos_enable_devices.cmake | 204 +- lib/kokkos/cmake/kokkos_enable_options.cmake | 336 +-- lib/kokkos/cmake/kokkos_functions.cmake | 1207 +++++---- lib/kokkos/cmake/kokkos_install.cmake | 76 +- lib/kokkos/cmake/kokkos_pick_cxx_std.cmake | 36 +- lib/kokkos/cmake/kokkos_test_cxx_std.cmake | 285 +-- lib/kokkos/cmake/kokkos_tpls.cmake | 224 +- lib/kokkos/cmake/kokkos_tribits.cmake | 702 +++--- lib/kokkos/cmake/msvc.cmake | 20 +- lib/kokkos/cmake/pgi.cmake | 10 +- lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake | 7 +- lib/kokkos/cmake/tpls/FindTPLPthread.cmake | 35 +- lib/kokkos/cmake/tpls/FindTPLquadmath.cmake | 5 +- lib/kokkos/containers/CMakeLists.txt | 14 +- .../performance_tests/CMakeLists.txt | 17 +- .../performance_tests/TestScatterView.hpp | 8 +- lib/kokkos/containers/src/CMakeLists.txt | 34 +- lib/kokkos/containers/src/Kokkos_Bitset.hpp | 2 +- lib/kokkos/containers/src/Kokkos_DualView.hpp | 241 +- 
.../containers/src/Kokkos_DynRankView.hpp | 1455 ++++------- .../containers/src/Kokkos_DynamicView.hpp | 104 +- .../containers/src/Kokkos_OffsetView.hpp | 961 ++------ .../containers/src/Kokkos_ScatterView.hpp | 78 +- .../containers/src/Kokkos_StaticCrsGraph.hpp | 2 +- .../containers/src/Kokkos_UnorderedMap.hpp | 16 +- lib/kokkos/containers/src/Kokkos_Vector.hpp | 5 +- .../containers/unit_tests/CMakeLists.txt | 67 +- .../containers/unit_tests/TestBitset.hpp | 2 +- .../containers/unit_tests/TestDualView.hpp | 117 +- .../unit_tests/TestDynRankViewTypedefs.cpp | 260 ++ .../TestDynRankView_TeamScratch.hpp | 72 + .../containers/unit_tests/TestDynViewAPI.hpp | 25 +- .../containers/unit_tests/TestDynamicView.hpp | 33 +- .../unit_tests/TestErrorReporter.hpp | 5 +- .../containers/unit_tests/TestOffsetView.hpp | 210 +- .../containers/unit_tests/TestScatterView.hpp | 18 +- .../unit_tests/TestStaticCrsGraph.hpp | 18 +- .../unit_tests/TestUnorderedMap.hpp | 5 +- .../TestViewCtorPropEmbeddedDim.hpp | 16 +- .../unit_tests/TestWithoutInitializing.hpp | 28 +- lib/kokkos/core/CMakeLists.txt | 30 +- lib/kokkos/core/perf_test/CMakeLists.txt | 246 +- lib/kokkos/core/perf_test/PerfTestHexGrad.cpp | 4 +- .../perf_test/PerfTest_CustomReduction.cpp | 2 - .../PerfTest_ExecSpacePartitioning.cpp | 3 +- .../core/perf_test/PerfTest_ViewCopy_Raw.cpp | 2 - .../core/perf_test/PerfTest_ViewFill_Raw.cpp | 2 - .../perf_test/PerfTest_ViewResize_Raw.cpp | 2 - lib/kokkos/core/perf_test/test_mempool.cpp | 4 +- .../core/perf_test/test_sharedSpace.cpp | 2 +- lib/kokkos/core/perf_test/test_taskdag.cpp | 9 + lib/kokkos/core/src/CMakeLists.txt | 292 ++- lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp | 1 - lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp | 9 +- lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp | 18 +- .../src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp | 41 +- .../core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp | 32 +- .../core/src/Cuda/Kokkos_Cuda_Instance.cpp | 24 +- .../src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 13 +- 
.../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 58 +- .../src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 2 +- .../src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 37 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp | 11 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp | 188 +- .../src/Cuda/Kokkos_Cuda_Vectorization.hpp | 18 +- .../core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp | 11 +- lib/kokkos/core/src/HIP/Kokkos_HIP.cpp | 60 +- .../HIP/Kokkos_HIP_BlockSize_Deduction.hpp | 5 +- .../src/HIP/Kokkos_HIP_GraphNodeKernel.hpp | 43 +- .../core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 37 +- .../core/src/HIP/Kokkos_HIP_Instance.cpp | 40 +- .../core/src/HIP/Kokkos_HIP_Instance.hpp | 10 +- .../core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 44 +- .../HIP/Kokkos_HIP_ParallelFor_MDRange.hpp | 6 +- .../src/HIP/Kokkos_HIP_ParallelFor_Range.hpp | 4 +- .../src/HIP/Kokkos_HIP_ParallelFor_Team.hpp | 20 +- .../HIP/Kokkos_HIP_ParallelReduce_Team.hpp | 79 +- .../HIP/Kokkos_HIP_SharedAllocationRecord.cpp | 2 +- .../HIP/Kokkos_HIP_SharedAllocationRecord.hpp | 2 +- .../src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp | 10 +- lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp | 49 +- lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp | 44 +- lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp | 191 +- .../src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp | 3 +- .../core/src/HIP/Kokkos_HIP_Vectorization.hpp | 22 +- .../core/src/HIP/Kokkos_HIP_ZeroMemset.cpp | 36 + .../core/src/HIP/Kokkos_HIP_ZeroMemset.hpp | 21 +- lib/kokkos/core/src/HPX/Kokkos_HPX.hpp | 117 +- lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp | 11 + .../core/src/KokkosExp_MDRangePolicy.hpp | 94 +- lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp | 8 +- lib/kokkos/core/src/Kokkos_Array.hpp | 40 +- lib/kokkos/core/src/Kokkos_Atomic.hpp | 1 - .../Kokkos_Atomics_Desul_Volatile_Wrapper.hpp | 196 -- .../core/src/Kokkos_Atomics_Desul_Wrapper.hpp | 305 +-- lib/kokkos/core/src/Kokkos_Complex.hpp | 30 +- lib/kokkos/core/src/Kokkos_Concepts.hpp | 51 +- lib/kokkos/core/src/Kokkos_CopyViews.hpp | 346 ++- 
lib/kokkos/core/src/Kokkos_Core.hpp | 10 +- lib/kokkos/core/src/Kokkos_Core_fwd.hpp | 9 +- lib/kokkos/core/src/Kokkos_Crs.hpp | 14 +- lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp | 6 +- lib/kokkos/core/src/Kokkos_ExecPolicy.hpp | 156 +- lib/kokkos/core/src/Kokkos_Extents.hpp | 2 +- lib/kokkos/core/src/Kokkos_Future.hpp | 37 +- lib/kokkos/core/src/Kokkos_Graph.hpp | 69 +- lib/kokkos/core/src/Kokkos_GraphNode.hpp | 86 +- lib/kokkos/core/src/Kokkos_HostSpace.hpp | 33 +- lib/kokkos/core/src/Kokkos_Layout.hpp | 36 +- lib/kokkos/core/src/Kokkos_Macros.hpp | 64 +- lib/kokkos/core/src/Kokkos_MemoryPool.hpp | 11 +- lib/kokkos/core/src/Kokkos_NumericTraits.hpp | 2 +- lib/kokkos/core/src/Kokkos_Pair.hpp | 6 +- lib/kokkos/core/src/Kokkos_Parallel.hpp | 24 +- .../core/src/Kokkos_Parallel_Reduce.hpp | 181 +- .../src/Kokkos_Profiling_ProfileSection.hpp | 2 +- .../src/Kokkos_Profiling_ScopedRegion.hpp | 2 +- lib/kokkos/core/src/Kokkos_ScratchSpace.hpp | 2 +- lib/kokkos/core/src/Kokkos_TaskScheduler.hpp | 68 +- .../core/src/Kokkos_TaskScheduler_fwd.hpp | 43 +- lib/kokkos/core/src/Kokkos_Timer.hpp | 2 +- lib/kokkos/core/src/Kokkos_Tuners.hpp | 125 +- lib/kokkos/core/src/Kokkos_TypeInfo.hpp | 103 + lib/kokkos/core/src/Kokkos_View.hpp | 2014 +-------------- .../core/src/Kokkos_WorkGraphPolicy.hpp | 4 +- .../core/src/OpenACC/Kokkos_OpenACC.cpp | 55 + .../core/src/OpenACC/Kokkos_OpenACC.hpp | 5 +- .../core/src/OpenACC/Kokkos_OpenACCSpace.hpp | 10 + .../OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp | 2 +- .../src/OpenACC/Kokkos_OpenACC_Instance.cpp | 15 +- .../src/OpenACC/Kokkos_OpenACC_Instance.hpp | 3 +- .../Kokkos_OpenACC_ParallelFor_MDRange.hpp | 620 +++-- .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 560 ++++- .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 111 +- .../Kokkos_OpenACC_ParallelScan_Range.hpp | 2 +- .../src/OpenACC/Kokkos_OpenACC_Traits.hpp | 5 +- lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp | 14 +- .../core/src/OpenMP/Kokkos_OpenMP_Task.hpp | 11 + 
.../src/OpenMPTarget/Kokkos_OpenMPTarget.hpp | 2 - .../OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp | 81 +- .../Kokkos_OpenMPTarget_DeepCopy.hpp | 101 + .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 130 - .../Kokkos_OpenMPTarget_FunctorAdapter.hpp | 48 + .../Kokkos_OpenMPTarget_Instance.cpp | 88 +- .../Kokkos_OpenMPTarget_Instance.hpp | 21 +- .../Kokkos_OpenMPTarget_Parallel.hpp | 41 +- ...okkos_OpenMPTarget_ParallelFor_MDRange.hpp | 129 +- .../Kokkos_OpenMPTarget_ParallelFor_Range.hpp | 24 +- .../Kokkos_OpenMPTarget_ParallelFor_Team.hpp | 41 +- ...os_OpenMPTarget_ParallelReduce_MDRange.hpp | 316 +-- ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 24 +- ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 76 +- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 58 +- .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 216 +- .../Kokkos_OpenMPTarget_Reducer.hpp | 160 +- .../OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp | 251 -- .../OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp | 319 --- lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp | 12 +- lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp | 15 +- .../core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp | 36 +- .../src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp | 37 +- .../src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp | 14 +- .../core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp | 64 +- .../core/src/SYCL/Kokkos_SYCL_Instance.cpp | 49 +- .../core/src/SYCL/Kokkos_SYCL_Instance.hpp | 45 +- .../src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp | 11 +- .../SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp | 13 +- .../SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 18 +- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 24 +- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 16 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 19 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 29 +- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 23 +- .../core/src/SYCL/Kokkos_SYCL_Space.cpp | 32 +- .../core/src/SYCL/Kokkos_SYCL_Space.hpp | 106 +- lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp | 163 +- 
.../core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp | 5 +- .../core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp | 15 +- .../core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp | 11 +- lib/kokkos/core/src/Serial/Kokkos_Serial.hpp | 5 +- .../Serial/Kokkos_Serial_Parallel_MDRange.hpp | 10 + .../Serial/Kokkos_Serial_Parallel_Range.hpp | 38 +- .../Serial/Kokkos_Serial_Parallel_Team.hpp | 20 +- .../core/src/Serial/Kokkos_Serial_Task.hpp | 15 +- .../Serial/Kokkos_Serial_WorkGraphPolicy.hpp | 4 +- .../src/Serial/Kokkos_Serial_ZeroMemset.hpp | 12 +- .../src/Threads/Kokkos_Threads_Instance.cpp | 20 +- .../src/Threads/Kokkos_Threads_Instance.hpp | 8 +- .../Kokkos_Threads_ParallelFor_MDRange.hpp | 4 +- .../Kokkos_Threads_ParallelFor_Range.hpp | 8 +- .../Kokkos_Threads_ParallelFor_Team.hpp | 24 +- .../Kokkos_Threads_ParallelReduce_MDRange.hpp | 10 +- .../Kokkos_Threads_ParallelReduce_Range.hpp | 8 +- .../Kokkos_Threads_ParallelReduce_Team.hpp | 13 +- .../Kokkos_Threads_ParallelScan_Range.hpp | 8 +- .../src/Threads/Kokkos_Threads_Spinwait.cpp | 2 +- .../src/Threads/Kokkos_Threads_Spinwait.hpp | 3 +- .../core/src/Threads/Kokkos_Threads_Team.hpp | 171 +- .../Kokkos_Threads_WorkGraphPolicy.hpp | 4 +- lib/kokkos/core/src/View/Kokkos_BasicView.hpp | 652 +++++ lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp | 308 +-- .../Kokkos_ViewAtomic.hpp} | 10 +- .../src/{impl => View}/Kokkos_ViewCtor.hpp | 87 +- .../Kokkos_ViewDataAnalysis.hpp | 15 +- .../core/src/View/Kokkos_ViewLegacy.hpp | 1604 ++++++++++++ .../src/{impl => View}/Kokkos_ViewMapping.hpp | 453 ++-- .../src/{impl => View}/Kokkos_ViewTracker.hpp | 0 .../core/src/View/Kokkos_ViewTraits.hpp | 457 ++++ .../{impl => View}/Kokkos_ViewUniformType.hpp | 12 +- .../View/MDSpan/Kokkos_MDSpan_Accessor.hpp | 203 +- .../src/View/MDSpan/Kokkos_MDSpan_Layout.hpp | 119 +- .../core/src/decl/Kokkos_Declare_CUDA.hpp | 2 + .../core/src/decl/Kokkos_Declare_SYCL.hpp | 10 + lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp | 2 - .../src/impl/KokkosExp_Host_IterateTile.hpp | 71 +- 
.../src/impl/KokkosExp_IterateTileGPU.hpp | 8 +- .../core/src/impl/Kokkos_AnalyzePolicy.hpp | 2 +- lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp | 16 +- lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp | 11 +- .../core/src/impl/Kokkos_Combined_Reducer.hpp | 26 +- .../core/src/impl/Kokkos_ConcurrentBitset.hpp | 30 +- lib/kokkos/core/src/impl/Kokkos_Core.cpp | 25 +- .../impl/Kokkos_Default_GraphNodeKernel.hpp | 39 +- .../impl/Kokkos_Default_GraphNode_Impl.hpp | 30 +- .../src/impl/Kokkos_Default_Graph_Impl.hpp | 37 +- lib/kokkos/core/src/impl/Kokkos_EBO.hpp | 20 +- .../core/src/impl/Kokkos_ExecPolicy.cpp | 2 +- .../core/src/impl/Kokkos_ExecSpaceManager.hpp | 8 +- .../src/impl/Kokkos_FixedBufferMemoryPool.hpp | 279 --- .../core/src/impl/Kokkos_FunctorAnalysis.hpp | 62 +- lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp | 6 +- .../src/impl/Kokkos_GraphImpl_Utilities.hpp | 6 +- .../impl/Kokkos_GraphNodeCustomization.hpp | 2 +- .../core/src/impl/Kokkos_GraphNodeImpl.hpp | 43 +- .../impl/Kokkos_Half_FloatingPointWrapper.hpp | 26 +- .../core/src/impl/Kokkos_HostBarrier.hpp | 6 +- lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp | 8 +- .../src/impl/Kokkos_HostSpace_ZeroMemset.hpp | 9 +- .../src/impl/Kokkos_HostSpace_deepcopy.cpp | 23 +- .../src/impl/Kokkos_HostSpace_deepcopy.hpp | 6 +- .../core/src/impl/Kokkos_HostThreadTeam.cpp | 6 +- .../core/src/impl/Kokkos_HostThreadTeam.hpp | 158 +- lib/kokkos/core/src/impl/Kokkos_LIFO.hpp | 8 +- .../core/src/impl/Kokkos_LinkedListNode.hpp | 2 +- .../src/impl/Kokkos_MemoryPoolAllocator.hpp | 103 - .../src/impl/Kokkos_MultipleTaskQueue.hpp | 57 +- lib/kokkos/core/src/impl/Kokkos_Profiling.cpp | 2 - lib/kokkos/core/src/impl/Kokkos_Profiling.hpp | 30 +- .../src/impl/Kokkos_Profiling_C_Interface.h | 2 +- .../src/impl/Kokkos_Profiling_Interface.hpp | 9 + .../core/src/impl/Kokkos_SharedAlloc.cpp | 29 +- .../core/src/impl/Kokkos_SharedAlloc.hpp | 18 +- .../src/impl/Kokkos_SimpleTaskScheduler.hpp | 9 + .../core/src/impl/Kokkos_SingleTaskQueue.hpp | 12 +- 
.../core/src/impl/Kokkos_Stacktrace.cpp | 8 +- .../src/impl/Kokkos_StringManipulation.hpp | 10 +- lib/kokkos/core/src/impl/Kokkos_TaskBase.hpp | 28 +- lib/kokkos/core/src/impl/Kokkos_TaskNode.hpp | 22 +- .../core/src/impl/Kokkos_TaskPolicyData.hpp | 16 +- lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp | 22 +- .../core/src/impl/Kokkos_TaskQueueCommon.hpp | 12 +- .../impl/Kokkos_TaskQueueMemoryManager.hpp | 5 + .../src/impl/Kokkos_TaskQueueMultiple.hpp | 22 +- .../impl/Kokkos_TaskQueueMultiple_impl.hpp | 5 + .../core/src/impl/Kokkos_TaskQueue_impl.hpp | 5 + .../core/src/impl/Kokkos_TaskResult.hpp | 5 + .../core/src/impl/Kokkos_TaskTeamMember.hpp | 8 +- .../core/src/impl/Kokkos_Tools_Generic.hpp | 304 ++- lib/kokkos/core/src/impl/Kokkos_Traits.hpp | 5 +- .../core/src/impl/Kokkos_ZeroMemset_fwd.hpp | 2 +- lib/kokkos/core/src/impl/Kokkos_hwloc.cpp | 2 +- .../core/src/setup/Kokkos_Setup_Cuda.hpp | 8 + .../core/src/setup/Kokkos_Setup_HIP.hpp | 17 + .../core/src/setup/Kokkos_Setup_SYCL.hpp | 8 + .../core/src/traits/Kokkos_IndexTypeTrait.hpp | 6 +- .../traits/Kokkos_IterationPatternTrait.hpp | 2 +- .../traits/Kokkos_OccupancyControlTrait.hpp | 9 +- .../core/src/traits/Kokkos_WorkTagTrait.hpp | 4 +- lib/kokkos/core/unit_test/CMakeLists.txt | 1455 +++++------ .../core/unit_test/IncrementalTest.cpp.in | 2 - lib/kokkos/core/unit_test/Makefile | 21 +- lib/kokkos/core/unit_test/TestAbort.hpp | 11 +- lib/kokkos/core/unit_test/TestArray.cpp | 96 + lib/kokkos/core/unit_test/TestArrayOps.hpp | 5 + .../core/unit_test/TestAtomicOperations.hpp | 138 +- .../TestAtomicOperations_complexdouble.hpp | 2 +- .../TestAtomicOperations_complexfloat.hpp | 4 + .../unit_test/TestAtomicOperations_double.hpp | 4 + .../unit_test/TestAtomicOperations_float.hpp | 4 + .../unit_test/TestAtomicOperations_int.hpp | 4 + .../TestAtomicOperations_longint.hpp | 4 + .../TestAtomicOperations_longlongint.hpp | 4 + .../unit_test/TestAtomicOperations_shared.hpp | 4 + .../TestAtomicOperations_unsignedint.hpp | 4 + 
.../TestAtomicOperations_unsignedlongint.hpp | 4 + ...stAtomicOperations_unsignedlonglongint.hpp | 4 + lib/kokkos/core/unit_test/TestAtomicViews.hpp | 291 +-- lib/kokkos/core/unit_test/TestAtomics.hpp | 283 +-- .../unit_test/TestBitManipulationBuiltins.hpp | 4 +- ...e_d.cpp => TestCStyleMemoryManagement.cpp} | 27 +- lib/kokkos/core/unit_test/TestCTestDevice.cpp | 76 +- lib/kokkos/core/unit_test/TestCXX11.hpp | 21 +- .../core/unit_test/TestCompilerMacros.cpp | 6 +- lib/kokkos/core/unit_test/TestComplex.hpp | 48 +- lib/kokkos/core/unit_test/TestConcepts.hpp | 5 +- .../core/unit_test/TestDeepCopyAlignment.hpp | 6 +- .../core/unit_test/TestDetectionIdiom.cpp | 16 +- .../unit_test/TestExecSpacePartitioning.hpp | 10 +- .../unit_test/TestExecSpaceThreadSafety.hpp | 53 +- .../core/unit_test/TestExecutionSpace.hpp | 3 +- .../core/unit_test/TestFunctorAnalysis.hpp | 30 +- lib/kokkos/core/unit_test/TestGraph.hpp | 564 ++++- .../core/unit_test/TestHalfConversion.hpp | 4 - .../core/unit_test/TestHalfOperators.hpp | 121 +- .../TestHostSharedPtrAccessOnDevice.hpp | 16 +- lib/kokkos/core/unit_test/TestInit.hpp | 3 - .../unit_test/TestInitializationSettings.cpp | 11 +- lib/kokkos/core/unit_test/TestInterOp.cpp | 75 +- .../core/unit_test/TestIrregularLayout.hpp | 4 +- .../core/unit_test/TestLocalDeepCopy.hpp | 2 - lib/kokkos/core/unit_test/TestMDRange.hpp | 36 +- .../TestMDRangePolicyConstructors.hpp | 59 + .../core/unit_test/TestMDRangeReduce.hpp | 2 - lib/kokkos/core/unit_test/TestMDRange_g.hpp | 2 +- .../core/unit_test/TestMDSpanConversion.hpp | 51 + .../unit_test/TestMathematicalFunctions.hpp | 213 +- .../TestMathematicalSpecialFunctions.hpp | 40 +- lib/kokkos/core/unit_test/TestMemoryPool.hpp | 5 +- .../core/unit_test/TestNestedReducerCTAD.cpp | 8 +- .../core/unit_test/TestNumericTraits.hpp | 26 +- .../TestParseCmdLineArgsAndEnvVars.cpp | 5 +- lib/kokkos/core/unit_test/TestRange.hpp | 25 - .../unit_test/TestRangePolicyConstructors.hpp | 105 +- 
.../core/unit_test/TestRangePolicyRequire.hpp | 25 - lib/kokkos/core/unit_test/TestReduce.hpp | 47 +- .../unit_test/TestReduceCombinatorical.hpp | 26 +- lib/kokkos/core/unit_test/TestReducers.hpp | 267 +- lib/kokkos/core/unit_test/TestSharedAlloc.hpp | 4 +- lib/kokkos/core/unit_test/TestSharedSpace.cpp | 8 +- .../TestSpaceAwareAccessorAccessViolation.hpp | 2 +- lib/kokkos/core/unit_test/TestStackTrace.hpp | 2 + .../core/unit_test/TestTaskScheduler.hpp | 53 +- lib/kokkos/core/unit_test/TestTeam.hpp | 315 ++- lib/kokkos/core/unit_test/TestTeamBasic.hpp | 2 +- .../unit_test/TestTeamCombinedReducers.hpp | 6 - lib/kokkos/core/unit_test/TestTeamMDRange.hpp | 10 +- .../unit_test/TestTeamMDRangePolicyCTAD.cpp | 4 +- .../core/unit_test/TestTeamReductionScan.hpp | 109 +- lib/kokkos/core/unit_test/TestTeamScan.hpp | 21 +- lib/kokkos/core/unit_test/TestTeamScratch.hpp | 2 - lib/kokkos/core/unit_test/TestTeamVector.hpp | 22 +- lib/kokkos/core/unit_test/TestTypeInfo.cpp | 74 + lib/kokkos/core/unit_test/TestTypeList.cpp | 8 +- lib/kokkos/core/unit_test/TestUtilities.hpp | 12 +- lib/kokkos/core/unit_test/TestViewAPI.hpp | 34 +- lib/kokkos/core/unit_test/TestViewAPI_b.hpp | 30 + lib/kokkos/core/unit_test/TestViewAPI_e.hpp | 23 +- .../core/unit_test/TestViewBadAlloc.hpp | 12 +- lib/kokkos/core/unit_test/TestViewCopy_b.hpp | 4 +- .../core/unit_test/TestViewCtorDimMatch.hpp | 46 +- .../core/unit_test/TestViewCtorProp.hpp | 95 + .../unit_test/TestViewCtorPropEmbeddedDim.hpp | 16 +- .../core/unit_test/TestViewIsAssignable.hpp | 40 +- .../core/unit_test/TestViewMapping_a.hpp | 213 +- .../unit_test/TestViewMapping_subview.hpp | 8 +- .../TestViewMemoryAccessViolation.hpp | 2 +- .../unit_test/TestViewOutOfBoundsAccess.hpp | 2 +- lib/kokkos/core/unit_test/TestViewRank.cpp | 4 +- lib/kokkos/core/unit_test/TestViewSubview.hpp | 182 +- .../core/unit_test/TestViewTypedefs.cpp | 274 +++ lib/kokkos/core/unit_test/TestView_64bit.hpp | 2 - .../unit_test/TestWithoutInitializing.hpp | 50 +- 
.../UnitTest_CMakePassCmdLineArgs.cpp | 9 +- .../UnitTest_CMakeTriBITSCompatibility.cpp | 33 + .../TestCudaHostPinned_Category.hpp | 4 +- .../TestSYCLHostUSM_Category.hpp | 2 +- .../TestSYCLSharedUSM_Category.hpp | 2 +- .../category_files/TestSYCL_Category.hpp | 2 +- .../unit_test/cuda/TestCuda_InterOp_Graph.cpp | 151 ++ .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 2 +- .../default/TestDefaultDeviceTypeViewAPI.cpp | 32 +- lib/kokkos/core/unit_test/diffconfig.sh | 18 - .../headers_self_contained/CMakeLists.txt | 14 +- .../unit_test/hip/TestHIP_InterOp_Graph.cpp | 127 + .../core/unit_test/hip/TestHIP_Spaces.cpp | 6 +- .../hip/TestHIP_UnifiedMemory_ZeroMemset.cpp | 44 + .../incremental/Test01_execspace.hpp | 4 + .../Test04_ParallelFor_RangePolicy.hpp | 2 +- .../Test05_ParallelReduce_RangePolicy.hpp | 8 +- .../incremental/Test10_HierarchicalBasics.hpp | 4 +- .../Test11a_ParallelFor_TeamThreadRange.hpp | 2 +- .../Test11b_ParallelFor_TeamVectorRange.hpp | 2 +- .../Test11c_ParallelFor_ThreadVectorRange.hpp | 2 +- .../incremental/Test12a_ThreadScratch.hpp | 10 +- .../incremental/Test12b_TeamScratch.hpp | 7 +- .../Test13c_ParallelRed_ThreadVectorRange.hpp | 2 +- .../incremental/Test16_ParallelScan.hpp | 6 +- .../unit_test/sycl/TestSYCL_InterOp_Graph.cpp | 114 + .../unit_test/sycl/TestSYCL_InterOp_Init.cpp | 2 +- .../sycl/TestSYCL_InterOp_Init_Context.cpp | 11 +- .../sycl/TestSYCL_InterOp_Streams.cpp | 2 +- .../core/unit_test/sycl/TestSYCL_Spaces.cpp | 273 +-- .../sycl/TestSYCL_TeamScratchStreams.cpp | 34 +- lib/kokkos/core/unit_test/testmake.sh | 18 - .../unit_test/tools/TestEventCorrectness.hpp | 41 +- .../core/unit_test/tools/TestKernelNames.cpp | 219 ++ .../unit_test/tools/TestProfilingSection.cpp | 12 +- .../core/unit_test/tools/TestScopedRegion.cpp | 12 +- .../core/unit_test/tools/TestTuning.cpp | 14 +- .../tools/include/ToolTestingUtilities.hpp | 144 +- .../core/unit_test/view/TestBasicView.hpp | 264 ++ .../view/TestBasicViewMDSpanConversion.cpp | 95 + 
.../view/TestExtentsDatatypeConversion.cpp | 6 +- .../view/TestReferenceCountedAccessor.hpp | 156 ++ .../view/TestReferenceCountedDataHandle.hpp | 208 ++ lib/kokkos/example/CMakeLists.txt | 11 +- .../example/query_device/CMakeLists.txt | 15 +- .../example/query_device/query_device.cpp | 2 +- .../relocatable_function/CMakeLists.txt | 6 + .../example/relocatable_function/Makefile | 33 + .../relocatable_function/functor.cpp} | 6 +- .../example/relocatable_function/main.cpp | 50 + .../tutorial/01_hello_world/CMakeLists.txt | 11 +- .../01_hello_world_lambda/CMakeLists.txt | 11 +- .../hello_world_lambda.cpp | 5 +- .../tutorial/02_simple_reduce/CMakeLists.txt | 10 +- .../02_simple_reduce_lambda/CMakeLists.txt | 11 +- .../simple_reduce_lambda.cpp | 14 +- .../tutorial/03_simple_view/CMakeLists.txt | 10 +- .../tutorial/03_simple_view/simple_view.cpp | 2 +- .../03_simple_view_lambda/CMakeLists.txt | 10 +- .../simple_view_lambda.cpp | 26 +- .../04_simple_memoryspaces/CMakeLists.txt | 10 +- .../simple_memoryspaces.cpp | 2 +- .../tutorial/05_simple_atomics/CMakeLists.txt | 11 +- .../06_simple_mdrangepolicy/CMakeLists.txt | 10 +- .../01_data_layouts/CMakeLists.txt | 10 +- .../02_memory_traits/CMakeLists.txt | 10 +- .../Advanced_Views/03_subviews/CMakeLists.txt | 10 +- .../04_dualviews/CMakeLists.txt | 10 +- .../Advanced_Views/04_dualviews/dual_view.cpp | 6 +- .../05_NVIDIA_UVM/CMakeLists.txt | 16 +- .../tutorial/Advanced_Views/CMakeLists.txt | 15 +- .../01_random_numbers/CMakeLists.txt | 5 + .../tutorial/Algorithms/CMakeLists.txt | 1 + lib/kokkos/example/tutorial/CMakeLists.txt | 26 +- .../01_thread_teams/CMakeLists.txt | 10 +- .../01_thread_teams_lambda/CMakeLists.txt | 11 +- .../thread_teams_lambda.cpp | 5 +- .../02_nested_parallel_for/CMakeLists.txt | 10 +- .../03_vectorization/CMakeLists.txt | 11 +- .../04_team_scan/CMakeLists.txt | 11 +- .../Hierarchical_Parallelism/CMakeLists.txt | 10 +- .../tutorial/launch_bounds/CMakeLists.txt | 10 +- 
.../launch_bounds/launch_bounds_reduce.cpp | 5 +- lib/kokkos/master_history.txt | 1 + lib/kokkos/simd/CMakeLists.txt | 8 +- lib/kokkos/simd/src/CMakeLists.txt | 25 +- lib/kokkos/simd/src/Kokkos_SIMD.hpp | 2 +- lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp | 72 +- lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp | 98 +- lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp | 80 +- lib/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp | 2 +- lib/kokkos/simd/unit_tests/CMakeLists.txt | 12 +- .../unit_tests/include/SIMDTesting_Ops.hpp | 4 + .../unit_tests/include/TestSIMD_MathOps.hpp | 15 +- .../include/TestSIMD_Reductions.hpp | 7 + .../include/TestSIMD_WhereExpressions.hpp | 8 +- lib/kokkos/tpls/.clang-format | 1 - .../include/desul/atomics/Atomic_Ref.hpp | 16 + .../desul/atomics/Compare_Exchange_SYCL.hpp | 8 + .../atomics/Lock_Based_Fetch_Op_SYCL.hpp | 8 + .../experimental/__p0009_bits/layout_left.hpp | 3 + .../__p0009_bits/layout_right.hpp | 3 + .../__p0009_bits/layout_stride.hpp | 36 +- .../experimental/__p0009_bits/utility.hpp | 100 + .../__p2630_bits/submdspan_extents.hpp | 119 +- .../__p2630_bits/submdspan_mapping.hpp | 58 +- .../__p2642_bits/layout_padded.hpp | 26 +- .../__p2642_bits/layout_padded_fwd.hpp | 6 + 617 files changed, 21499 insertions(+), 17255 deletions(-) delete mode 100644 lib/kokkos/HOW_TO_SNAPSHOT delete mode 100644 lib/kokkos/appveyor.yml delete mode 100644 lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in rename lib/kokkos/{core/unit_test/sycl/TestSYCL_Task.cpp => cmake/compile_tests/amd_apu.cc} (57%) create mode 100644 lib/kokkos/cmake/kokkos_configure_trilinos.cmake create mode 100644 lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp create mode 100644 lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp create mode 100644 lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp delete mode 100644 lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp create mode 100644 lib/kokkos/core/src/Kokkos_TypeInfo.hpp create mode 100644 
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp create mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp create mode 100644 lib/kokkos/core/src/View/Kokkos_BasicView.hpp rename lib/kokkos/core/src/{impl/Kokkos_Atomic_View.hpp => View/Kokkos_ViewAtomic.hpp} (96%) rename lib/kokkos/core/src/{impl => View}/Kokkos_ViewCtor.hpp (84%) rename lib/kokkos/core/src/{impl => View}/Kokkos_ViewDataAnalysis.hpp (96%) create mode 100644 lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp rename lib/kokkos/core/src/{impl => View}/Kokkos_ViewMapping.hpp (90%) rename lib/kokkos/core/src/{impl => View}/Kokkos_ViewTracker.hpp (100%) create mode 100644 lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp rename lib/kokkos/core/src/{impl => View}/Kokkos_ViewUniformType.hpp (88%) delete mode 100644 lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp rename lib/kokkos/core/unit_test/{default/TestDefaultDeviceType_d.cpp => TestCStyleMemoryManagement.cpp} (73%) create mode 100644 lib/kokkos/core/unit_test/TestTypeInfo.cpp create mode 100644 lib/kokkos/core/unit_test/TestViewCtorProp.hpp create mode 100644 lib/kokkos/core/unit_test/TestViewTypedefs.cpp create mode 100644 lib/kokkos/core/unit_test/UnitTest_CMakeTriBITSCompatibility.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Graph.cpp delete mode 100755 lib/kokkos/core/unit_test/diffconfig.sh create mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Graph.cpp create mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_UnifiedMemory_ZeroMemset.cpp create mode 100644 lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Graph.cpp delete mode 100755 
lib/kokkos/core/unit_test/testmake.sh create mode 100644 lib/kokkos/core/unit_test/tools/TestKernelNames.cpp create mode 100644 lib/kokkos/core/unit_test/view/TestBasicView.hpp create mode 100644 lib/kokkos/core/unit_test/view/TestBasicViewMDSpanConversion.cpp create mode 100644 lib/kokkos/core/unit_test/view/TestReferenceCountedAccessor.hpp create mode 100644 lib/kokkos/core/unit_test/view/TestReferenceCountedDataHandle.hpp create mode 100644 lib/kokkos/example/relocatable_function/CMakeLists.txt create mode 100644 lib/kokkos/example/relocatable_function/Makefile rename lib/kokkos/{core/src/impl/KokkosExp_ViewMapping.hpp => example/relocatable_function/functor.cpp} (81%) create mode 100644 lib/kokkos/example/relocatable_function/main.cpp create mode 100644 lib/kokkos/example/tutorial/Algorithms/01_random_numbers/CMakeLists.txt create mode 100644 lib/kokkos/example/tutorial/Algorithms/CMakeLists.txt diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 7b1d69e566..6c237ebca8 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,7 +1,101 @@ # CHANGELOG +## 4.5.00 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.01...4.5.00) + +### Features + +* SYCL backend graduated to production ready +* Introduce new `SequentialHostInit` view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) (backported in 4.4.01) +* Support building with Run-Time Type Information (RTTI) disabled +* Add new `KOKKOS_RELOCATABLE_FUNCTION` function annotation macro [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +### Backend and Architecture Enhancements + +#### CUDA + +* Adding occupancy tuning for CUDA architectures [\#6788](https://github.com/kokkos/kokkos/pull/6788) +* By default disable `cudaMallocAsync` (i.e., revert the change made in version 4.2) [\#7353](https://github.com/kokkos/kokkos/pull/7353) + +#### HIP + +* Add support for AMD Phoenix APUs with Radeon 740M/760M/780M/880M/890M 
[\#7162](https://github.com/kokkos/kokkos/pull/7162) +* Update maximum waves per CU values for consumer card [\#7347](https://github.com/kokkos/kokkos/pull/7347) +* Check that Kokkos is running on the architecture it was compiled for [\#7379](https://github.com/kokkos/kokkos/pull/7379) +* Add opt-in option to use `hipMallocAsync` instead of `hipMalloc` [\#7324](https://github.com/kokkos/kokkos/pull/7324) +* Introduce new architecture option `AMD_GFX942_APU` for MI300A [\#7462](https://github.com/kokkos/kokkos/pull/7462) + +#### SYCL + +* Move the `SYCL` backend out of the `Experimental` namespace [\#7171](https://github.com/kokkos/kokkos/pull/7171) +* Introduce `KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE` as CMake option [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +#### OpenACC + +* Add support for building with the Clacc compiler [\#7198](https://github.com/kokkos/kokkos/pull/7198) +* Workaround NVHPC collapse clause bug for `MDRangePolicy` [\#7425](https://github.com/kokkos/kokkos/pull/7425) + +#### HPX + +* Implement `Experimental::partition_space` to produce truly independent execution spaces [\#7287](https://github.com/kokkos/kokkos/pull/7287) + +#### Threads + +* Fix compilation for `parallel_reduce` `MDRange` with `Dynamic` scheduling [\#7478](https://github.com/kokkos/kokkos/pull/7478) +* Fix race conditions on ARM architectures [\#7498](https://github.com/kokkos/kokkos/pull/7498) + +#### OpenMP + +* Fix run time behavior when compiling with `-fvisibility-hidden` [\#7284](https://github.com/kokkos/kokkos/pull/7284) (backported in 4.4.01) +* Fix linking with Cray Clang compiler [\#7341](https://github.com/kokkos/kokkos/pull/7341) + +#### Serial + +* Allow `Kokkos_ENABLE_ATOMICS_BYPASS` to skip mutexes to remediate performance regression in 4.4 [\#7369](https://github.com/kokkos/kokkos/pull/7369) + +### General Enhancements + +* Improve `View` initialization/destruction for non-scalar trivial and trivially-destructible types 
[\#7219](https://github.com/kokkos/kokkos/pull/7219) [\#7225](https://github.com/kokkos/kokkos/pull/7225) +* Add getters for default tile sizes used in `MDRangePolicy` [\#6839](https://github.com/kokkos/kokkos/pull/6839) +* Improve performance of `Kokkos::sort` when `std::sort` is used [\#7264](https://github.com/kokkos/kokkos/pull/7264) +* Add range-based for loop support for `Array` [\#7293](https://github.com/kokkos/kokkos/pull/7293) +* Allow functors as reducers for nested team parallel reduce [\#6921](https://github.com/kokkos/kokkos/pull/6921) +* Avoid making copies of string rvalue reference arguments to `view_alloc()` [\#7364](https://github.com/kokkos/kokkos/pull/7364) +* Add `atomic_{mod,xor,nand,lshift,rshift}` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Allow using `SequentialHostInit` with `Kokkos::DualView` [\#7456](https://github.com/kokkos/kokkos/pull/7456) +* Add `Graph::instantiate()` [\#7240](https://github.com/kokkos/kokkos/pull/7240) +* Allow an arbitrary execution space instance to be used in `Kokkos::Graph::submit()` [\#7249](https://github.com/kokkos/kokkos/pull/7249) +* Enable compile-time diagnostic of illegal reduction target for graphs [\#7460](https://github.com/kokkos/kokkos/pull/7460) + +### Build System Changes + +* Make sure backend-specific options such as `IMPL_CUDA_MALLOC_ASYNC` only show when that backend is actually enabled [\#7228](https://github.com/kokkos/kokkos/pull/7228) +* Major refactoring removing `TriBITS` paths [\#6164](https://github.com/kokkos/kokkos/pull/6164) +* Add support for SpacemiT K60 (RISC-V) [\#7160](https://github.com/kokkos/kokkos/pull/7160) + +### Deprecations + +* Deprecate Tasking interface [\#7393](https://github.com/kokkos/kokkos/pull/7393) +* Deprecate `atomic_query_version`, `atomic_assign`, `atomic_compare_exchange_strong`, `atomic_{inc, dec}rement` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Deprecate `{OpenMP,HPX}::is_asynchronous()` 
[\#7322](https://github.com/kokkos/kokkos/pull/7322) + +### Bug Fixes + +* Fix undefined behavior in `BinSort` when sorting within bins on host [\#7223](https://github.com/kokkos/kokkos/pull/7223) +* Using CUDA limits to set extents for blocks, grids [\#7235](https://github.com/kokkos/kokkos/pull/7235) +* Fix `deep_copy (serial_exec, dst, src)` with multiple host backends [\#7245](https://github.com/kokkos/kokkos/pull/7245) +* Skip `RangePolicy` bounds conversion checks if roundtrip convertibility is not provided [\#7172](https://github.com/kokkos/kokkos/pull/7172) +* Allow extracting host and device views from `DualView` with `const` value type [\#7242](https://github.com/kokkos/kokkos/pull/7242) +* Fix `TeamPolicy` array reduction for CUDA and HIP [\#6296](https://github.com/kokkos/kokkos/pull/6296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix configuring without architecture flags for SYCL [\#7303](https://github.com/kokkos/kokkos/pull/7303) +* Set an initial value index during join of `MinLoc`, `MaxLoc` or `MinMaxLoc` [\#7330](https://github.com/kokkos/kokkos/pull/7330) +* Fix storage lifetime of driver for global launch of graph nodes for CUDA and HIP [\#7365](https://github.com/kokkos/kokkos/pull/7365) +* Make `value_type` for `RandomAccessIterator` non-`const` [\#7485](https://github.com/kokkos/kokkos/pull/7485) + ## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01) -[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.00...4.4.01) ### Features: * Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) @@ -13,7 +107,7 @@ ### Bug Fixes * OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284) -* Fix 
implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) ## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00) @@ -57,6 +151,7 @@ * SIMD: Allow flexible vector width for 32 bit types [\#6802](https://github.com/kokkos/kokkos/pull/6802) * Updates for `Kokkos::Array`: add `kokkos_swap(Array)` specialization [\#6943](https://github.com/kokkos/kokkos/pull/6943), add `Kokkos::to_array` [\#6375](https://github.com/kokkos/kokkos/pull/6375), make `Kokkos::Array` equality-comparable [\#7148](https://github.com/kokkos/kokkos/pull/7148) * Structured binding support for `Kokkos::complex` [\#7040](https://github.com/kokkos/kokkos/pull/7040) +* Introduce `KOKKOS_DEDUCTION_GUIDE` macro to allow for portable user-defined deduction guides [\#6954](https://github.com/kokkos/kokkos/pull/6954) ### Build System Changes * Do not require OpenMP support for languages other than CXX [\#6965](https://github.com/kokkos/kokkos/pull/6965) @@ -1388,7 +1483,7 @@ **Closed issues:** - Silent error (Validate storage level arg to set_scratch_size) [\#3097](https://github.com/kokkos/kokkos/issues/3097) -- Remove KOKKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) +- Remove KOKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) - Cuda 11 -\> allow C++17 [\#3083](https://github.com/kokkos/kokkos/issues/3083) - In source build failure not explained [\#3081](https://github.com/kokkos/kokkos/issues/3081) - Allow naming of Views for initialization kernel [\#3070](https://github.com/kokkos/kokkos/issues/3070) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 736cbac218..f0bf8e3634 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ 
-1,12 +1,11 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR) # Disable in-source builds to prevent source tree corruption. -if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" ) - message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." ) -endif() - -if (COMMAND TRIBITS_PACKAGE) - TRIBITS_PACKAGE(Kokkos) +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") + message( + FATAL_ERROR + "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." + ) endif() # We want to determine if options are given with the wrong case @@ -15,143 +14,142 @@ endif() # form a list of all the given variables. If it begins with any # case of KoKkOS, we add it to the list. -GET_CMAKE_PROPERTY(_variableNames VARIABLES) -SET(KOKKOS_GIVEN_VARIABLES) -FOREACH (var ${_variableNames}) - STRING(TOUPPER ${var} UC_VAR) - STRING(FIND ${UC_VAR} KOKKOS IDX) - IF (${IDX} EQUAL 0) - LIST(APPEND KOKKOS_GIVEN_VARIABLES ${var}) - ENDIF() -ENDFOREACH() +get_cmake_property(_variableNames VARIABLES) +set(KOKKOS_GIVEN_VARIABLES) +foreach(var ${_variableNames}) + string(TOUPPER ${var} UC_VAR) + string(FIND ${UC_VAR} KOKKOS IDX) + if(${IDX} EQUAL 0) + list(APPEND KOKKOS_GIVEN_VARIABLES ${var}) + endif() +endforeach() # Basic initialization (Used in KOKKOS_SETTINGS) -SET(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) + +set(PACKAGE_NAME Kokkos) +set(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -# Is this a build as part of 
Trilinos? -IF(COMMAND TRIBITS_PACKAGE_DECL) - SET(KOKKOS_HAS_TRILINOS ON) -ELSE() - SET(KOKKOS_HAS_TRILINOS OFF) - SET(PACKAGE_NAME Kokkos) - SET(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -ENDIF() # Is this build a subdirectory of another project -GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY) +get_directory_property(HAS_PARENT PARENT_DIRECTORY) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) - -SET(KOKKOS_ENABLED_OPTIONS) #exported in config file -SET(KOKKOS_ENABLED_DEVICES) #exported in config file -SET(KOKKOS_ENABLED_TPLS) #exported in config file -SET(KOKKOS_ENABLED_ARCH_LIST) #exported in config file +set(KOKKOS_ENABLED_OPTIONS) #exported in config file +set(KOKKOS_ENABLED_DEVICES) #exported in config file +set(KOKKOS_ENABLED_TPLS) #exported in config file +set(KOKKOS_ENABLED_ARCH_LIST) #exported in config file #These are helper flags used for sanity checks during config #Certain features should depend on other features being configured first -SET(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies -SET(KOKKOS_CFG_DAG_DEVICES_DONE Off) -SET(KOKKOS_CFG_DAG_OPTIONS_DONE Off) -SET(KOKKOS_CFG_DAG_ARCH_DONE Off) -SET(KOKKOS_CFG_DAG_CXX_STD_DONE Off) -SET(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) -FUNCTION(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) - SET(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) - SET(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) - IF (NOT ${PRE_FLAG}) - MESSAGE(FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured") - ENDIF() - GLOBAL_SET(${POST_FLAG} On) -ENDFUNCTION() +set(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies +set(KOKKOS_CFG_DAG_DEVICES_DONE Off) +set(KOKKOS_CFG_DAG_OPTIONS_DONE Off) +set(KOKKOS_CFG_DAG_ARCH_DONE Off) +set(KOKKOS_CFG_DAG_CXX_STD_DONE Off) 
+set(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) +function(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) + set(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) + set(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) + if(NOT ${PRE_FLAG}) + message( + FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured" + ) + endif() + global_set(${POST_FLAG} On) +endfunction() +list(APPEND CMAKE_MODULE_PATH cmake/Modules) -LIST(APPEND CMAKE_MODULE_PATH cmake/Modules) +set(CMAKE_DISABLE_SOURCE_CHANGES ON) +set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) -IF(NOT KOKKOS_HAS_TRILINOS) - set(CMAKE_DISABLE_SOURCE_CHANGES ON) - set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) +# What language are we compiling Kokkos as +# downstream dependencies need to match this! +set(KOKKOS_COMPILE_LANGUAGE CXX) +# use lower case here since we didn't parse options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) - # What language are we compiling Kokkos as - # downstream dependencies need to match this! - SET(KOKKOS_COMPILE_LANGUAGE CXX) - # use lower case here since we didn't parse options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as CUDA only + # because otherwise the C++ features don't work etc. + # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even + # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 + # days. + set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as CUDA only - # because otherwise the C++ features don't work etc. - # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even - # though it is a C++ extension ... 
(but I guess it didn't use to be back in CUDA 4 or 5 - # days. - SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + set(KOKKOS_COMPILE_LANGUAGE CUDA) +endif() +# use lower case here since we haven't parsed options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) - SET(KOKKOS_COMPILE_LANGUAGE CUDA) - ENDIF() - # use lower case here since we haven't parsed options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as HIP only + # because otherwise the C++ features don't work etc. + set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as HIP only - # because otherwise the C++ features don't work etc. - SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + set(KOKKOS_COMPILE_LANGUAGE HIP) +endif() - SET(KOKKOS_COMPILE_LANGUAGE HIP) - ENDIF() +if(Spack_WORKAROUND) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + message(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") + endif() - IF (Spack_WORKAROUND) - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - MESSAGE(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") - ENDIF() - - #if we are explicitly using Spack for development, - #nuke the Spack compiler - SET(SPACK_CXX $ENV{SPACK_CXX}) - IF(SPACK_CXX) - SET(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) - SET(ENV{CXX} ${SPACK_CXX}) - ENDIF() - ENDIF() - # Always call the project command to define Kokkos_ variables - # and to make sure that C++ is an enabled language - PROJECT(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) - IF(NOT HAS_PARENT) - IF (NOT CMAKE_BUILD_TYPE) - 
SET(DEFAULT_BUILD_TYPE "RelWithDebInfo") - MESSAGE(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") - SET(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING - "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." - FORCE) - ENDIF() - ENDIF() -ELSE() - SET(KOKKOS_COMPILE_LANGUAGE CXX) -ENDIF() - -IF (NOT CMAKE_SIZEOF_VOID_P) - STRING(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) - IF (NOT FIND_IDX STREQUAL -1) - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation") - ENDIF() -ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) - IF(CMAKE_SIZEOF_VOID_P EQUAL 4) - MESSAGE(WARNING "32-bit builds are experimental and not officially supported.") - SET(KOKKOS_IMPL_32BIT ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;") - ENDIF() -ENDIF() + #if we are explicitly using Spack for development, + #nuke the Spack compiler + set(SPACK_CXX $ENV{SPACK_CXX}) + if(SPACK_CXX) + set(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) + set(ENV{CXX} ${SPACK_CXX}) + endif() +endif() +# Always call the project command to define Kokkos_ variables +# and to make sure that C++ is an enabled language +project(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) +if(NOT HAS_PARENT) + if(NOT CMAKE_BUILD_TYPE) + set(DEFAULT_BUILD_TYPE "RelWithDebInfo") + message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") + set(CMAKE_BUILD_TYPE 
"${DEFAULT_BUILD_TYPE}" + CACHE STRING "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." FORCE + ) + endif() +endif() +if(NOT CMAKE_SIZEOF_VOID_P) + string(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) + if(NOT FIND_IDX STREQUAL -1) + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured." + ) + else() + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation" + ) + endif() +elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + message(WARNING "32-bit builds are experimental and not officially supported.") + set(KOKKOS_IMPL_32BIT ON) + else() + message( + FATAL_ERROR + "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;" + ) + endif() +endif() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 4) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 5) +set(Kokkos_VERSION_PATCH 0) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -164,58 +162,54 @@ math(EXPR KOKKOS_VERSION_PATCH "${KOKKOS_VERSION} % 100") # Load either the real TriBITS or a TriBITS wrapper # for certain utility functions that are universal (like GLOBAL_SET) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) +include(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) -IF (Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # If we are building CUDA, we have tricked CMake because we declare a CXX project # If the 
default C++ standard for a given compiler matches the requested # standard, then CMake just omits the -std flag in later versions of CMake # This breaks CUDA compilation (CUDA compiler can have a different default # -std then the underlying host compiler by itself). Setting this variable # forces CMake to always add the -std flag even if it thinks it doesn't need it - GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98) -ENDIF() + global_set(CMAKE_CXX_STANDARD_DEFAULT 98) +endif() # These are the variables we will append to as we go # I really wish these were regular variables # but scoping issues can make it difficult -GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) -GLOBAL_SET(KOKKOS_LINK_OPTIONS) -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) -GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) -GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) +global_set(KOKKOS_COMPILE_OPTIONS) +global_set(KOKKOS_LINK_OPTIONS) +global_set(KOKKOS_AMDGPU_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDAFE_OPTIONS) +global_set(KOKKOS_XCOMPILER_OPTIONS) # We need to append text here for making sure TPLs # we import are available for an installed Kokkos -GLOBAL_SET(KOKKOS_TPL_EXPORTS) +global_set(KOKKOS_TPL_EXPORTS) # KOKKOS_DEPENDENCE is used by kokkos_launch_compiler -GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) +global_set(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) # MSVC never goes through kokkos_launch_compiler -IF(NOT MSVC) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) -ENDIF() +if(NOT MSVC) + global_append(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +endif() -IF(Kokkos_ENABLE_TESTS AND NOT KOKKOS_HAS_TRILINOS) +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/kokkos_configure_trilinos.cmake) + +if(Kokkos_ENABLE_TESTS) find_package(GTest QUIET) -ENDIF() +endif() # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or TriBITS # These are functions like KOKKOS_INCLUDE_DIRECTORIES -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) - 
+include(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) # Check the environment and set certain variables # to allow platform-specific checks -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) -IF(NOT KOKKOS_HAS_TRILINOS) - # This does not work in Trilinos and we simply don't care - # to fix it for Trilinos - # Gather information about the runtime environment - INCLUDE(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) - check_git_setup() -ENDIF() +include(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) +check_git_setup() # The build environment setup goes in the following steps # 1) Check all the enable options. This includes checking Kokkos_DEVICES @@ -223,102 +217,54 @@ ENDIF() # 3) Check the CXX standard and select important CXX flags # 4) Check for any third-party libraries (TPLs) like hwloc # 5) Check if optimizing for a particular architecture and add arch-specific flags -KOKKOS_SETUP_BUILD_ENVIRONMENT() +kokkos_setup_build_environment() # Finish off the build # 6) Recurse into subdirectories and configure individual libraries # 7) Export and install targets -OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(BUILD_SHARED_LIBS "Build shared libraries" OFF) -SET(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) -SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) +set(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) +set_property(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) -IF (KOKKOS_HAS_TRILINOS) - SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) - SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR}) - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSEIF(HAS_PARENT) - SET(KOKKOS_HEADER_DIR "include/kokkos") - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSE() - SET(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") - SET(KOKKOS_IS_SUBDIRECTORY FALSE) -ENDIF() +if(HAS_PARENT) + 
set(KOKKOS_HEADER_DIR "include/kokkos") + set(KOKKOS_IS_SUBDIRECTORY TRUE) +else() + set(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") + set(KOKKOS_IS_SUBDIRECTORY FALSE) +endif() #------------------------------------------------------------------------------ # # A) Forward declare the package so that certain options are also defined for # subpackages -## This restores the old behavior of ProjectCompilerPostConfig.cmake -# We must do this before KOKKOS_PACKAGE_DECL -IF (KOKKOS_HAS_TRILINOS) - # Overwrite the old flags at the top-level - # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags - # we have to match the annoying behavior, also we have to preserve quotes - # which needs another workaround. - SET(KOKKOS_COMPILE_OPTIONS_TMP) - IF (KOKKOS_ENABLE_HIP) - LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS}) - ENDIF() - FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) - STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) - IF(OPTION_HAS_WHITESPACE EQUAL -1) - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}") - ELSE() - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"") - ENDIF() - ENDFOREACH() - STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) - IF (KOKKOS_ENABLE_CUDA) - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS}) - ENDIF() - FOREACH(XCOMP_FLAG ${KOKKOS_XCOMPILER_OPTIONS}) - SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG}) - ENDFOREACH() - IF (KOKKOS_ENABLE_CUDA) - STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}") - FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS}) - SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG}) - ENDFOREACH() - ENDIF() - #These flags get set up in 
KOKKOS_PACKAGE_DECL, which means they - #must be configured before KOKKOS_PACKAGE_DECL - SET(KOKKOS_ALL_COMPILE_OPTIONS - $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_ALL_COMPILE_OPTIONS}>) -ENDIF() - - #------------------------------------------------------------------------------ # # D) Process the subpackages (subdirectories) for Kokkos # -KOKKOS_PROCESS_SUBPACKAGES() - +kokkos_process_subpackages() #------------------------------------------------------------------------------ # # E) If Kokkos itself is enabled, process the Kokkos package # -KOKKOS_PACKAGE_POSTPROCESS() -KOKKOS_CONFIGURE_CORE() +kokkos_configure_core() -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - ADD_LIBRARY(kokkos INTERFACE) +if(NOT Kokkos_INSTALL_TESTING) + add_library(kokkos INTERFACE) #Make sure in-tree projects can reference this as Kokkos:: #to match the installed target names - ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos) + add_library(Kokkos::kokkos ALIAS kokkos) # all_libs target is required for TriBITS-compliance - ADD_LIBRARY(Kokkos::all_libs ALIAS kokkos) - TARGET_LINK_LIBRARIES(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos) -ENDIF() -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) + add_library(Kokkos::all_libs ALIAS kokkos) + target_link_libraries(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) + kokkos_internal_add_library_install(kokkos) +endif() +include(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler. # Kokkos needs nvcc_wrapper in order to build. Other libraries and @@ -327,16 +273,15 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # as relative to ${CMAKE_INSTALL_PATH}.
# KOKKOS_INSTALL_ADDITIONAL_FILES will install nvcc wrapper and other generated # files -KOKKOS_INSTALL_ADDITIONAL_FILES() - +kokkos_install_additional_files() # Finally - if we are a subproject - make sure the enabled devices are visible -IF (HAS_PARENT) - FOREACH(DEV Kokkos_ENABLED_DEVICES) +if(HAS_PARENT) + foreach(DEV Kokkos_ENABLED_DEVICES) #I would much rather not make these cache variables or global properties, but I can't #make any guarantees on whether PARENT_SCOPE is good enough to make #these variables visible where I need them - SET(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) - SET_PROPERTY(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) - ENDFOREACH() -ENDIF() + set(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) + set_property(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) + endforeach() +endif() diff --git a/lib/kokkos/CONTRIBUTING.md b/lib/kokkos/CONTRIBUTING.md index b4f3057cef..e97f8c4d89 100644 --- a/lib/kokkos/CONTRIBUTING.md +++ b/lib/kokkos/CONTRIBUTING.md @@ -7,6 +7,8 @@ We actively welcome pull requests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. +Before sending your patch for review, please try to ensure that it is formatted properly. We use clang-format version 16 for this. + ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. diff --git a/lib/kokkos/HOW_TO_SNAPSHOT b/lib/kokkos/HOW_TO_SNAPSHOT deleted file mode 100644 index ad3f78efb4..0000000000 --- a/lib/kokkos/HOW_TO_SNAPSHOT +++ /dev/null @@ -1,73 +0,0 @@ - -Developers of Kokkos (those who commit modifications to Kokkos) -must maintain the snapshot of Kokkos in the Trilinos repository. - -This file contains instructions for how to -snapshot Kokkos from github.com/kokkos to Trilinos. - ------------------------------------------------------------------------- -*** EVERYTHING GOES RIGHT WORKFLOW *** - -1) Given a 'git clone' of Kokkos and of Trilinos repositories. 
-1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone. - This path *must* terminate with the directory name 'kokkos'; - e.g., ${HOME}/kokkos . -1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory. - -2) Given that the Kokkos build & test is clean and - changes are committed to the Kokkos clone. - -3) Snapshot the current commit in the Kokkos clone into the Trilinos clone. - This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}: - ${KOKKOS}/scripts/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages - -4) Verify the snapshot commit happened as expected - cd ${TRILINOS}/packages/kokkos - git log -1 --name-only - -5) Modify, build, and test Trilinos with the Kokkos snapshot. - -6) Given that that the Trilinos build & test is clean and - changes are committed to the Trilinos clone. - -7) Attempt push to the Kokkos repository. - If push fails then you must 'remove the Kokkos snapshot' - from your Trilinos clone. - See below. - -8) Attempt to push to the Trilinos repository. - If updating for a failed push requires you to change Kokkos you must - 'remove the Kokkos snapshot' from your Trilinos clone. - See below. - ------------------------------------------------------------------------- -*** WHEN SOMETHING GOES WRONG AND YOU MUST *** -*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE *** - -1) Query the Trilinos clone commit log. - git log --oneline - -2) Note the <SHA1> of the commit to the Trillinos clone - immediately BEFORE the Kokkos snapshot commit. - Copy this <SHA1> for use in the next command. - -3) IF more than one outstanding commit then you can remove just the - Kokkos snapshot commit with 'git rebase -i'. Edit the rebase file. - Remove or comment out the Kokkos snapshot commit entry. - git rebase -i <SHA1> - -4) IF the Kokkos snapshot commit is the one and only - outstanding commit then remove just than commit.
- git reset --hard HEAD~1 - ------------------------------------------------------------------------- -*** REGARDING 'snapshot.py' TOOL *** - -The 'snapshot.py' tool is developed and maintained by the -Center for Computing Research (CCR) -Software Engineering, Maintenance, and Support (SEMS) team. - -Contact Brent Perschbacher for questions> - ------------------------------------------------------------------------- - diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 6b627dcc36..75dcbb9536 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -11,8 +11,8 @@ CXXFLAGS += $(SHFLAGS) endif KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 4 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 5 +KOKKOS_VERSION_PATCH = 0 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -40,16 +40,19 @@ KOKKOS_TRIBITS ?= "no" KOKKOS_STANDALONE_CMAKE ?= "no" # Default settings specific options. -# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,disable_malloc_async -KOKKOS_CUDA_OPTIONS ?= "disable_malloc_async" +# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,enable_malloc_async +KOKKOS_CUDA_OPTIONS ?= "" -# Options: rdc +# Options: rdc,enable_malloc_async KOKKOS_HIP_OPTIONS ?= "" # Default settings specific options. 
# Options: enable_async_dispatch KOKKOS_HPX_OPTIONS ?= "" +#Options : force_host_as_device +KOKKOS_OPENACC_OPTIONS ?= "" + # Helper functions for conversion to upper case uppercase_TABLE:=a,A b,B c,C d,D e,E f,F g,G h,H i,I j,J k,K l,L m,M n,N o,O p,P q,Q r,R s,S t,T u,U v,V w,W x,X y,Y z,Z uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$(wordlist 2,$(words $1),$1),$2)),$2) @@ -92,7 +95,7 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS), KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc) KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr) -KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),disable_malloc_async) +KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_malloc_async) KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) # deprecated KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics) @@ -103,6 +106,8 @@ KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPT KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings) KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc) +KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),enable_malloc_async) +KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE := $(call kokkos_has_string,$(KOKKOS_OPENACC_OPTIONS),force_host_as_device) # Check for Kokkos Host Execution Spaces one of which must be on. 
KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP) @@ -178,7 +183,7 @@ KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2 KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_NVHPC := $(strip $(shell $(CXX) --version 2>&1 | grep -c "nvc++")) KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) -KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++")) +KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -v "error:" | grep -c "clang++")) KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI) KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) @@ -292,6 +297,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) # Set OpenACC flags. 
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) KOKKOS_INTERNAL_OPENACC_FLAG := -acc + else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_OPENACC_FLAG := -fopenacc -fopenacc-fake-async-wait -fopenacc-implicit-worker=vector -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version -Wno-pass-failed else $(error Makefile.kokkos: OpenACC is enabled but the compiler must be NVHPC (got version string $(KOKKOS_CXX_VERSION))) endif @@ -411,8 +418,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH) @@ -466,6 +473,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1103) +KOKKOS_INTERNAL_USE_ARCH_AMD := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103)) # Any AVX? 
KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) @@ -561,6 +576,9 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE") + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -733,7 +751,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) + ifeq ($(KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC */") @@ -1024,86 +1042,122 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--offload-arch + endif +endif + # Do not add this flag if its the cray compiler or the nvhpc compiler. 
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 0) - ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) - # Lets start with adding architecture defines - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + # Lets start with adding architecture defines + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) + tmp := $(call 
kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ARCH_PASCAL60") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) - tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ARCH_TURING75") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") + tmp 
:= $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 endif endif @@ -1119,6 +1173,9 @@ ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) + endif endif endif @@ -1126,43 +1183,43 @@ endif # Figure out the architecture flag for ROCm. ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX906") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx906\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx906 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX908") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx908\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx908 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX90A") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx90A\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx90a endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940 + tmp 
:= $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx940\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx940 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx942 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx942\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx942 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1030") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1030\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1030 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1100") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1100\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1100 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1103") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1103 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1103\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1103 endif @@ -1171,8 +1228,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp) - KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) - 
KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) + KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE") @@ -1182,6 +1239,21 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_CXXFLAGS+=-fno-gpu-rdc KOKKOS_LDFLAGS+=-fno-gpu-rdc endif + + ifeq ($(KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC") + else + tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC */") + endif +endif + +ifneq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 0) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + endif + endif endif # Figure out Intel architecture flags. @@ -1235,6 +1307,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) KOKKOS_LDFLAGS+=-fsycl KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) + + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE") endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) @@ -1322,6 +1396,8 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/MDSpan/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp) @@ -1374,6 +1450,48 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LIBS += $(KOKKOS_INTERNAL_OPENACC_LIB) + ifeq 
($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + ifneq ($(CUDA_PATH),) + ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1) + CUDA_PATH := $(CUDA_PATH:/compilers=/cuda) + endif + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(CUDA_PATH),) + KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 + endif + KOKKOS_LIBS += -lcudart + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_LIBS += -cuda + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(ROCM_PATH),) + KOKKOS_CPPFLAGS += -I$(ROCM_PATH)/include + KOKKOS_LDFLAGS += -L$(ROCM_PATH)/lib + endif + KOKKOS_LIBS += -lamdhip64 + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=multicore + endif + else + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. 
+ ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=gpu,multicore + endif + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -1484,7 +1602,11 @@ else endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) - tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") + endif else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") endif @@ -1512,6 +1634,12 @@ $(DESUL_CONFIG_HEADER): KOKKOS_CPP_DEPENDS := $(DESUL_CONFIG_HEADER) KokkosCore_config.h $(KOKKOS_HEADERS) +# Tasking is deprecated +ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + TMP_KOKKOS_SRC := $(KOKKOS_SRC) + KOKKOS_SRC = $(patsubst %Task.cpp,, $(TMP_KOKKOS_SRC)) +endif + KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o) KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index e8e429e027..be535eea3e 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -16,8 +16,6 @@ Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ho $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp -Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp @@ -38,17 +36,21 @@ Kokkos_Abort.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort. ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp +endif Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp endif @@ -73,6 +75,8 @@ Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp 
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp +Kokkos_HIP_ZeroMemset.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp endif @@ -89,26 +93,26 @@ Kokkos_OpenMP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_Ope $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP.cpp Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) -Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp Kokkos_OpenMPTarget_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp Kokkos_OpenMPTargetSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp -Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md index c8c6f8f7cf..0ea07f9ea2 100644 --- a/lib/kokkos/README.md +++ b/lib/kokkos/README.md @@ -30,12 +30,12 @@ To start learning about Kokkos: The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -The current release is [4.3.01](https://github.com/kokkos/kokkos/releases/tag/4.3.01). +The current release is [4.5.00](https://github.com/kokkos/kokkos/releases/tag/4.5.00). 
```bash -curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +curl -OJ -L https://github.com/kokkos/kokkos/releases/download/4.5.00/kokkos-4.5.00.tar.gz # Or with wget -wget https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +wget https://github.com/kokkos/kokkos/releases/download/4.5.00/kokkos-4.5.00.tar.gz ``` To clone the latest development version of Kokkos from GitHub: diff --git a/lib/kokkos/algorithms/CMakeLists.txt b/lib/kokkos/algorithms/CMakeLists.txt index 368984647e..73ce9f7ec5 100644 --- a/lib/kokkos/algorithms/CMakeLists.txt +++ b/lib/kokkos/algorithms/CMakeLists.txt @@ -1,7 +1,7 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) - KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) -ENDIF() +if(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) + kokkos_add_test_directories(unit_tests) +endif() diff --git a/lib/kokkos/algorithms/src/CMakeLists.txt b/lib/kokkos/algorithms/src/CMakeLists.txt index b490caca62..9f10b85e02 100644 --- a/lib/kokkos/algorithms/src/CMakeLists.txt +++ b/lib/kokkos/algorithms/src/CMakeLists.txt @@ -1,34 +1,29 @@ #I have to leave these here for tribits -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -FILE(GLOB ALGO_HEADERS *.hpp) -FILE(GLOB ALGO_SOURCES *.cpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) +file(GLOB ALGO_HEADERS *.hpp) 
+file(GLOB ALGO_SOURCES *.cpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) #----------------------------------------------------------------------------- # We have to pass the sources in here for Tribits # These will get ignored for standalone CMake and a true interface library made -KOKKOS_ADD_INTERFACE_LIBRARY( - kokkosalgorithms - NOINSTALLHEADERS ${ALGO_HEADERS} - SOURCES ${ALGO_SOURCES} -) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_add_interface_library(kokkosalgorithms NOINSTALLHEADERS ${ALGO_HEADERS} SOURCES ${ALGO_SOURCES}) +kokkos_lib_include_directories( + kokkosalgorithms ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST) -KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) +kokkos_link_tpl(kokkoscontainers PUBLIC ROCTHRUST) +kokkos_link_tpl(kokkoscore PUBLIC ONEDPL) diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 7df12b8518..b28ea4c2ca 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -615,7 +615,7 @@ template struct Random_UniqueIndex { using locks_view_type = View; KOKKOS_FUNCTION - static int get_state_idx(const locks_view_type) { + static int get_state_idx(const locks_view_type&) { KOKKOS_IF_ON_HOST( (return DeviceType::execution_space::impl_hardware_thread_id();)) @@ -665,17 +665,16 @@ struct Random_UniqueIndex< #ifdef KOKKOS_ENABLE_SYCL template -struct Random_UniqueIndex< - Kokkos::Device> { +struct Random_UniqueIndex> { using locks_view_type = - View>; + View>; KOKKOS_FUNCTION static 
int get_state_idx(const locks_view_type& locks_) { auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; std::size_t gridDim[3] = { @@ -1121,7 +1120,7 @@ class Random_XorShift1024_Pool { using execution_space = typename device_type::execution_space; using locks_type = View; using int_view_type = View; - using state_data_type = View; + using state_data_type = View; locks_type locks_ = {}; state_data_type state_ = {}; diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp index 73e751f572..8e7de32a07 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp @@ -35,11 +35,11 @@ struct BinOp1D { #endif // Construct BinOp with number of bins, minimum value and maximum value - BinOp1D(int max_bins__, typename KeyViewType::const_value_type min, + BinOp1D(int max_bins, typename KeyViewType::const_value_type min, typename KeyViewType::const_value_type max) - : max_bins_(max_bins__ + 1), + : max_bins_(max_bins + 1), // Cast to double to avoid possible overflow when using integer - mul_(static_cast(max_bins__) / + mul_(static_cast(max_bins) / (static_cast(max) - static_cast(min))), min_(static_cast(min)) { // For integral types the number of bins may be larger than the range @@ -47,7 +47,7 @@ struct BinOp1D { // and then don't need to sort bins. 
if (std::is_integral::value && (static_cast(max) - static_cast(min)) <= - static_cast(max_bins__)) { + static_cast(max_bins)) { mul_ = 1.; } } @@ -82,16 +82,16 @@ struct BinOp3D { BinOp3D() = delete; #endif - BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], + BinOp3D(int max_bins[], typename KeyViewType::const_value_type min[], typename KeyViewType::const_value_type max[]) { - max_bins_[0] = max_bins__[0]; - max_bins_[1] = max_bins__[1]; - max_bins_[2] = max_bins__[2]; - mul_[0] = static_cast(max_bins__[0]) / + max_bins_[0] = max_bins[0]; + max_bins_[1] = max_bins[1]; + max_bins_[2] = max_bins[2]; + mul_[0] = static_cast(max_bins[0]) / (static_cast(max[0]) - static_cast(min[0])); - mul_[1] = static_cast(max_bins__[1]) / + mul_[1] = static_cast(max_bins[1]) / (static_cast(max[1]) - static_cast(min[1])); - mul_[2] = static_cast(max_bins__[2]) / + mul_[2] = static_cast(max_bins[2]) / (static_cast(max[2]) - static_cast(min[2])); min_[0] = static_cast(min[0]); min_[1] = static_cast(min[1]); diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp index c399279fe4..f417b6b13b 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp @@ -388,7 +388,8 @@ class BinSort { // reasonable experimentally. 
if (use_std_sort && bin_size > 10) { KOKKOS_IF_ON_HOST( - (std::sort(&sort_order(lower_bound), &sort_order(upper_bound), + (std::sort(sort_order.data() + lower_bound, + sort_order.data() + upper_bound, [this](int p, int q) { return bin_op(keys_rnd, p, q); });)) } else { for (int k = lower_bound + 1; k < upper_bound; ++k) { diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index 308e9e3a00..20026c77e4 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -53,9 +53,13 @@ void sort(const ExecutionSpace& exec, if constexpr (Impl::better_off_calling_std_sort_v) { exec.fence("Kokkos::sort without comparator use std::sort"); - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size()); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last); + } } else { Impl::sort_device_view_without_comparator(exec, view); } @@ -107,9 +111,13 @@ void sort(const ExecutionSpace& exec, if constexpr (Impl::better_off_calling_std_sort_v) { exec.fence("Kokkos::sort with comparator use std::sort"); - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last, comparator); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size(), comparator); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last, comparator); + } } else { Impl::sort_device_view_with_comparator(exec, view, comparator); } diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp 
index f11f807048..2a8f761d9b 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -30,6 +30,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -76,13 +77,10 @@ namespace Kokkos::Impl { template constexpr inline bool is_admissible_to_kokkos_sort_by_key = - ::Kokkos::is_view::value&& T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value); + ::Kokkos::is_view::value && T::rank() == 1 && + (std::is_same_v || + std::is_same_v || + std::is_same_v); template KOKKOS_INLINE_FUNCTION constexpr void @@ -144,7 +142,7 @@ void sort_by_key_rocthrust( #if defined(KOKKOS_ENABLE_ONEDPL) template -inline constexpr bool sort_on_device_v = +inline constexpr bool sort_on_device_v = std::is_same_v || std::is_same_v; @@ -152,7 +150,7 @@ inline constexpr bool sort_on_device_v = template void sort_by_key_onedpl( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values, MaybeComparator&&... 
maybeComparator) { @@ -176,7 +174,7 @@ template void applyPermutation(const ExecutionSpace& space, const PermutationView& permutation, const ViewType& view) { - static_assert(std::is_integral::value); + static_assert(std::is_integral_v); auto view_copy = Kokkos::create_mirror( Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, @@ -335,7 +333,7 @@ void sort_by_key_device_view_without_comparator( template void sort_by_key_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values) { #ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY @@ -392,7 +390,7 @@ void sort_by_key_device_view_with_comparator( template void sort_by_key_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values, const ComparatorType& comparator) { diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index 0894622891..734ce450f6 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -34,6 +34,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -146,7 +147,7 @@ void sort_via_binsort(const ExecutionSpace& exec, bool sort_in_bins = true; // TODO: figure out better max_bins then this ... 
int64_t max_bins = view.extent(0) / 2; - if (std::is_integral::value) { + if (std::is_integral_v) { // Cast to double to avoid possible overflow when using integer auto const max_val = static_cast(result.max_val); auto const min_val = static_cast(result.min_val); @@ -157,7 +158,7 @@ void sort_via_binsort(const ExecutionSpace& exec, sort_in_bins = false; } } - if (std::is_floating_point::value) { + if (std::is_floating_point_v) { KOKKOS_ASSERT(std::isfinite(static_cast(result.max_val) - static_cast(result.min_val))); } @@ -211,11 +212,11 @@ void sort_rocthrust(const HIP& space, #if defined(KOKKOS_ENABLE_ONEDPL) template -void sort_onedpl(const Kokkos::Experimental::SYCL& space, +void sort_onedpl(const Kokkos::SYCL& space, const Kokkos::View& view, MaybeComparator&&... maybeComparator) { using ViewType = Kokkos::View; - static_assert(SpaceAccessibility::accessible, "SYCL execution space is not able to access the memory space " "of the View argument!"); @@ -268,19 +269,29 @@ void copy_to_host_run_stdsort_copy_back( KE::copy(exec, view, view_dc); // run sort on the mirror of view_dc - auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); - auto first = KE::begin(mv_h); - auto last = KE::end(mv_h); - std::sort(first, last, std::forward(maybeComparator)...); + auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); + if (view.span_is_contiguous()) { + std::sort(mv_h.data(), mv_h.data() + mv_h.size(), + std::forward(maybeComparator)...); + } else { + auto first = KE::begin(mv_h); + auto last = KE::end(mv_h); + std::sort(first, last, std::forward(maybeComparator)...); + } Kokkos::deep_copy(exec, view_dc, mv_h); // copy back to argument view KE::copy(exec, KE::cbegin(view_dc), KE::cend(view_dc), KE::begin(view)); } else { auto view_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view); - auto first = KE::begin(view_h); - auto last = KE::end(view_h); - std::sort(first, last, std::forward(maybeComparator)...); + if 
(view.span_is_contiguous()) { + std::sort(view_h.data(), view_h.data() + view_h.size(), + std::forward(maybeComparator)...); + } else { + auto first = KE::begin(view_h); + auto last = KE::end(view_h); + std::sort(first, last, std::forward(maybeComparator)...); + } Kokkos::deep_copy(exec, view, view_h); } } @@ -310,7 +321,7 @@ void sort_device_view_without_comparator( #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& view) { using ViewType = Kokkos::View; static_assert( @@ -365,8 +376,7 @@ void sort_device_view_with_comparator( #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, - const Kokkos::View& view, + const Kokkos::SYCL& exec, const Kokkos::View& view, const ComparatorType& comparator) { using ViewType = Kokkos::View; static_assert( @@ -397,12 +407,12 @@ sort_device_view_with_comparator( // and then copies data back. Potentially, this can later be changed // with a better solution like our own quicksort on device or similar. - using ViewType = Kokkos::View; - using MemSpace = typename ViewType::memory_space; // Note with HIP unified memory this code path is still the right thing to do // if we end up here when RocThrust is not enabled. // The create_mirror_view_and_copy will do the right thing (no copy). 
-#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY + using ViewType = Kokkos::View; + using MemSpace = typename ViewType::memory_space; static_assert(!SpaceAccessibility::accessible, "Impl::sort_device_view_with_comparator: should not be called " "on a view that is already accessible on the host"); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp index b84f00f8bb..ea7e55ca61 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp @@ -91,7 +91,7 @@ template = 0> ValueType reduce(const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl( @@ -105,7 +105,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl(label, ex, first, last, @@ -119,7 +119,7 @@ template & view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -137,7 +137,7 @@ template & view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -157,7 +157,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_exespace_impl( @@ -172,7 +172,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return 
Impl::reduce_custom_functors_exespace_impl( @@ -186,7 +186,7 @@ template & view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -204,7 +204,7 @@ template & view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -258,7 +258,7 @@ template < KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_team_impl(teamHandle, first, last, @@ -273,7 +273,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View& view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -294,7 +294,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_team_impl(teamHandle, first, last, @@ -309,7 +309,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View& view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = 
::Kokkos::Experimental; diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp index 101f5113f6..89585ddbea 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp @@ -117,7 +117,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -136,7 +136,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -157,7 +157,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -182,7 +182,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -208,7 +208,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, 
IteratorType first1, IteratorType last1, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -228,7 +228,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -248,7 +248,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -270,7 +270,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -345,7 +345,7 @@ KOKKOS_FUNCTION ValueType transform_reduce( const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -366,7 +366,7 @@ transform_reduce(const TeamHandleType& 
teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -393,7 +393,7 @@ KOKKOS_FUNCTION ValueType transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -412,7 +412,7 @@ transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 54bb13e25b..da16141f5a 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -33,12 +33,12 @@ struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; template struct is_admissible_to_kokkos_std_algorithms< T, std::enable_if_t<::Kokkos::is_view::value && T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value)>> + (std::is_same_v || + std::is_same_v || + std::is_same_v)>> : std::true_type {}; template @@ -102,8 +102,8 @@ struct are_random_access_iterators; template struct are_random_access_iterators { static constexpr bool value = - is_iterator_v && 
std::is_base_of::value; + is_iterator_v && std::is_base_of_v; }; template @@ -165,9 +165,8 @@ struct iterators_have_matching_difference_type { template struct iterators_have_matching_difference_type { - static constexpr bool value = - std::is_same::value; + static constexpr bool value = std::is_same_v; }; template diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp index 9075562d46..dc910861d5 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdMoveBackwardFunctor { using index_type = typename IteratorType1::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdMoveBackwardFunctor requires signed index type"); IteratorType1 m_last; diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 5bce89e98f..e8c638c94c 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -36,18 +36,18 @@ class RandomAccessIterator< ::Kokkos::View > { using iterator_type = RandomAccessIterator; using iterator_category = std::random_access_iterator_tag; - using value_type = typename view_type::value_type; + using value_type = typename view_type::non_const_value_type; using difference_type = ptrdiff_t; using pointer = typename view_type::pointer_type; using reference = typename view_type::reference_type; static_assert(view_type::rank == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v || + std::is_same_v), "RandomAccessIterator only supports 1D Views with LayoutLeft, " 
"LayoutRight, LayoutStride."); @@ -61,9 +61,9 @@ class RandomAccessIterator< ::Kokkos::View > { #ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond template - requires(std::is_constructible_v) KOKKOS_FUNCTION - explicit(!std::is_convertible_v) - RandomAccessIterator(const RandomAccessIterator& other) + requires(std::is_constructible_v) + KOKKOS_FUNCTION explicit(!std::is_convertible_v) + RandomAccessIterator(const RandomAccessIterator& other) : m_view(other.m_view), m_current_index(other.m_current_index) {} #else template < diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index b4046c7645..e6caa07288 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdReverseFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdReverseFunctor requires signed index type"); InputIterator m_first; diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp index dd20d90e39..7aa0e4fc44 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdReverseCopyFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdReverseCopyFunctor requires signed index type"); InputIterator m_last; diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index db184bc8a9..31247af159 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ 
b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -1,12 +1,10 @@ - #Leave these here for now - I don't need transitive deps anyway -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) - -SET(ALGORITHM UnitTestMain.cpp) +set(ALGORITHM UnitTestMain.cpp) foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) string(TOUPPER ${Tag} DEVICE) @@ -23,21 +21,11 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # Generate a .cpp file for each one that runs it on the current backend (Tag), # and add this .cpp file to the sources for UnitTest_RandomAndSort. set(ALGO_SORT_SOURCES) - foreach(SOURCE_Input - TestSort - TestSortByKey - TestSortCustomComp - TestBinSortA - TestBinSortB - TestNestedSort - ) + foreach(SOURCE_Input TestSort TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB TestNestedSort) set(file ${dir}/${SOURCE_Input}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
- file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_SORT_SOURCES ${file}) endforeach() @@ -47,14 +35,9 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # ------------------------------------------ # do as above set(ALGO_RANDOM_SOURCES) - foreach(SOURCE_Input - TestRandom - ) + foreach(SOURCE_Input TestRandom) set(file ${dir}/${SOURCE_Input}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() @@ -65,11 +48,7 @@ endforeach() # std set A # ------------------------------------------ set(STDALGO_SOURCES_A) -foreach(Name - StdReducers - StdAlgorithmsConstraints - RandomAccessIterator - ) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator) list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) endforeach() @@ -77,10 +56,7 @@ endforeach() # std set B # ------------------------------------------ set(STDALGO_SOURCES_B) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsMinMaxElementOps - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps) list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) endforeach() @@ -88,22 +64,23 @@ endforeach() # std set C # ------------------------------------------ set(STDALGO_SOURCES_C) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsLexicographicalCompare - StdAlgorithmsForEach - StdAlgorithmsFind - StdAlgorithmsFindFirstOf - StdAlgorithmsFindEnd - StdAlgorithmsCount - StdAlgorithmsEqual - StdAlgorithmsAllAnyNoneOf - StdAlgorithmsAdjacentFind - StdAlgorithmsSearch - StdAlgorithmsSearch_n - StdAlgorithmsMismatch - StdAlgorithmsMoveBackward - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsLexicographicalCompare + 
StdAlgorithmsForEach + StdAlgorithmsFind + StdAlgorithmsFindFirstOf + StdAlgorithmsFindEnd + StdAlgorithmsCount + StdAlgorithmsEqual + StdAlgorithmsAllAnyNoneOf + StdAlgorithmsAdjacentFind + StdAlgorithmsSearch + StdAlgorithmsSearch_n + StdAlgorithmsMismatch + StdAlgorithmsMoveBackward +) list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) endforeach() @@ -111,27 +88,28 @@ endforeach() # std set D # ------------------------------------------ set(STDALGO_SOURCES_D) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsModOps - StdAlgorithmsModSeqOps - StdAlgorithmsReplace - StdAlgorithmsReplaceIf - StdAlgorithmsReplaceCopy - StdAlgorithmsReplaceCopyIf - StdAlgorithmsCopyIf - StdAlgorithmsUnique - StdAlgorithmsUniqueCopy - StdAlgorithmsRemove - StdAlgorithmsRemoveIf - StdAlgorithmsRemoveCopy - StdAlgorithmsRemoveCopyIf - StdAlgorithmsRotate - StdAlgorithmsRotateCopy - StdAlgorithmsReverse - StdAlgorithmsShiftLeft - StdAlgorithmsShiftRight - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsModOps + StdAlgorithmsModSeqOps + StdAlgorithmsReplace + StdAlgorithmsReplaceIf + StdAlgorithmsReplaceCopy + StdAlgorithmsReplaceCopyIf + StdAlgorithmsCopyIf + StdAlgorithmsUnique + StdAlgorithmsUniqueCopy + StdAlgorithmsRemove + StdAlgorithmsRemoveIf + StdAlgorithmsRemoveCopy + StdAlgorithmsRemoveCopyIf + StdAlgorithmsRotate + StdAlgorithmsRotateCopy + StdAlgorithmsReverse + StdAlgorithmsShiftLeft + StdAlgorithmsShiftRight +) list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) endforeach() @@ -139,20 +117,21 @@ endforeach() # std set E # ------------------------------------------ set(STDALGO_SOURCES_E) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsIsSorted - StdAlgorithmsIsSortedUntil - StdAlgorithmsPartitioningOps - StdAlgorithmsPartitionCopy - StdAlgorithmsNumerics - StdAlgorithmsAdjacentDifference - StdAlgorithmsExclusiveScan - StdAlgorithmsInclusiveScan - StdAlgorithmsTransformUnaryOp - StdAlgorithmsTransformExclusiveScan - StdAlgorithmsTransformInclusiveScan - ) +foreach( + 
Name + StdAlgorithmsCommon + StdAlgorithmsIsSorted + StdAlgorithmsIsSortedUntil + StdAlgorithmsPartitioningOps + StdAlgorithmsPartitionCopy + StdAlgorithmsNumerics + StdAlgorithmsAdjacentDifference + StdAlgorithmsExclusiveScan + StdAlgorithmsInclusiveScan + StdAlgorithmsTransformUnaryOp + StdAlgorithmsTransformExclusiveScan + StdAlgorithmsTransformInclusiveScan +) list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) endforeach() @@ -160,11 +139,7 @@ endforeach() # std team Q # ------------------------------------------ set(STDALGO_TEAM_SOURCES_Q) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamInclusiveScan - StdAlgorithmsTeamTransformInclusiveScan - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) endforeach() @@ -172,11 +147,7 @@ endforeach() # std team P # ------------------------------------------ set(STDALGO_TEAM_SOURCES_P) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamExclusiveScan - StdAlgorithmsTeamTransformExclusiveScan - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) endforeach() @@ -184,14 +155,9 @@ endforeach() # std team M # ------------------------------------------ set(STDALGO_TEAM_SOURCES_M) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamTransformUnaryOp - StdAlgorithmsTeamTransformBinaryOp - StdAlgorithmsTeamGenerate - StdAlgorithmsTeamGenerate_n - StdAlgorithmsTeamSwapRanges - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp StdAlgorithmsTeamTransformBinaryOp + StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges +) list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) endforeach() @@ -199,14 +165,9 @@ endforeach() # std team L # ------------------------------------------ set(STDALGO_TEAM_SOURCES_L) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamIsSorted 
- StdAlgorithmsTeamIsSortedUntil - StdAlgorithmsTeamIsPartitioned - StdAlgorithmsTeamPartitionCopy - StdAlgorithmsTeamPartitionPoint - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil + StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint +) list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) endforeach() @@ -214,13 +175,9 @@ endforeach() # std team I # ------------------------------------------ set(STDALGO_TEAM_SOURCES_I) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamUnique - StdAlgorithmsTeamAdjacentDifference - StdAlgorithmsTeamReduce - StdAlgorithmsTeamTransformReduce - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce + StdAlgorithmsTeamTransformReduce +) list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) endforeach() @@ -228,18 +185,19 @@ endforeach() # std team H # ------------------------------------------ set(STDALGO_TEAM_SOURCES_H) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamCopy - StdAlgorithmsTeamCopy_n - StdAlgorithmsTeamCopyBackward - StdAlgorithmsTeamCopyIf - StdAlgorithmsTeamUniqueCopy - StdAlgorithmsTeamRemove - StdAlgorithmsTeamRemoveIf - StdAlgorithmsTeamRemoveCopy - StdAlgorithmsTeamRemoveCopyIf - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamCopy + StdAlgorithmsTeamCopy_n + StdAlgorithmsTeamCopyBackward + StdAlgorithmsTeamCopyIf + StdAlgorithmsTeamUniqueCopy + StdAlgorithmsTeamRemove + StdAlgorithmsTeamRemoveIf + StdAlgorithmsTeamRemoveCopy + StdAlgorithmsTeamRemoveCopyIf +) list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) endforeach() @@ -247,13 +205,9 @@ endforeach() # std team G # ------------------------------------------ set(STDALGO_TEAM_SOURCES_G) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMove - StdAlgorithmsTeamMoveBackward - StdAlgorithmsTeamShiftLeft - StdAlgorithmsTeamShiftRight - ) +foreach(Name StdAlgorithmsCommon 
StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft + StdAlgorithmsTeamShiftRight +) list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) endforeach() @@ -261,13 +215,9 @@ endforeach() # std team F # ------------------------------------------ set(STDALGO_TEAM_SOURCES_F) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamReverse - StdAlgorithmsTeamReverseCopy - StdAlgorithmsTeamRotate - StdAlgorithmsTeamRotateCopy - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate + StdAlgorithmsTeamRotateCopy +) list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) endforeach() @@ -275,15 +225,16 @@ endforeach() # std team E # ------------------------------------------ set(STDALGO_TEAM_SOURCES_E) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFill - StdAlgorithmsTeamFill_n - StdAlgorithmsTeamReplace - StdAlgorithmsTeamReplaceIf - StdAlgorithmsTeamReplaceCopy - StdAlgorithmsTeamReplaceCopyIf - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFill + StdAlgorithmsTeamFill_n + StdAlgorithmsTeamReplace + StdAlgorithmsTeamReplaceIf + StdAlgorithmsTeamReplaceCopy + StdAlgorithmsTeamReplaceCopyIf +) list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) endforeach() @@ -291,12 +242,7 @@ endforeach() # std team D # ------------------------------------------ set(STDALGO_TEAM_SOURCES_D) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMinElement - StdAlgorithmsTeamMaxElement - StdAlgorithmsTeamMinMaxElement - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement) list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) endforeach() @@ -304,16 +250,17 @@ endforeach() # std team C # ------------------------------------------ set(STDALGO_TEAM_SOURCES_C) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFind - StdAlgorithmsTeamFindIf - StdAlgorithmsTeamFindIfNot - StdAlgorithmsTeamAllOf - StdAlgorithmsTeamAnyOf - 
StdAlgorithmsTeamNoneOf - StdAlgorithmsTeamSearchN - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFind + StdAlgorithmsTeamFindIf + StdAlgorithmsTeamFindIfNot + StdAlgorithmsTeamAllOf + StdAlgorithmsTeamAnyOf + StdAlgorithmsTeamNoneOf + StdAlgorithmsTeamSearchN +) list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) endforeach() @@ -321,13 +268,9 @@ endforeach() # std team B # ------------------------------------------ set(STDALGO_TEAM_SOURCES_B) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamEqual - StdAlgorithmsTeamSearch - StdAlgorithmsTeamFindEnd - StdAlgorithmsTeamFindFirstOf - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd + StdAlgorithmsTeamFindFirstOf +) list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) endforeach() @@ -335,34 +278,33 @@ endforeach() # std team A # ------------------------------------------ set(STDALGO_TEAM_SOURCES_A) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamAdjacentFind - StdAlgorithmsTeamCount - StdAlgorithmsTeamCountIf - StdAlgorithmsTeamForEach - StdAlgorithmsTeamForEachN - StdAlgorithmsTeamLexicographicalCompare - StdAlgorithmsTeamMismatch - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamAdjacentFind + StdAlgorithmsTeamCount + StdAlgorithmsTeamCountIf + StdAlgorithmsTeamForEach + StdAlgorithmsTeamForEachN + StdAlgorithmsTeamLexicographicalCompare + StdAlgorithmsTeamMismatch +) list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. 
-if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - list(REMOVE_ITEM ALGO_SORT_SOURCES - TestSort.cpp - ) +if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION + VERSION_GREATER_EQUAL 16.0.0 +) + list(REMOVE_ITEM ALGO_SORT_SOURCES TestSort.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget because in these cases # the impl needs to use either Kokkos or tailored reducers # which results in runtime memory errors. if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L - TestStdAlgorithmsTeamIsPartitioned.cpp - TestStdAlgorithmsTeamPartitionPoint.cpp - TestStdAlgorithmsTeamPartitionCopy.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L TestStdAlgorithmsTeamIsPartitioned.cpp + TestStdAlgorithmsTeamPartitionPoint.cpp TestStdAlgorithmsTeamPartitionCopy.cpp ) endif() @@ -370,7 +312,9 @@ endif() # in these cases the impl needs to use either Kokkos or # tailored reducers which results in runtime memory errors. 
if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_C + list( + REMOVE_ITEM + STDALGO_TEAM_SOURCES_C TestStdAlgorithmsTeamFind.cpp TestStdAlgorithmsTeamFindIf.cpp TestStdAlgorithmsTeamFindIfNot.cpp @@ -386,35 +330,20 @@ endif() # FRIZZI: 04/26/2023: not sure if the compilation error is still applicable # but we conservatively leave this guard on if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Sort - SOURCES - UnitTestMain.cpp - TestStdAlgorithmsCommon.cpp - ${ALGO_SORT_SOURCES} + kokkos_add_executable_and_test( + UnitTest_Sort SOURCES UnitTestMain.cpp TestStdAlgorithmsCommon.cpp ${ALGO_SORT_SOURCES} ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Random - SOURCES - UnitTestMain.cpp - ${ALGO_RANDOM_SOURCES} - ) + kokkos_add_executable_and_test(UnitTest_Random SOURCES UnitTestMain.cpp ${ALGO_RANDOM_SOURCES}) endif() # FIXME_OPENMPTARGET: These tests cause internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM) - list(REMOVE_ITEM STDALGO_SOURCES_D - TestStdAlgorithmsCopyIf.cpp - TestStdAlgorithmsRemoveCopy.cpp - TestStdAlgorithmsUnique.cpp - TestStdAlgorithmsUniqueCopy.cpp - ) - list(REMOVE_ITEM STDALGO_SOURCES_E - TestStdAlgorithmsExclusiveScan.cpp - TestStdAlgorithmsInclusiveScan.cpp + list(REMOVE_ITEM STDALGO_SOURCES_D TestStdAlgorithmsCopyIf.cpp TestStdAlgorithmsRemoveCopy.cpp + TestStdAlgorithmsUnique.cpp TestStdAlgorithmsUniqueCopy.cpp ) + list(REMOVE_ITEM STDALGO_SOURCES_E TestStdAlgorithmsExclusiveScan.cpp TestStdAlgorithmsInclusiveScan.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget @@ -422,48 +351,31 @@ endif() if(KOKKOS_ENABLE_OPENMPTARGET) # the following use either Kokkos or tailored reducers # which results in runtime memory errors. 
- list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B - TestStdAlgorithmsTeamFindEnd.cpp - TestStdAlgorithmsTeamFindFirstOf.cpp - TestStdAlgorithmsTeamSearch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B TestStdAlgorithmsTeamFindEnd.cpp TestStdAlgorithmsTeamFindFirstOf.cpp + TestStdAlgorithmsTeamSearch.cpp ) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A - TestStdAlgorithmsTeamAdjacentFind.cpp - TestStdAlgorithmsTeamLexicographicalCompare.cpp - TestStdAlgorithmsTeamMismatch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A TestStdAlgorithmsTeamAdjacentFind.cpp + TestStdAlgorithmsTeamLexicographicalCompare.cpp TestStdAlgorithmsTeamMismatch.cpp ) # this causes an illegal memory access if team_members_have_matching_result # is called - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M - TestStdAlgorithmsTeamTransformBinaryOp.cpp - ) + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M TestStdAlgorithmsTeamTransformBinaryOp.cpp) endif() foreach(ID A;B;C;D;E) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_SOURCES_${ID}} - ) + kokkos_add_executable_and_test(AlgorithmsUnitTest_StdSet_${ID} SOURCES UnitTestMain.cpp ${STDALGO_SOURCES_${ID}}) endforeach() foreach(ID A;B;C;D;E;F;G;H;I;L;M;P;Q) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_Team_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_TEAM_SOURCES_${ID}} - ) + kokkos_add_executable_and_test( + AlgorithmsUnitTest_StdSet_Team_${ID} SOURCES UnitTestMain.cpp ${STDALGO_TEAM_SOURCES_${ID}} + ) endforeach() # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. 
if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE( - AlgorithmsUnitTest_StdAlgoCompileOnly - SOURCES TestStdAlgorithmsCompileOnly.cpp - ) + kokkos_add_executable(AlgorithmsUnitTest_StdAlgoCompileOnly SOURCES TestStdAlgorithmsCompileOnly.cpp) endif() diff --git a/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp b/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp index dd3569e671..bb074f2480 100644 --- a/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp @@ -31,13 +31,13 @@ struct bin3d_is_sorted_struct { using value_type = unsigned int; using execution_space = ExecutionSpace; - Kokkos::View keys; + Kokkos::View keys; int max_bins; Scalar min; Scalar max; - bin3d_is_sorted_struct(Kokkos::View keys_, + bin3d_is_sorted_struct(Kokkos::View keys_, int max_bins_, Scalar min_, Scalar max_) : keys(keys_), max_bins(max_bins_), min(min_), max(max_) {} KOKKOS_INLINE_FUNCTION @@ -65,9 +65,9 @@ struct sum3D { using value_type = double; using execution_space = ExecutionSpace; - Kokkos::View keys; + Kokkos::View keys; - sum3D(Kokkos::View keys_) : keys(keys_) {} + sum3D(Kokkos::View keys_) : keys(keys_) {} KOKKOS_INLINE_FUNCTION void operator()(int i, double& count) const { count += keys(i, 0); @@ -77,8 +77,8 @@ struct sum3D { }; template -void test_3D_sort_impl(unsigned int n) { - using KeyViewType = Kokkos::View; +void test_3D_sort_impl(size_t n) { + using KeyViewType = Kokkos::View; KeyViewType keys("Keys", n * n * n); @@ -207,7 +207,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max::value, - Kokkos::Experimental::finite_min::value}; + Kokkos::Experimental::finite_min::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View(a)); Kokkos::sort(vd); @@ -219,6 +219,10 @@ void test_sort_integer_overflow() { } // 
namespace BinSortSetA TEST(TEST_CATEGORY, BinSortGenericTests) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -246,11 +250,11 @@ TEST(TEST_CATEGORY, BinSortEmptyView) { // does not matter if we use int or something else Kokkos::View v("v", 0); - // test all exposed public sort methods - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Sorter.sort(v)); + // test all exposed public sort methods are callable and do not throw + Sorter.sort(ExecutionSpace(), v, 0, 0); + Sorter.sort(v, 0, 0); + Sorter.sort(ExecutionSpace(), v); + Sorter.sort(v); } TEST(TEST_CATEGORY, BinSortEmptyKeysView) { @@ -263,7 +267,26 @@ TEST(TEST_CATEGORY, BinSortEmptyKeysView) { BinOp_t binOp(5, 0, 10); Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp); - ASSERT_NO_THROW(Sorter.create_permute_vector(ExecutionSpace{})); + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw +} + +// BinSort may delegate sorting within bins to std::sort when running on host +// and having a sufficiently large number of items within a single bin (10 by +// default). Test that this is done without undefined behavior when accessing +// the boundaries of the bin. Should be used in conjunction with a memory +// sanitizer or bounds check. 
+TEST(TEST_CATEGORY, BinSort_issue_7221) { + using ExecutionSpace = TEST_EXECSPACE; + + using KeyViewType = Kokkos::View; + KeyViewType kv("kv", 11); + + using BinOp_t = Kokkos::BinOp1D; + BinOp_t binOp(1, -10, 10); + Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp, + /*sort_within_bins*/ true); + + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw } } // namespace Test diff --git a/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp b/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp index a90224bf31..d11b53a9a6 100644 --- a/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp @@ -185,6 +185,10 @@ void run_for_rank2() { } // namespace BinSortSetB TEST(TEST_CATEGORY, BinSortUnsignedKeyLayoutStrideValues) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExeSpace = TEST_EXECSPACE; using key_type = unsigned; BinSortSetB::run_for_rank1(); diff --git a/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp b/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp index 1b7a3f48fc..cd57fd23ec 100644 --- a/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp @@ -386,6 +386,11 @@ void test_nested_sort_by_key(unsigned int N, KeyType minKey, KeyType maxKey, } // namespace NestedSortImpl TEST(TEST_CATEGORY, NestedSort) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; NestedSortImpl::test_nested_sort(171, 0U, UINT_MAX); NestedSortImpl::test_nested_sort(42, -1e6f, 1e6f); @@ -394,6 +399,11 @@ TEST(TEST_CATEGORY, NestedSort) { } TEST(TEST_CATEGORY, NestedSortByKey) { + 
// FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; // Second/third template arguments are key and value respectively. diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index 472af1403b..6960b912d0 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -542,6 +542,11 @@ void test_duplicate_stream() { } // namespace AlgoRandomImpl TEST(TEST_CATEGORY, Random_XorShift64) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ @@ -562,6 +567,10 @@ TEST(TEST_CATEGORY, Random_XorShift64) { TEST(TEST_CATEGORY, Random_XorShift1024_0) { using ExecutionSpace = TEST_EXECSPACE; + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ defined(KOKKOS_ENABLE_HIP) @@ -589,7 +598,7 @@ TEST(TEST_CATEGORY, Multi_streams) { #endif #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { GTEST_SKIP() << "Failing on NVIDIA GPUs"; // FIXME_SYCL } #endif diff --git a/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp b/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp index 7d484136b6..5ab348cb19 100644 --- 
a/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct random_access_iterator_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor(m_static_view)); @@ -264,6 +264,37 @@ TEST_F(random_access_iterator_test, traits_helpers) { static_assert(KE::Impl::are_iterators_v); static_assert(KE::Impl::are_random_access_iterators_v); static_assert(!KE::Impl::are_iterators_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + static_assert( + std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); } } // namespace stdalgos diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp index 968fb8950b..5ea88ae5d6 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp @@ -197,7 +197,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max::value, - Kokkos::Experimental::finite_min::value}; + Kokkos::Experimental::finite_min::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View(a)); Kokkos::sort(vd); @@ -209,6 +209,10 @@ void test_sort_integer_overflow() { } // namespace SortImpl TEST(TEST_CATEGORY, SortUnsignedValueType) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with 
OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -224,14 +228,19 @@ TEST(TEST_CATEGORY, SortUnsignedValueType) { } TEST(TEST_CATEGORY, SortEmptyView) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else Kokkos::View v("v", 0); + // checking that it does not throw // TODO check the synchronous behavior of the calls below - ASSERT_NO_THROW(Kokkos::sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Kokkos::sort(v)); + Kokkos::sort(ExecutionSpace(), v); + Kokkos::sort(v); } } // namespace Test diff --git a/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp index 9e5bd4a574..44abe4e73a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp @@ -83,8 +83,8 @@ TEST(TEST_CATEGORY, SortByKeyEmptyView) { Kokkos::View keys("keys", 0); Kokkos::View values("values", 0); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); } // Test #7036 @@ -95,8 +95,8 @@ TEST(TEST_CATEGORY, SortByKeyEmptyViewHost) { Kokkos::View keys("keys", 0); Kokkos::View values("values", 0); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); } TEST(TEST_CATEGORY, SortByKey) { @@ -183,12 +183,12 @@ TEST(TEST_CATEGORY, SortByKeyStaticExtents) { Kokkos::View keys("keys"); Kokkos::View values_static("values_static"); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(space, keys, 
values_static)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_static); Kokkos::View values_dynamic("values_dynamic", 10); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(space, keys, values_dynamic)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic); } template @@ -234,7 +234,9 @@ TEST(TEST_CATEGORY, SortByKeyWithStrides) { ASSERT_EQ(sort_fails, 0u); } -TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) { +TEST(TEST_CATEGORY_DEATH, SortByKeyKeysLargerThanValues) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index 75ad533f6e..208b46b15f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -96,7 +96,7 @@ void fill_view(DestViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, aux_v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp index fa4ff48dbe..d8b80675c9 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp @@ -173,7 +173,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -243,7 +243,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... 
args) { { auto res_it = KE::adjacent_find(exespace(), KE::cbegin(view), - KE::cend(view), args...); + KE::cend(view), args...); const auto my_diff = res_it - KE::cbegin(view); verify(my_diff, view, args...); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 67052e2f9d..dadce2d474 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -534,10 +534,10 @@ void fill_views_inc(ViewType view, ViewHostType host_view) { } template -std::enable_if_t::value> +std::enable_if_t> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of view and reference value"); auto view_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); for (std::size_t i = 0; i < view_h.extent(0); i++) { @@ -546,10 +546,10 @@ verify_values(ValueType expected, const ViewType view) { } template -std::enable_if_t::value> +std::enable_if_t> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of view and reference value"); using non_strided_view_t = Kokkos::View; @@ -566,11 +566,11 @@ verify_values(ValueType expected, const ViewType view) { } template -std::enable_if_t::value> +std::enable_if_t> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of expected and actual view"); auto expected_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), expected); @@ -583,11 +583,11 @@ compare_views(ViewType1 expected, const ViewType2 actual) { } template -std::enable_if_t::value> +std::enable_if_t> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same::value, + 
static_assert(std::is_same_v, "Non-matching value types of expected and actual view"); using non_strided_view_t = Kokkos::View; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 2a4525a8c3..923ea970f9 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -81,7 +81,7 @@ TEST(std_algorithms, is_admissible_to_std_algorithms) { strided_view_3d_t>::value); } -TEST(std_algorithms, expect_no_overlap) { +TEST(std_algorithms_DeathTest, expect_no_overlap) { namespace KE = Kokkos::Experimental; using value_type = double; @@ -104,6 +104,8 @@ TEST(std_algorithms, expect_no_overlap) { // Overlapping because iterators are identical #if defined(KOKKOS_ENABLE_DEBUG) + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + auto first_s = KE::begin(static_view_1d); auto last_s = first_s + extent0; EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s, last_s, first_s); }, @@ -148,8 +150,7 @@ TEST(std_algorithms, expect_no_overlap) { auto last_st0 = first_st0 + strided_view_1d_0.extent(0); auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15) // Does not overlap since offset (=3) is not divisible by stride (=2) - EXPECT_NO_THROW( - { KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); }); + KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); // Iterating over the same range without overlapping Kokkos::View static_view_2d{ @@ -160,9 +161,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_s0 = sub_first_s0 + sub_static_view_1d_0.extent(0); auto sub_first_s1 = KE::begin(sub_static_view_1d_1); // 1, 3, 5, ... 
- EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); - }); + KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); Kokkos::View dynamic_view_2d{ "std-algo-test-2d-contiguous-view-dynamic", 2, extent0}; @@ -172,9 +171,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_d0 = sub_first_d0 + sub_dynamic_view_1d_0.extent(0); auto sub_first_d1 = KE::begin(sub_dynamic_view_1d_1); // 1, 3, 5, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); - }); + KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); Kokkos::LayoutStride layout2d{2, 3, extent0, 2 * 3}; Kokkos::View strided_view_2d{ @@ -185,9 +182,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_st0 = sub_first_st0 + sub_strided_view_1d_0.extent(0); auto sub_first_st1 = KE::begin(sub_strided_view_1d_1); // 1, 7, 13, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); - }); + KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); } } // namespace stdalgos diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp index 5778e37be0..7c9e8f84bf 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp @@ -107,7 +107,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name, } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -202,7 +202,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } @@ -224,7 +224,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto 
rit = KE::copy_if(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } @@ -233,7 +233,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index b364c53a88..a85e63fe34 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp index 793b98a67f..b24730ff00 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp @@ -55,7 +55,6 @@ void test_for_each(const ViewType view) { std::for_each(KE::begin(expected), KE::end(expected), non_mod_functor); compare_views(expected, view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) const auto mod_lambda = KOKKOS_LAMBDA(value_t & i) { ++i; }; // pass view, lambda takes non-const ref @@ -79,7 +78,6 @@ void test_for_each(const ViewType view) { KE::for_each(exespace(), KE::cbegin(view), KE::cend(view), non_mod_lambda); 
std::for_each(KE::cbegin(expected), KE::cend(expected), non_mod_lambda); compare_views(expected, view); -#endif } // std::for_each_n is C++17, so we cannot compare results directly diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp index 8dbd6cd7e3..2b3361743e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp @@ -104,7 +104,7 @@ struct AssignIndexFunctor { template struct IsEvenFunctor { - static_assert(std::is_integral::value, + static_assert(std::is_integral_v, "IsEvenFunctor uses operator%, so ValueType must be int"); KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index a08a737210..b4f40b4651 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index 75d4f0afeb..18928a3526 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -122,7 +122,8 @@ bool compute_gold(const std::string& name) { } else if (name == "large-b") { return false; } else { - throw std::runtime_error("invalid choice"); + 
Kokkos::abort("invalid choice"); + return false; // unreachable } } @@ -154,7 +155,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[0] = KE::is_sorted(exespace(), KE::cbegin(view), KE::cend(view), comp); resultsB[1] = KE::is_sorted("label", exespace(), KE::cbegin(view), - KE::cend(view), comp); + KE::cend(view), comp); resultsB[2] = KE::is_sorted(exespace(), view, comp); resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index 29ac7cc9bc..8327bfe13c 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -123,7 +123,8 @@ auto compute_gold(ViewType view, const std::string& name) { } else if (name == "large-b") { return KE::begin(view) + 156; } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); + return KE::end(view); // unreachable } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp index f3b3e269c4..df5df756d2 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp @@ -86,7 +86,7 @@ void run_single_scenario(ViewType view1, ViewType view2, v2_h(ext2 / 2) = -5; } } else { - throw std::runtime_error("Kokkos: stdalgo: test: mismatch: Invalid string"); + FAIL() << "Kokkos: stdalgo: test: mismatch: Invalid string"; } Kokkos::deep_copy(aux_view1, v1_h); diff --git 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 1b1a02f39c..6918185bc0 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value); + static_assert(std::is_rvalue_reference_v); // move constr MyMovableType b(std::move(a)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp index f80f30797e..42a17d7377 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct std_algorithms_mod_seq_ops_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor(m_static_view)); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp index b201ab95c1..88e2a68ff1 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp @@ -56,7 +56,7 @@ void run_single_scenario(const InfoType& scenario_info, int apiId) { ASSERT_EQ(dist, 5); } else if (apiId == 1) { auto rit = KE::move_backward("mylabel", exespace(), KE::begin(v), - KE::end(v), KE::end(v2)); + KE::end(v), KE::end(v2)); const int dist = KE::distance(KE::begin(v2), rit); ASSERT_EQ(dist, 5); } else if (apiId == 2) { diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index 
a36c9db2b9..e47cacdd7d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -95,7 +95,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -110,9 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value); + std::is_same_v); static_assert( - std::is_same::value); + std::is_same_v); const std::size_t ext = view_from.extent(0); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp index c35fc5c24b..f897e9b657 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp @@ -99,7 +99,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -147,7 +147,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove(exespace(), KE::begin(view), KE::end(view), - (ValueType)match_value); + (ValueType)match_value); verify_data(data_h, view, rit); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp index 3d7c52108b..3137880ea8 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw 
std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp index cb699aa923..d88ab5473d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp index f06f2234ee..e42788799e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -144,7 +144,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if(exespace(), KE::begin(view), KE::end(view), - remove_if_even); + remove_if_even); verify_data(data_h, view, rit, remove_if_even); } @@ -154,7 +154,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if("label", exespace(), KE::begin(view), - KE::end(view), remove_if_even); + KE::end(view), remove_if_even); verify_data(data_h, view, rit, remove_if_even); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp index a22ab32d76..4596726cf3 100644 --- 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -153,7 +153,7 @@ void verify_data(const std::string& name, ViewType1 test_view, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp index a964ec8e17..b18c859af5 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp index ceeba88971..82f859bac1 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp 
b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp index 802c0093c5..5ae2ff4278 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp @@ -96,7 +96,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp index 6e6ca72783..3c934d6485 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp @@ -62,7 +62,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp index 5638cbee4a..bf5c2ee782 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp index d0caca7cea..1a860c58ce 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ 
-175,7 +175,7 @@ void run_single_scenario(const InfoType& scenario_info, create_view(Tag{}, view_ext, "rotate_copy_dest"); auto n_it = KE::cbegin(view_from) + rotation_point; auto rit = KE::rotate_copy(exespace(), KE::cbegin(view_from), n_it, - KE::cend(view_from), KE::begin(view_dest)); + KE::cend(view_from), KE::begin(view_dest)); verify_data(view_from, view_dest, rotation_point); ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp index 021609c444..195f88a0b7 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp @@ -256,7 +256,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, { auto myrit = KE::search(exespace(), KE::cbegin(view), KE::cend(view), - KE::cbegin(s_view), KE::cend(s_view), args...); + KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); ASSERT_EQ(mydiff, stddiff); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp index 53ad8daa2e..79d88bec23 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp @@ -154,7 +154,7 @@ void fill_view(ViewType dest_view, ValueType value, std::size_t count, } else { - throw std::runtime_error("Kokkos: test: search_n: this should not happen"); + FAIL() << "Kokkos: test: search_n: this should not happen"; } Kokkos::deep_copy(aux_view, v_h); @@ -208,7 +208,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t count, { auto myrit = KE::search_n("label", exespace(), KE::cbegin(view), - KE::cend(view), count, value, args...); + KE::cend(view), count, value, args...); const auto mydiff = 
myrit - KE::cbegin(view); ASSERT_EQ(mydiff, stddiff); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp index 0b5fe9216e..12835d5a2f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp @@ -150,7 +150,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_left or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_left("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp index 8e4ae94375..3e350cf3b3 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp @@ -141,7 +141,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right(exespace(), KE::begin(view), KE::end(view), - shift_value); + shift_value); verify_data(rit, view, view_h, shift_value); } @@ -152,7 +152,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp index c388cadc9b..5a2c046939 100644 --- 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp @@ -62,8 +62,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest)); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -73,8 +73,8 @@ struct TestFunctorA { case 1: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_binaryOp); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_binaryOp); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp index e24ac37bf0..071ecd5a9a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index 7c3c465dc8..3f83ac7404 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -144,7 
+144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate); + KE::begin(rowDest), predicate); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp index 7cbc788f8e..9b509af55b 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp @@ -53,7 +53,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy_n(member, KE::begin(myRowViewFrom), m_copyCount, - KE::begin(myRowViewDest)); + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp index 922424afbd..38df5c30ce 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp @@ -111,7 +111,7 @@ void test_A(const bool searched_value_exist, std::size_t numTeams, using rand_pool = Kokkos::Random_XorShift64_Pool; - rand_pool pool(lowerBound * upperBound); + rand_pool pool(static_cast(lowerBound) * upperBound); if (searched_value_exist) { Kokkos::View randomIndices( diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index 7cb9851087..0c35c5e599 100644 --- 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::exclusive_scan(member, KE::cbegin(rowViewSrc), - KE::cend(rowViewSrc), - KE::begin(rowViewDest), initVal); + KE::cend(rowViewSrc), + KE::begin(rowViewDest), initVal); resultDist = KE::distance(KE::begin(rowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp index 430e4917e0..88c5e21f31 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp @@ -51,7 +51,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), searchedValue); + KE::cend(myRowViewFrom), searchedValue); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp index 83eca33569..d350bc62cd 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp @@ -86,9 +86,9 @@ struct TestFunctorA { case 2: { auto it = KE::find_end(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::cbegin(myRowSearchedSeqView), - KE::cend(myRowSearchedSeqView), m_binaryPred); + KE::cend(myRowViewFrom), + KE::cbegin(myRowSearchedSeqView), + KE::cend(myRowSearchedSeqView), m_binaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, 
*this]() { m_distancesView(myRowIndex) = resultDist; @@ -99,7 +99,7 @@ struct TestFunctorA { case 3: { auto it = KE::find_end(member, myRowViewFrom, myRowSearchedSeqView, - m_binaryPred); + m_binaryPred); resultDist = KE::distance(KE::begin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp index ee4bbed7a3..70f2be77f6 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp index b9448c1a3e..873e8faf4c 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if_not(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp index 4b66dd9131..265cdf4746 100644 --- 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::generate_n(member, myRowView, m_count, - GenerateFunctor()); + GenerateFunctor()); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp index 850e80dde1..f76a595b3f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp @@ -62,7 +62,7 @@ struct TestFunctorA { } else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; result = KE::is_sorted(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_returnsView(myRowIndex) = result; }); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp index e3b95527c7..5bc49e4600 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp @@ -61,7 +61,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView)); + KE::cend(myRowView)); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -77,8 +77,8 @@ struct TestFunctorA { else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView), - 
CustomLessThanComparator{}); + KE::cend(myRowView), + CustomLessThanComparator{}); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -88,7 +88,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -210,7 +210,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::is_sorted_until(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp index 283525dbd1..452a48df21 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::max_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::max_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = 
KE::distance(KE::cbegin(myRow), it); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp index 8579b48315..2c79370b92 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::min_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::min_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp index 51010fdff5..25a4487855 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp @@ -84,7 +84,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto itPair = KE::minmax_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist1 = KE::distance(KE::begin(myRowView), itPair.first); resultDist2 = KE::distance(KE::begin(myRowView), itPair.second); @@ -160,7 +160,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } else { auto 
itPair = std::minmax_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance[0] = KE::distance(KE::cbegin(myRow), itPair.first); stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp index 1122d6d554..2c445dacf8 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::move(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp index fb9c70391b..2defa1dc6f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::remove(member, KE::begin(myRowView), KE::end(myRowView), - m_targetValue); + m_targetValue); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 6bb0d24998..71a50e39e3 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it 
= KE::remove_copy(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_targetValue); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_targetValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index cff9aa178a..d5b5304f63 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -65,8 +65,8 @@ struct TestFunctorA { GreaterThanValueFunctor predicate(m_threshold); if (m_apiPick == 0) { auto it = KE::remove_copy_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), predicate); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), predicate); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp index 70dbf10574..64f172e401 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp @@ -78,7 +78,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy(member, myRowViewFrom, myRowViewDest, - m_targetValue, m_newValue); + m_targetValue, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -172,7 +172,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = Kokkos::subview(sourceView_dc_h, i, 
Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), targetVal, newVal); + KE::begin(rowDest), targetVal, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp index d0217aed7a..9c3699320d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp @@ -76,7 +76,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy_if(member, myRowViewFrom, myRowViewDest, - predicate, m_newValue); + predicate, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -151,7 +151,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate, newVal); + KE::begin(rowDest), predicate, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp index e865b998f6..51f600faba 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp @@ -136,7 +136,7 @@ void test_A(std::size_t 
numTeams, std::size_t numCols, std::size_t pivotShift, auto pivot = KE::cbegin(myRowFrom) + pivotShift; auto it = std::rotate_copy(KE::cbegin(myRowFrom), pivot, - KE::cend(myRowFrom), KE::begin(myRowDest)); + KE::cend(myRowFrom), KE::begin(myRowDest)); const std::size_t stdDistance = KE::distance(KE::begin(myRowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp index 00a80c5ef0..08ff8fbbca 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp @@ -47,7 +47,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::shift_right(member, KE::begin(myRowView), - KE::end(myRowView), m_shift); + KE::end(myRowView), m_shift); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp index 5fc9612caa..60cb3f0837 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp @@ -49,7 +49,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::swap_ranges(member, KE::begin(myRowView1), - KE::end(myRowView1), KE::begin(myRowView2)); + KE::end(myRowView1), KE::begin(myRowView2)); resultDist = KE::distance(KE::begin(myRowView2), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 0b0d798fd8..78a21c4430 100644 
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -91,7 +91,7 @@ struct TestFunctorA { case 1: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp); + m_binaryOp, m_unaryOp); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); @@ -111,7 +111,7 @@ struct TestFunctorA { case 3: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp, initVal); + m_binaryOp, m_unaryOp, initVal); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp index c46146e0a8..cef0f7c13d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp @@ -58,7 +58,7 @@ struct TestFunctorA { } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::unique(member, KE::begin(myRowView), KE::end(myRowView), - CustomEqualityComparator{}); + CustomEqualityComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::begin(myRow), it); } else { auto it = std::unique(KE::begin(myRow), KE::end(myRow), - CustomEqualityComparator{}); + CustomEqualityComparator{}); stdDistance = KE::distance(KE::begin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp 
b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 0d3289e196..89ea8154c7 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -72,8 +72,8 @@ struct TestFunctorA { using comparator_t = CustomEqualityComparator; auto it = KE::unique_copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), - KE::begin(myRowViewDest), comparator_t()); + KE::end(myRowViewFrom), + KE::begin(myRowViewDest), comparator_t()); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -159,12 +159,12 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { std::size_t stdDistance = 0; if (apiId <= 1) { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest)); + KE::begin(myRowDest)); stdDistance = KE::distance(KE::begin(myRowDest), it); } else { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest), - CustomEqualityComparator{}); + KE::begin(myRowDest), + CustomEqualityComparator{}); stdDistance = KE::distance(KE::begin(myRowDest), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index fa2804256a..365ca21688 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -115,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -161,7 +161,7 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), 
test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - if (std::is_same::value) { + if (std::is_same_v) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index fb81ae91b0..cc87262147 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -115,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -173,7 +173,7 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - if (std::is_same::value) { + if (std::is_same_v) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp index 9c5ae0cf8a..6ee93e3d5f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp @@ -138,7 +138,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp index 3cf43ad4db..e3e9696458 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp +++ 
b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp @@ -146,7 +146,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -235,7 +235,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp b/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp index c05006a161..0044b93558 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp @@ -72,7 +72,7 @@ auto create_host_view_with_reduction_order_indices( result(8) = 7; result(9) = 5; } else { - throw std::runtime_error("test: Invalid enum"); + Kokkos::abort("test: Invalid enum"); } return result; @@ -80,7 +80,7 @@ auto create_host_view_with_reduction_order_indices( template auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "test is only enabled for HostSpace"); using view_value_type = typename ViewType::value_type; @@ -191,7 +191,7 @@ template void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, const ValuesPair gold_values, const IndexPair gold_locs) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "test is only enabled for HostSpace"); using view_value_type = typename ViewType::value_type; diff --git a/lib/kokkos/appveyor.yml b/lib/kokkos/appveyor.yml deleted file mode 100644 index d0a5645ef7..0000000000 --- a/lib/kokkos/appveyor.yml +++ /dev/null @@ -1,10 +0,0 @@ -image: - - Visual Studio 2019 -clone_folder: c:\projects\source -build_script: -- cmd: >- - mkdir build && - cd build && - cmake c:\projects\source -DKokkos_ENABLE_IMPL_MDSPAN=OFF -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 
/EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF && - cmake --build . --target install && - ctest -C Debug --output-on-failure diff --git a/lib/kokkos/benchmarks/CMakeLists.txt b/lib/kokkos/benchmarks/CMakeLists.txt index 529ef393d9..968c8ae3bf 100644 --- a/lib/kokkos/benchmarks/CMakeLists.txt +++ b/lib/kokkos/benchmarks/CMakeLists.txt @@ -1,12 +1,12 @@ #FIXME_OPENMPTARGET - compiling in debug mode causes ICE. -KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(view_copy_constructor) +kokkos_add_benchmark_directories(atomic) +kokkos_add_benchmark_directories(gather) +kokkos_add_benchmark_directories(gups) +kokkos_add_benchmark_directories(launch_latency) +kokkos_add_benchmark_directories(stream) +kokkos_add_benchmark_directories(view_copy_constructor) #FIXME_OPENMPTARGET - These two benchmarks cause ICE. Commenting them for now but a deeper analysis on the cause and a possible fix will follow. 
-IF(NOT Kokkos_ENABLE_OPENMPTARGET) - KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) - KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) -ENDIF() +if(NOT Kokkos_ENABLE_OPENMPTARGET) + kokkos_add_benchmark_directories(policy_performance) + kokkos_add_benchmark_directories(bytes_and_flops) +endif() diff --git a/lib/kokkos/benchmarks/atomic/CMakeLists.txt b/lib/kokkos/benchmarks/atomic/CMakeLists.txt index 85f7412f49..7fda2bf6f6 100644 --- a/lib/kokkos/benchmarks/atomic/CMakeLists.txt +++ b/lib/kokkos/benchmarks/atomic/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - atomic - SOURCES main.cpp -) +kokkos_add_executable(atomic SOURCES main.cpp) diff --git a/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt b/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt index 0ce44a6f1a..9c65d06ce2 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt +++ b/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt @@ -1,4 +1,9 @@ -KOKKOS_ADD_EXECUTABLE( +kokkos_add_executable( bytes_and_flops - SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp + SOURCES + bench_double.cpp + bench_float.cpp + bench_int32_t.cpp + bench_int64_t.cpp + main.cpp ) diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp index 78cfd48eff..762cc988f1 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -17,9 +17,9 @@ template struct Run { static void run(int N, int K, int R, int F, int T, int S, int Ba, int I) { - Kokkos::View A("A", N, K); - Kokkos::View B("B", N, K); - Kokkos::View C("C", N, K); + Kokkos::View A("A", N, K); + Kokkos::View B("B", N, K); + Kokkos::View C("C", N, K); Kokkos::deep_copy(A, Scalar(1.5)); Kokkos::deep_copy(B, Scalar(2.5)); diff --git a/lib/kokkos/benchmarks/gather/CMakeLists.txt b/lib/kokkos/benchmarks/gather/CMakeLists.txt index 
24c7062772..2de1ce85e6 100644 --- a/lib/kokkos/benchmarks/gather/CMakeLists.txt +++ b/lib/kokkos/benchmarks/gather/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gather - SOURCES main.cpp -) +kokkos_add_executable(gather SOURCES main.cpp) diff --git a/lib/kokkos/benchmarks/gups/CMakeLists.txt b/lib/kokkos/benchmarks/gups/CMakeLists.txt index 8de5b73cc6..dc70747029 100644 --- a/lib/kokkos/benchmarks/gups/CMakeLists.txt +++ b/lib/kokkos/benchmarks/gups/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gups - SOURCES gups.cpp -) +kokkos_add_executable(gups SOURCES gups.cpp) diff --git a/lib/kokkos/benchmarks/gups/gups.cpp b/lib/kokkos/benchmarks/gups/gups.cpp index 369052321d..e00f87968b 100644 --- a/lib/kokkos/benchmarks/gups/gups.cpp +++ b/lib/kokkos/benchmarks/gups/gups.cpp @@ -140,7 +140,7 @@ int run_benchmark(const Index indicesCount, const Index dataCount, break; } default: { - throw std::runtime_error("unexpected mode"); + Kokkos::abort("unexpected mode"); } } diff --git a/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt b/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt index bb14da749d..4775bf2261 100644 --- a/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt +++ b/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - launch_latency - SOURCES launch_latency.cpp -) +kokkos_add_executable(launch_latency SOURCES launch_latency.cpp) diff --git a/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp b/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp index 73b176ab8d..156c29af09 100644 --- a/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp +++ b/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp @@ -254,7 +254,7 @@ int main(int argc, char* argv[]) { else if (i == 3) K = atoi(arg.data()); else { - throw std::runtime_error("unexpected argument!"); + Kokkos::abort("unexpected argument!"); } } else if (arg == "--no-parallel-for") { opts.par_for = false; @@ -265,7 +265,7 @@ int main(int 
argc, char* argv[]) { } else { std::stringstream ss; ss << "unexpected argument \"" << arg << "\" at position " << i; - throw std::runtime_error(ss.str()); + Kokkos::abort(ss.str().c_str()); } } diff --git a/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt b/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt index 929b9c9702..4a939775c0 100644 --- a/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt +++ b/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - policy_performance - SOURCES main.cpp -) +kokkos_add_executable(policy_performance SOURCES main.cpp) diff --git a/lib/kokkos/benchmarks/stream/CMakeLists.txt b/lib/kokkos/benchmarks/stream/CMakeLists.txt index 0dded6e3a5..b096976c48 100644 --- a/lib/kokkos/benchmarks/stream/CMakeLists.txt +++ b/lib/kokkos/benchmarks/stream/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - stream - SOURCES stream-kokkos.cpp -) +kokkos_add_executable(stream SOURCES stream-kokkos.cpp) diff --git a/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt b/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt index 50a331b2b3..f7bbc13b6e 100644 --- a/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt +++ b/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - view_copy_constructor - SOURCES view_copy_constructor.cpp -) +kokkos_add_executable(view_copy_constructor SOURCES view_copy_constructor.cpp) diff --git a/lib/kokkos/bin/kokkos_launch_compiler b/lib/kokkos/bin/kokkos_launch_compiler index d1f8896f91..ee3c29e96d 100755 --- a/lib/kokkos/bin/kokkos_launch_compiler +++ b/lib/kokkos/bin/kokkos_launch_compiler @@ -62,7 +62,7 @@ KOKKOS_COMPILER=${1} shift # store the expected C++ compiler -CXX_COMPILER=${1} +CXX_COMPILER=$(which "${1}") # remove the expected C++ compiler from the arguments shift @@ -84,7 +84,7 @@ shift # kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o # results in this 
command being executed: # ${KOKKOS_COMPILER} -c file.cpp -o file.o -if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then +if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != $(which "${1}") ]]; then debug-message "$@" # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER} exec "$@" diff --git a/lib/kokkos/cmake/Dependencies.cmake b/lib/kokkos/cmake/Dependencies.cmake index fb1e73b579..2f70c2f038 100644 --- a/lib/kokkos/cmake/Dependencies.cmake +++ b/lib/kokkos/cmake/Dependencies.cmake @@ -1,5 +1,3 @@ -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - ) +tribits_package_define_dependencies(LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib) -TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) +tribits_tpl_tentatively_enable(DLlib) diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index 08f128f2d1..44f81bb8ce 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -24,7 +24,6 @@ #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX #cmakedefine KOKKOS_ENABLE_SYCL -#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED /* General Settings */ #cmakedefine KOKKOS_ENABLE_CXX17 @@ -40,7 +39,10 @@ #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS -#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#cmakedefine KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC +#cmakedefine KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE +#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED +#cmakedefine KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE #cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK @@ -80,6 +82,7 @@ #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 #cmakedefine 
KOKKOS_ARCH_RISCV_SG2042 +#cmakedefine KOKKOS_ARCH_RISCV_RVA22V #cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_INTEL_DG1 #cmakedefine KOKKOS_ARCH_INTEL_GEN9 @@ -118,10 +121,11 @@ #cmakedefine KOKKOS_ARCH_AMD_GFX90A #cmakedefine KOKKOS_ARCH_AMD_GFX940 #cmakedefine KOKKOS_ARCH_AMD_GFX942 +#cmakedefine KOKKOS_ARCH_AMD_GFX942_APU #cmakedefine KOKKOS_ARCH_AMD_GFX1030 #cmakedefine KOKKOS_ARCH_AMD_GFX1100 #cmakedefine KOKKOS_ARCH_AMD_GFX1103 -#cmakedefine KOKKOS_ARCH_AMD_GPU +#cmakedefine KOKKOS_ARCH_AMD_GPU "@KOKKOS_ARCH_AMD_GPU@" #cmakedefine KOKKOS_ARCH_VEGA // deprecated #cmakedefine KOKKOS_ARCH_VEGA906 // deprecated #cmakedefine KOKKOS_ARCH_VEGA908 // deprecated diff --git a/lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in b/lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in deleted file mode 100644 index 626ef5a8eb..0000000000 --- a/lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in +++ /dev/null @@ -1,17 +0,0 @@ -IF (NOT TARGET Kokkos::kokkos) - # Compute the installation prefix relative to this file. 
- get_filename_component(KOKKOS_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - if(KOKKOS_IMPORT_PREFIX STREQUAL "/") - set(KOKKOS_IMPORT_PREFIX "") - endif() - add_library(Kokkos::kokkos INTERFACE IMPORTED) - set_target_properties(Kokkos::kokkos PROPERTIES - INTERFACE_LINK_LIBRARIES "@Kokkos_LIBRARIES@;@KOKKOS_LINK_OPTIONS@" - INTERFACE_COMPILE_FEATURES "@KOKKOS_CXX_STANDARD_FEATURE@" - INTERFACE_COMPILE_OPTIONS "@KOKKOS_ALL_COMPILE_OPTIONS@" - INTERFACE_INCLUDE_DIRECTORIES "${KOKKOS_IMPORT_PREFIX}/include" - ) -ENDIF() diff --git a/lib/kokkos/cmake/Modules/CudaToolkit.cmake b/lib/kokkos/cmake/Modules/CudaToolkit.cmake index eda5541f7c..b8ac2048b5 100644 --- a/lib/kokkos/cmake/Modules/CudaToolkit.cmake +++ b/lib/kokkos/cmake/Modules/CudaToolkit.cmake @@ -483,38 +483,40 @@ endif() # Try language- or user-provided path first. if(CUDAToolkit_BIN_DIR) - find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${CUDAToolkit_BIN_DIR} NO_DEFAULT_PATH - ) + ) endif() # Search using CUDAToolkit_ROOT -find_program(CUDAToolkit_NVCC_EXECUTABLE +find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ENV CUDA_PATH PATH_SUFFIXES bin ) # If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error. -if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) +if(NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) # Declare error messages now, print later depending on find_package args. 
set(fail_base "Could not find nvcc executable in path specified by") set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") - if (CUDAToolkit_FIND_REQUIRED) - if (DEFINED CUDAToolkit_ROOT) + if(CUDAToolkit_FIND_REQUIRED) + if(DEFINED CUDAToolkit_ROOT) message(FATAL_ERROR ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(FATAL_ERROR ${env_cuda_root_fail}) endif() else() - if (NOT CUDAToolkit_FIND_QUIETLY) - if (DEFINED CUDAToolkit_ROOT) + if(NOT CUDAToolkit_FIND_QUIETLY) + if(DEFINED CUDAToolkit_ROOT) message(STATUS ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(STATUS ${env_cuda_root_fail}) endif() endif() @@ -535,9 +537,9 @@ endif() # We will also search the default symlink location /usr/local/cuda first since # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked # directory is the desired location. -if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (UNIX) - if (NOT APPLE) +if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(UNIX) + if(NOT APPLE) set(platform_base "/usr/local/cuda-") else() set(platform_base "/Developer/NVIDIA/CUDA-") @@ -550,10 +552,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) file(GLOB possible_paths "${platform_base}*") # Iterate the glob results and create a descending list. set(possible_versions) - foreach (p ${possible_paths}) + foreach(p ${possible_paths}) # Extract version number from end of string string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) - if (IS_DIRECTORY ${p} AND p_version) + if(IS_DIRECTORY ${p} AND p_version) list(APPEND possible_versions ${p_version}) endif() endforeach() @@ -563,10 +565,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # every possible version of CUDA installed, this wouldn't create any # significant overhead. 
set(versions) - foreach (v ${possible_versions}) + foreach(v ${possible_versions}) list(LENGTH versions num_versions) # First version, nothing to compare with so just append. - if (num_versions EQUAL 0) + if(num_versions EQUAL 0) list(APPEND versions ${v}) else() # Loop through list. Insert at an index when comparison is @@ -574,9 +576,9 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # happen since this came from a glob list of directories. set(i 0) set(early_terminate FALSE) - while (i LESS num_versions) + while(i LESS num_versions) list(GET versions ${i} curr) - if (v VERSION_GREATER curr) + if(v VERSION_GREATER curr) list(INSERT versions ${i} ${v}) set(early_terminate TRUE) break() @@ -584,7 +586,7 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) math(EXPR i "${i} + 1") endwhile() # If it did not get inserted, place it at the end. - if (NOT early_terminate) + if(NOT early_terminate) list(APPEND versions ${v}) endif() endif() @@ -592,17 +594,18 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # With a descending list of versions, populate possible paths to search. set(search_paths) - foreach (v ${versions}) + foreach(v ${versions}) list(APPEND search_paths "${platform_base}${v}") endforeach() # Force the global default /usr/local/cuda to the front on Unix. - if (UNIX) + if(UNIX) list(INSERT search_paths 0 "/usr/local/cuda") endif() # Now search for nvcc again using the platform default search paths. 
- find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${search_paths} PATH_SUFFIXES bin @@ -617,8 +620,8 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) unset(early_terminate) unset(search_paths) - if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (CUDAToolkit_FIND_REQUIRED) + if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(CUDAToolkit_FIND_REQUIRED) message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") elseif(NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") @@ -636,8 +639,7 @@ if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) unset(cuda_dir) endif() -if(CUDAToolkit_NVCC_EXECUTABLE AND - CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) +if(CUDAToolkit_NVCC_EXECUTABLE AND CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value # This if statement will always match, but is used to provide variables for MATCH 1,2,3... 
if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) @@ -648,39 +650,38 @@ if(CUDAToolkit_NVCC_EXECUTABLE AND endif() else() # Compute the version by invoking nvcc - execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") - set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") endif() unset(NVCC_OUT) endif() - get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) # Handle cross compilation if(CMAKE_CROSSCOMPILING) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") # Support for NVPACK - set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + set(CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") # Support for arm cross compilation set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") # Support for aarch64 cross compilation - if (ANDROID_ARCH_NAME STREQUAL "arm64") + if(ANDROID_ARCH_NAME STREQUAL "arm64") set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") else() set(CUDAToolkit_TARGET_NAME "aarch64-linux") - endif (ANDROID_ARCH_NAME STREQUAL "arm64") + endif(ANDROID_ARCH_NAME STREQUAL "arm64") elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - set(CUDAToolkit_TARGET_NAME "x86_64-linux") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") endif() - if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") # add 
known CUDA target root path to the set of directories we search for programs, libraries and headers list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") @@ -702,25 +703,16 @@ else() set(_CUDAToolkit_Pop_Prefix True) endif() - # Find the include/ directory -find_path(CUDAToolkit_INCLUDE_DIR - NAMES cuda_runtime.h -) +find_path(CUDAToolkit_INCLUDE_DIR NAMES cuda_runtime.h) # And find the CUDA Runtime Library libcudart -find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64 lib/x64 -) -if (NOT CUDA_CUDART) - find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64/stubs lib/x64/stubs - ) +find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64 lib/x64) +if(NOT CUDA_CUDART) + find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64/stubs lib/x64/stubs) endif() -if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) +if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Unable to find cudart library.") endif() @@ -733,24 +725,17 @@ endif() #----------------------------------------------------------------------------- # Perform version comparison and validate all required variables are set. 
include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(CUDAToolkit - REQUIRED_VARS - CUDAToolkit_INCLUDE_DIR - CUDA_CUDART - CUDAToolkit_NVCC_EXECUTABLE - VERSION_VAR - CUDAToolkit_VERSION +find_package_handle_standard_args( + CUDAToolkit REQUIRED_VARS CUDAToolkit_INCLUDE_DIR CUDA_CUDART CUDAToolkit_NVCC_EXECUTABLE + VERSION_VAR CUDAToolkit_VERSION ) -mark_as_advanced(CUDA_CUDART - CUDAToolkit_INCLUDE_DIR - CUDAToolkit_NVCC_EXECUTABLE - ) +mark_as_advanced(CUDA_CUDART CUDAToolkit_INCLUDE_DIR CUDAToolkit_NVCC_EXECUTABLE) #----------------------------------------------------------------------------- # Construct result variables if(CUDAToolkit_FOUND) - set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) - get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) + set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) endif() #----------------------------------------------------------------------------- @@ -762,27 +747,26 @@ if(CUDAToolkit_FOUND) set(search_names ${lib_name} ${arg_ALT}) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH - PATH_SUFFIXES nvidia/current lib64 lib/x64 lib - ${arg_EXTRA_PATH_SUFFIXES} + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib ${arg_EXTRA_PATH_SUFFIXES} ) # Don't try any stub directories intil we have exhausted all other # search locations. 
if(NOT CUDA_${lib_name}_LIBRARY) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs ) endif() mark_as_advanced(CUDA_${lib_name}_LIBRARY) - if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + if(NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") @@ -800,16 +784,15 @@ if(CUDAToolkit_FOUND) target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") endif() - _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + _cudatoolkit_find_and_add_import_lib(cuda_driver ALT cuda) - _CUDAToolkit_find_and_add_import_lib(cudart) - _CUDAToolkit_find_and_add_import_lib(cudart_static) + _cudatoolkit_find_and_add_import_lib(cudart) + _cudatoolkit_find_and_add_import_lib(cudart_static) # setup dependencies that are required for cudart_static when building # on linux. 
These are generally only required when using the CUDA toolkit # when CUDA language is disabled - if(NOT TARGET CUDA::cudart_static_deps - AND TARGET CUDA::cudart_static) + if(NOT TARGET CUDA::cudart_static_deps AND TARGET CUDA::cudart_static) add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) @@ -831,55 +814,64 @@ if(CUDAToolkit_FOUND) endif() endif() - _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library - foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + _cudatoolkit_find_and_add_import_lib(culibos) # it's a static library + foreach(cuda_lib cublas cufft curand cusparse nppc nvjpeg) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) endforeach() # cuFFTW depends on cuFFT - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft_static) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft_static) # cuSOLVER depends on cuBLAS, and cuSPARSE - _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) - _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) + _cudatoolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _cudatoolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) # nvGRAPH depends on cuRAND, and cuSOLVER. 
- _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) - _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + _cudatoolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _cudatoolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) # Process the majority of the NPP libraries. - foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + foreach( + cuda_lib + nppial + nppicc + nppidei + nppif + nppig + nppim + nppist + nppitc + npps + nppicom + nppisu + ) + _cudatoolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) endforeach() - _CUDAToolkit_find_and_add_import_lib(cupti - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(cupti_static - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti_static EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) + _cudatoolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) - _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + _cudatoolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) if(WIN32) # nvtools can be installed outside the CUDA toolkit directory # so prefer the NVTOOLSEXT_PATH windows only environment variable # In addition on windows the most common name is nvToolsExt64_1 - find_library(CUDA_nvToolsExt_LIBRARY + find_library( + CUDA_nvToolsExt_LIBRARY NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt - PATHS ENV NVTOOLSEXT_PATH - ENV CUDA_PATH + PATHS ENV NVTOOLSEXT_PATH ENV CUDA_PATH 
PATH_SUFFIXES lib/x64 lib ) endif() - _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + _cudatoolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) - _CUDAToolkit_find_and_add_import_lib(OpenCL) + _cudatoolkit_find_and_add_import_lib(OpenCL) endif() if(_CUDAToolkit_Pop_ROOT_PATH) diff --git a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake index 445f4e93a5..3a6a826197 100644 --- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -1,44 +1,40 @@ -IF (NOT CUDAToolkit_ROOT) - IF (NOT CUDA_ROOT) - SET(CUDA_ROOT $ENV{CUDA_ROOT}) - ENDIF() - IF(CUDA_ROOT) - SET(CUDAToolkit_ROOT ${CUDA_ROOT}) - ENDIF() -ENDIF() +if(NOT CUDAToolkit_ROOT) + if(NOT CUDA_ROOT) + set(CUDA_ROOT $ENV{CUDA_ROOT}) + endif() + if(CUDA_ROOT) + set(CUDAToolkit_ROOT ${CUDA_ROOT}) + endif() +endif() -IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") - MESSAGE(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") + message(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") +endif() -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") find_package(CUDAToolkit REQUIRED) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) - KOKKOS_EXPORT_CMAKE_TPL(CUDAToolkit REQUIRED) -ELSE() + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + kokkos_export_cmake_tpl(CUDAToolkit REQUIRED) +else() include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) - IF (TARGET CUDA::cudart) - SET(FOUND_CUDART TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart) - ELSE() - SET(FOUND_CUDART FALSE) - ENDIF() + if(TARGET CUDA::cudart) + set(FOUND_CUDART TRUE) + kokkos_export_imported_tpl(CUDA::cudart) + else() + set(FOUND_CUDART 
FALSE) + endif() - IF (TARGET CUDA::cuda_driver) - SET(FOUND_CUDA_DRIVER TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) - ELSE() - SET(FOUND_CUDA_DRIVER FALSE) - ENDIF() + if(TARGET CUDA::cuda_driver) + set(FOUND_CUDA_DRIVER TRUE) + kokkos_export_imported_tpl(CUDA::cuda_driver) + else() + set(FOUND_CUDA_DRIVER FALSE) + endif() include(FindPackageHandleStandardArgs) - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) - IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) - ENDIF() -ENDIF() + find_package_handle_standard_args(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) + if(FOUND_CUDA_DRIVER AND FOUND_CUDART) + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + endif() +endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLHPX.cmake b/lib/kokkos/cmake/Modules/FindTPLHPX.cmake index d7b54fb9c9..e3c199b7c5 100644 --- a/lib/kokkos/cmake/Modules/FindTPLHPX.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLHPX.cmake @@ -1,15 +1,10 @@ - -FIND_PACKAGE(HPX REQUIRED 1.8.0) +find_package(HPX REQUIRED 1.8.0) #as of right now, HPX doesn't export correctly #so let's convert it to an interface target -KOKKOS_CREATE_IMPORTED_TPL(HPX INTERFACE - LINK_LIBRARIES ${HPX_LIBRARIES} - INCLUDES ${HPX_INCLUDE_DIRS} -) +kokkos_create_imported_tpl(HPX INTERFACE LINK_LIBRARIES ${HPX_LIBRARIES} INCLUDES ${HPX_INCLUDE_DIRS}) #this is a bit funky since this is a CMake target #but HPX doesn't export itself correctly -KOKKOS_EXPORT_CMAKE_TPL(HPX) +kokkos_export_cmake_tpl(HPX) #I would prefer all of this gets replaced with #KOKKOS_IMPORT_CMAKE_TPL(HPX) - diff --git a/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake b/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake index cf763b7e5b..77ce8c71f7 100644 --- a/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(HWLOC 
HEADER hwloc.h LIBRARY hwloc) +kokkos_find_imported(HWLOC HEADER hwloc.h LIBRARY hwloc) diff --git a/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake b/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake index 8adcdcdbb8..85ae0b8224 100644 --- a/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) +kokkos_find_imported(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) diff --git a/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake index 70e0d6c454..ce428b0aee 100644 --- a/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake @@ -2,17 +2,19 @@ # (which would not be contained in CMake's search paths anyway). # Hence, try if the compiler supports libquadmath natively first before doing # the standard package search. -SET(CMAKE_REQUIRED_LIBRARIES "quadmath") -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +set(CMAKE_REQUIRED_LIBRARIES "quadmath") +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include int main(void){ __float128 foo = ::sqrtq(123.456); return foo; }" - KOKKOS_QUADMATH_COMPILER_SUPPORT) -IF (KOKKOS_QUADMATH_COMPILER_SUPPORT) - KOKKOS_CREATE_IMPORTED_TPL(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) -ELSE() - KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) -ENDIF() + KOKKOS_QUADMATH_COMPILER_SUPPORT +) +if(KOKKOS_QUADMATH_COMPILER_SUPPORT) + kokkos_create_imported_tpl(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) +else() + kokkos_find_imported(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) +endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake b/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake index 603510c315..68de942a69 100644 --- a/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake @@ -1,9 +1,10 @@ 
-INCLUDE(CheckIncludeFileCXX) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) +include(CheckIncludeFileCXX) +check_include_file_cxx(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) +check_include_file_cxx(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include int main() @@ -13,37 +14,40 @@ CHECK_CXX_SOURCE_COMPILES(" #endif return 0; }" - KOKKOS_NO_TBB_CONFLICT) + KOKKOS_NO_TBB_CONFLICT +) -IF (KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE +if(KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() -ELSE() - FIND_PACKAGE(oneDPL REQUIRED) + endif() +else() + find_package(oneDPL REQUIRED) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE LINK_LIBRARIES oneDPL) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE + LINK_LIBRARIES + oneDPL # 
https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() + endif() # Export oneDPL as a Kokkos dependency - KOKKOS_EXPORT_CMAKE_TPL(oneDPL) -ENDIF() + kokkos_export_cmake_tpl(oneDPL) +endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLROCM.cmake b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake index f796737f5b..9673af0b9d 100644 --- a/lib/kokkos/cmake/Modules/FindTPLROCM.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -1,7 +1,7 @@ include(FindPackageHandleStandardArgs) -FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) -FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) # FIXME_HIP Starting with ROCm 5.5 it is not necessary to link againt clang_rt. 
# We keep the code as is for now because it is hard to find the version of ROCM @@ -16,18 +16,24 @@ execute_process( COMMAND ${CMAKE_CXX_COMPILER} -print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE CLANG_RT_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE CLANG_RT_CHECK) + RESULT_VARIABLE CLANG_RT_CHECK +) -if( NOT "${CLANG_RT_CHECK}" STREQUAL "0" ) +if(NOT "${CLANG_RT_CHECK}" STREQUAL "0") # if the above failed, we delete CLANG_RT_LIBRARY to make the args check # below fail unset(CLANG_RT_LIBRARY) endif() - find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY CLANG_RT_LIBRARY) -kokkos_create_imported_tpl(ROCM INTERFACE - LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY} ${CLANG_RT_LIBRARY} - COMPILE_DEFINITIONS __HIP_ROCclr__ +kokkos_create_imported_tpl( + ROCM + INTERFACE + LINK_LIBRARIES + ${HSA_RUNTIME_LIBRARY} + ${AMD_HIP_LIBRARY} + ${CLANG_RT_LIBRARY} + COMPILE_DEFINITIONS + __HIP_ROCclr__ ) diff --git a/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake b/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake index dae7dc3c95..b4b905795d 100644 --- a/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake @@ -6,10 +6,10 @@ # behavior of ROCm 5.7 and later for earlier version of ROCm we set # AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If # the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them. 
-SET(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") -SET(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") -FIND_PACKAGE(rocthrust REQUIRED) -KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) +set(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +set(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +find_package(rocthrust REQUIRED) +kokkos_create_imported_tpl(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) # Export ROCTHRUST as a Kokkos dependency -KOKKOS_EXPORT_CMAKE_TPL(rocthrust) +kokkos_export_cmake_tpl(rocthrust) diff --git a/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake b/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake index ff0db5123f..280b8641da 100644 --- a/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake @@ -1,15 +1,14 @@ -INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE(Threads) +include(FindPackageHandleStandardArgs) +find_package(Threads) -IF (TARGET Threads::Threads) - SET(FOUND_THREADS TRUE) -ELSE() - SET(FOUND_THREADS FALSE) -ENDIF() +if(TARGET Threads::Threads) + set(FOUND_THREADS TRUE) +else() + set(FOUND_THREADS FALSE) +endif() -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLTHREADS DEFAULT_MSG FOUND_THREADS) +find_package_handle_standard_args(TPLTHREADS DEFAULT_MSG FOUND_THREADS) #Only create the TPL if we succeed -IF (FOUND_THREADS) - KOKKOS_CREATE_IMPORTED_TPL(THREADS INTERFACE LINK_OPTIONS - ${CMAKE_THREAD_LIBS_INIT}) -ENDIF() +if(FOUND_THREADS) + kokkos_create_imported_tpl(THREADS INTERFACE LINK_OPTIONS ${CMAKE_THREAD_LIBS_INIT}) +endif() diff --git a/lib/kokkos/cmake/README.md b/lib/kokkos/cmake/README.md index 385bbfcd5d..0548e89a90 100644 --- a/lib/kokkos/cmake/README.md +++ b/lib/kokkos/cmake/README.md @@ -310,20 +310,6 @@ When Kokkos is loaded by a downstream project, this TPL must be loaded. 
Calling this function simply appends text recording the location where the TPL was found and adding a `find_dependency(...)` call that will reload the CMake target. -### The Great TriBITS Compromise - -TriBITS was a masterpiece of CMake version 2 before the modern CMake idioms of building and using. -TriBITS greatly limited verbosity of CMake files, handled complicated dependency trees between packages, and handled automatically setting up include and linker paths for dependent libraries. - -Kokkos is now used by numerous projects that don't (and won't) depend on TriBITS for their build systems. -Kokkos has to work outside of TriBITS and provide a standard CMake 3+ build system. -At the same time, Kokkos is used by numerous projects that depend on TriBITS and don't (and won't) switch to a standard CMake 3+ build system. - -Instead of calling functions `TRIBITS_X(...)`, the CMake calls wrapper functions `KOKKOS_X(...)`. -If TriBITS is available (as in Trilinos), `KOKKOS_X` will just be a thin wrapper around `TRIBITS_X`. -If TriBITS is not available, Kokkos maps `KOKKOS_X` calls to native CMake that complies with CMake 3 idioms. -For the time being, this seems the most sensible way to handle the competing requirements of a standalone modern CMake and TriBITS build system. 
- ##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE) [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) diff --git a/lib/kokkos/cmake/build_env_info.cmake b/lib/kokkos/cmake/build_env_info.cmake index 0eeb637245..ac28b2d850 100644 --- a/lib/kokkos/cmake/build_env_info.cmake +++ b/lib/kokkos/cmake/build_env_info.cmake @@ -2,111 +2,108 @@ find_package(Git QUIET) -SET(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) -SET(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) -SET(post_configure_dir ${CMAKE_BINARY_DIR}/generated) +set(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) +set(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) +set(post_configure_dir ${CMAKE_BINARY_DIR}/generated) -SET(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) -SET(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) +set(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) +set(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) -FUNCTION(check_git_write git_hash git_clean_status) - FILE( - WRITE - ${CMAKE_BINARY_DIR}/git-state.txt - "${git_hash}-${git_clean_status}") -ENDFUNCTION() +function(check_git_write git_hash git_clean_status) + file(WRITE ${CMAKE_BINARY_DIR}/git-state.txt "${git_hash}-${git_clean_status}") +endfunction() -FUNCTION(check_git_read git_hash) - IF(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) - FILE(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) - LIST(GET CONTENT 0 var) +function(check_git_read git_hash) + if(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) + file(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) + list(GET CONTENT 0 var) message(DEBUG "Cached Git hash: ${var}") - SET(${git_hash} ${var} PARENT_SCOPE) + set(${git_hash} ${var} PARENT_SCOPE) else() - SET(${git_hash} "INVALID" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + set(${git_hash} "INVALID" PARENT_SCOPE) + endif() +endfunction() -FUNCTION(check_git_version) - IF(NOT EXISTS 
${post_configure_dir}/Kokkos_Version_Info.hpp) - FILE( - COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp - DESTINATION ${post_configure_dir}) - ENDIF() +function(check_git_version) + if(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) + file(COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp DESTINATION ${post_configure_dir}) + endif() - IF(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) + if(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) return() - ENDIF() + endif() # Get the current working branch execute_process( COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_BRANCH - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit description execute_process( COMMAND ${GIT_EXECUTABLE} show -s --format=%s WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DESCRIPTION - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit date execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%cI WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DATE - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Check if repo is dirty / clean execute_process( COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD -- WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} RESULT_VARIABLE IS_DIRTY - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - IF(IS_DIRTY EQUAL 0) - SET(GIT_CLEAN_STATUS "CLEAN") + if(IS_DIRTY EQUAL 0) + set(GIT_CLEAN_STATUS "CLEAN") else() - SET(GIT_CLEAN_STATUS "DIRTY") - ENDIF() + set(GIT_CLEAN_STATUS "DIRTY") + endif() # Get the latest abbreviated commit hash of the working branch execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%h WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_HASH - OUTPUT_STRIP_TRAILING_WHITESPACE) + 
OUTPUT_STRIP_TRAILING_WHITESPACE + ) check_git_read(GIT_HASH_CACHE) - IF(NOT EXISTS ${post_configure_dir}) + if(NOT EXISTS ${post_configure_dir}) file(MAKE_DIRECTORY ${post_configure_dir}) - ENDIF() + endif() # Only update the git_version.cpp if the hash has changed. This will # prevent us from rebuilding the project more than we need to. - IF(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} - OR NOT EXISTS ${post_configure_file}) + if(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} OR NOT EXISTS ${post_configure_file}) # Set the GIT_HASH_CACHE variable so the next build won't have # to regenerate the source file. check_git_write(${GIT_COMMIT_HASH} ${GIT_CLEAN_STATUS}) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) message(STATUS "Configured git information in ${post_configure_file}") - ENDIF() -ENDFUNCTION() + endif() +endfunction() -FUNCTION(check_git_setup) +function(check_git_setup) add_custom_target( - AlwaysCheckGit COMMAND ${CMAKE_COMMAND} - -DRUN_CHECK_GIT_VERSION=1 - -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} - -P ${CURRENT_LIST_DIR}/build_env_info.cmake - BYPRODUCTS ${post_configure_file}) + AlwaysCheckGit COMMAND ${CMAKE_COMMAND} -DRUN_CHECK_GIT_VERSION=1 -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} -P + ${CURRENT_LIST_DIR}/build_env_info.cmake BYPRODUCTS ${post_configure_file} + ) add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) @@ -114,9 +111,9 @@ FUNCTION(check_git_setup) add_dependencies(impl_git_version AlwaysCheckGit) check_git_version() -ENDFUNCTION() +endfunction() # This is used to run this function from an external cmake process. 
-IF(RUN_CHECK_GIT_VERSION) +if(RUN_CHECK_GIT_VERSION) check_git_version() -ENDIF() +endif() diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp b/lib/kokkos/cmake/compile_tests/amd_apu.cc similarity index 57% rename from lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp rename to lib/kokkos/cmake/compile_tests/amd_apu.cc index 3c599b95a6..a9c1edbd57 100644 --- a/lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp +++ b/lib/kokkos/cmake/compile_tests/amd_apu.cc @@ -14,5 +14,25 @@ // //@HEADER -#include -#include +#include +#include + +int main() { + hipDeviceProp_t hipProp; + hipError_t error = hipGetDeviceProperties(&hipProp, 0); + + if (error != hipSuccess) { + std::cout << hipGetErrorString(error) << '\n'; + return error; + } + + if (hipProp.integrated == 1) { + // We detected an APU + std::cout << "ON"; + } else { + // We detected a discrete GPU + std::cout << "OFF"; + } + + return 0; +} diff --git a/lib/kokkos/cmake/cray.cmake b/lib/kokkos/cmake/cray.cmake index 08912f5130..4ce5352bda 100644 --- a/lib/kokkos/cmake/cray.cmake +++ b/lib/kokkos/cmake/cray.cmake @@ -1,9 +1,6 @@ - - function(kokkos_set_cray_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/lib/kokkos/cmake/deps/CUDA.cmake b/lib/kokkos/cmake/deps/CUDA.cmake index 5b6afd6151..49eaf883a4 100644 --- a/lib/kokkos/cmake/deps/CUDA.cmake +++ b/lib/kokkos/cmake/deps/CUDA.cmake @@ -17,24 +17,24 @@ # Check for CUDA support -SET(_CUDA_FAILURE OFF) +set(_CUDA_FAILURE OFF) 
# Have CMake find CUDA -IF(NOT _CUDA_FAILURE) - FIND_PACKAGE(CUDA 3.2) - IF (NOT CUDA_FOUND) - SET(_CUDA_FAILURE ON) - ENDIF() -ENDIF() +if(NOT _CUDA_FAILURE) + find_package(CUDA 3.2) + if(NOT CUDA_FOUND) + set(_CUDA_FAILURE ON) + endif() +endif() -IF(NOT _CUDA_FAILURE) +if(NOT _CUDA_FAILURE) # if we haven't met failure macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target) - TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY) + tribits_add_library(${cuda_target} ${ARGN} CUDALIBRARY) endmacro() - GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) - GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) -ELSE() - SET(TPL_ENABLE_CUDA OFF) -ENDIF() + global_set(TPL_CUDA_LIBRARY_DIRS) + global_set(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + global_set(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) +else() + set(TPL_ENABLE_CUDA OFF) +endif() diff --git a/lib/kokkos/cmake/deps/HWLOC.cmake b/lib/kokkos/cmake/deps/HWLOC.cmake index 77d5a9b83a..52d8368d04 100644 --- a/lib/kokkos/cmake/deps/HWLOC.cmake +++ b/lib/kokkos/cmake/deps/HWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. 
# @@ -26,7 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/lib/kokkos/cmake/deps/Pthread.cmake b/lib/kokkos/cmake/deps/Pthread.cmake index e879bff374..b811f85084 100644 --- a/lib/kokkos/cmake/deps/Pthread.cmake +++ b/lib/kokkos/cmake/deps/Pthread.cmake @@ -15,31 +15,27 @@ # ************************************************************************ # @HEADER +set(USE_THREADS FALSE) -SET(USE_THREADS FALSE) - -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. 
- IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(Pthread) -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") + kokkos_create_imported_tpl_library(Pthread) +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/lib/kokkos/cmake/deps/quadmath.cmake b/lib/kokkos/cmake/deps/quadmath.cmake index 6aef08e881..9006d0cb9e 100644 --- a/lib/kokkos/cmake/deps/quadmath.cmake +++ b/lib/kokkos/cmake/deps/quadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +kokkos_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/lib/kokkos/cmake/fake_tribits.cmake b/lib/kokkos/cmake/fake_tribits.cmake index a18d2ac518..d3fe1e6e2f 100644 --- a/lib/kokkos/cmake/fake_tribits.cmake +++ b/lib/kokkos/cmake/fake_tribits.cmake @@ -1,288 +1,213 @@ #These are tribits wrappers used by all projects in the Kokkos ecosystem -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) +include(CMakeParseArguments) +include(CTest) -FUNCTION(ASSERT_DEFINED VARS) - FOREACH(VAR 
${VARS}) - IF(NOT DEFINED ${VAR}) - MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!") - ENDIF() - ENDFOREACH() -ENDFUNCTION() - -IF(NOT KOKKOS_HAS_TRILINOS) -MACRO(APPEND_GLOB VAR) - FILE(GLOB LOCAL_TMP_VAR ${ARGN}) - LIST(APPEND ${VAR} ${LOCAL_TMP_VAR}) -ENDMACRO() - -MACRO(GLOBAL_SET VARNAME) - SET(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) -ENDMACRO() - -MACRO(PREPEND_GLOBAL_SET VARNAME) - ASSERT_DEFINED(${VARNAME}) - GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) -ENDMACRO() -ENDIF() - -MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) - FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") - ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) - SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE) -ENDMACRO() - -FUNCTION(KOKKOS_ADD_TEST) - if (KOKKOS_HAS_TRILINOS) - CMAKE_PARSE_ARGUMENTS(TEST - "SKIP_TRIBITS" - "EXE;NAME;TOOL" - "ARGS" - ${ARGN}) - - IF(TEST_SKIP_TRIBITS) - MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits") - RETURN() - ENDIF() - - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - - TRIBITS_ADD_TEST( - ${EXE_ROOT} - NAME ${TEST_NAME} - COMM serial mpi - NUM_MPI_PROCS 1 - ARGS ${TEST_ARGS} - ${TEST_UNPARSED_ARGUMENTS} - ADDED_TESTS_NAMES_OUT ALL_TESTS_ADDED - ) - - # We will get prepended package name here - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - - # The function TRIBITS_ADD_TEST() has a CATEGORIES argument that defaults - # to BASIC. If a project elects to only enable tests marked as PERFORMANCE, - # the test won't actually be added and attempting to set a property on it below - # will yield an error. 
- if(TARGET ${EXE}) - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - foreach(TEST_ADDED ${ALL_TESTS_ADDED}) - set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_TOOLS_LIBS=$") - endforeach() - endif() +function(ASSERT_DEFINED VARS) + foreach(VAR ${VARS}) + if(NOT DEFINED ${VAR}) + message(SEND_ERROR "Error, the variable ${VAR} is not defined!") endif() - else() - CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL;SKIP_TRIBITS" - "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" - "CATEGORIES;ARGS" - ${ARGN}) - # To match Tribits, we should always be receiving - # the root names of exes/libs - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - # Prepend package name to the test name - # These should be the full target name - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS}) - ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS}) - ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_TOOL) - ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) - ENDIF() -ENDFUNCTION() + endforeach() +endfunction() -MACRO(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) - ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME}) - 
TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) - TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) -ENDMACRO() +macro(APPEND_GLOB VAR) + file(GLOB LOCAL_TMP_VAR ${ARGN}) + list(APPEND ${VAR} ${LOCAL_TMP_VAR}) +endmacro() -FUNCTION(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES(${TPL_NAME} ${ARGN}) - else() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" - ${ARGN}) +macro(GLOBAL_SET VARNAME) + set(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) +endmacro() - SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE) - IF (PARSE_REQUIRED_LIBS_NAMES) - FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) - IF(NOT TPL_${TPL_NAME}_LIBRARIES) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (PARSE_REQUIRED_HEADERS) - FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) - IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (_${TPL_NAME}_ENABLE_SUCCESS) - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME}) - ENDIF() - VERIFY_EMPTY(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) - endif() -ENDFUNCTION() +macro(PREPEND_GLOBAL_SET VARNAME) + assert_defined(${VARNAME}) + global_set(${VARNAME} ${ARGN} ${${VARNAME}}) +endmacro() -MACRO(KOKKOS_TARGET_COMPILE_OPTIONS TARGET) -if(KOKKOS_HAS_TRILINOS) - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -else() - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -endif() -ENDMACRO() +macro(ADD_INTERFACE_LIBRARY LIB_NAME) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") + add_library(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) + set_target_properties(${LIB_NAME} PROPERTIES INTERFACE TRUE) +endmacro() -FUNCTION(KOKKOS_LIB_TYPE LIB RET) -GET_TARGET_PROPERTY(PROP ${LIB} TYPE) -IF (${PROP} STREQUAL "INTERFACE_LIBRARY") - SET(${RET} "INTERFACE" 
PARENT_SCOPE) -ELSE() - SET(${RET} "PUBLIC" PARENT_SCOPE) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) -IF(KOKKOS_HAS_TRILINOS) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - #don't trust tribits to do this correctly - but need to add package name - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSEIF(TARGET ${TARGET}) - #the target actually exists - this means we are doing separate libs - #or this a test library - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSE() - GET_PROPERTY(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - IF (${TARGET} IN_LIST LIBS) - SET_PROPERTY(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) - ELSE() - MESSAGE(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") - ENDIF() -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) -IF(KOKKOS_HAS_TRILINOS) - #do nothing -ELSE() - SET(options INTERFACE) - SET(oneValueArgs) - SET(multiValueArgs) - CMAKE_PARSE_ARGUMENTS(PARSE - "INTERFACE" - "" - "" - ${ARGN}) - SET(LINK_TYPE) - IF(PARSE_INTERFACE) - SET(LINK_TYPE INTERFACE) - ELSE() - SET(LINK_TYPE PUBLIC) - ENDIF() - TARGET_LINK_LIBRARIES(${TARGET} ${LINK_TYPE} ${DEPLIB}) - VERIFY_EMPTY(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_TEST_LIBRARY NAME) -IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN} TESTONLY) -ELSE() - SET(oneValueArgs) - SET(multiValueArgs HEADERS SOURCES) - - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES;DEPLIBS" - ${ARGN}) - - SET(LIB_TYPE) - IF (PARSE_STATIC) - SET(LIB_TYPE STATIC) - ELSEIF (PARSE_SHARED) - SET(LIB_TYPE SHARED) - ENDIF() - - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - ADD_LIBRARY(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) - IF (PARSE_DEPLIBS) - TARGET_LINK_LIBRARIES(${NAME} PRIVATE 
${PARSE_DEPLIBS}) - ENDIF() -ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_INCLUDE_DIRECTORIES) -IF(KOKKOS_HAS_TRILINOS) - TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) -ELSE() - CMAKE_PARSE_ARGUMENTS( - INC - "REQUIRED_DURING_INSTALLATION_TESTING" - "" - "" +function(KOKKOS_ADD_TEST) + cmake_parse_arguments( + TEST "WILL_FAIL;SKIP_TRIBITS" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" "CATEGORIES;ARGS" ${ARGN} ) - INCLUDE_DIRECTORIES(${INC_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() - - -MACRO(PRINTALL match) -get_cmake_property(_variableNames VARIABLES) -list (SORT _variableNames) -foreach (_variableName ${_variableNames}) - if("${_variableName}" MATCHES "${match}") - message(STATUS "${_variableName}=${${_variableName}}") + # To match Tribits, we should always be receiving + # the root names of exes/libs + if(TEST_EXE) + set(EXE_ROOT ${TEST_EXE}) + else() + set(EXE_ROOT ${TEST_NAME}) endif() -endforeach() -ENDMACRO() + # Prepend package name to the test name + # These should be the full target name + set(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) -MACRO(SET_GLOBAL_REPLACE SUBSTR VARNAME) - STRING(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDMACRO() + # For compatibility with Trilinos testing, we support: + # * `-D _DISABLE=ON` + # * `-D _EXTRA_ARGS=";;;..."` + # * `-D _SET_RUN_SERIAL=ON` + if(${TEST_NAME}_DISABLE) + return() + endif() -FUNCTION(GLOBAL_APPEND VARNAME) + set(EXE ${PACKAGE_NAME}_${EXE_ROOT}) + if(WIN32) + add_test(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} + ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS} + ) + else() + add_test(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS}) + endif() + # Trilinos testing benefits from labeling the tests as "Kokkos" tests + set_tests_properties(${TEST_NAME} PROPERTIES LABELS Kokkos) + if(${TEST_NAME}_SET_RUN_SERIAL) + set_tests_properties(${TEST_NAME} PROPERTIES RUN_SERIAL ON) + endif() + # TriBITS doesn't 
actually currently support `-D _ENVIRONMENT` + # but we decided to add it anyway + if(${TEST_NAME}_ENVIRONMENT) + set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT "${${TEST_NAME}_ENVIRONMENT}") + endif() + if(TEST_WILL_FAIL) + set_tests_properties(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + endif() + if(TEST_FAIL_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + endif() + if(TEST_PASS_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + endif() + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + set_property( + TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$" + ) + endif() + verify_empty(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) +endfunction() + +macro(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) + add_interface_library(TPL_LIB_${TPL_NAME}) + target_link_libraries(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) + target_include_directories(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) +endmacro() + +function(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) + cmake_parse_arguments(PARSE "" "" "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" ${ARGN}) + + set(_${TPL_NAME}_ENABLE_SUCCESS TRUE) + if(PARSE_REQUIRED_LIBS_NAMES) + find_library(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) + if(NOT TPL_${TPL_NAME}_LIBRARIES) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(PARSE_REQUIRED_HEADERS) + find_path(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) + if(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(_${TPL_NAME}_ENABLE_SUCCESS) + kokkos_create_imported_tpl_library(${TPL_NAME}) + endif() + verify_empty(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() + 
+function(KOKKOS_LIB_TYPE LIB RET) + get_target_property(PROP ${LIB} TYPE) + if(${PROP} STREQUAL "INTERFACE_LIBRARY") + set(${RET} "INTERFACE" PARENT_SCOPE) + else() + set(${RET} "PUBLIC" PARENT_SCOPE) + endif() +endfunction() + +function(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) + if(TARGET ${TARGET}) + #the target actually exists - this means we are doing separate libs + #or this a test library + kokkos_lib_type(${TARGET} INCTYPE) + target_include_directories(${TARGET} ${INCTYPE} ${ARGN}) + else() + get_property(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + if(${TARGET} IN_LIST LIBS) + set_property(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) + else() + message(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") + endif() + endif() +endfunction() + +function(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) + set(options INTERFACE) + set(oneValueArgs) + set(multiValueArgs) + cmake_parse_arguments(PARSE "INTERFACE" "" "" ${ARGN}) + set(LINK_TYPE) + if(PARSE_INTERFACE) + set(LINK_TYPE INTERFACE) + else() + set(LINK_TYPE PUBLIC) + endif() + target_link_libraries(${TARGET} ${LINK_TYPE} ${DEPLIB}) + verify_empty(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() + +function(KOKKOS_ADD_TEST_LIBRARY NAME) + set(oneValueArgs) + set(multiValueArgs HEADERS SOURCES) + + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES;DEPLIBS" ${ARGN}) + + set(LIB_TYPE) + if(PARSE_STATIC) + set(LIB_TYPE STATIC) + elseif(PARSE_SHARED) + set(LIB_TYPE SHARED) + endif() + + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) + endif() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + add_library(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) + if(PARSE_DEPLIBS) + target_link_libraries(${NAME} PRIVATE ${PARSE_DEPLIBS}) + endif() +endfunction() + +function(KOKKOS_INCLUDE_DIRECTORIES) + cmake_parse_arguments(INC "REQUIRED_DURING_INSTALLATION_TESTING" "" "" ${ARGN}) + 
include_directories(${INC_UNPARSED_ARGUMENTS}) +endfunction() + +macro(PRINTALL match) + get_cmake_property(_variableNames VARIABLES) + list(SORT _variableNames) + foreach(_variableName ${_variableNames}) + if("${_variableName}" MATCHES "${match}") + message(STATUS "${_variableName}=${${_variableName}}") + endif() + endforeach() +endmacro() + +macro(SET_GLOBAL_REPLACE SUBSTR VARNAME) + string(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) + global_set(${VARNAME} ${TEMP}) +endmacro() + +function(GLOBAL_APPEND VARNAME) #We make this a function since we are setting variables #and want to use scope to avoid overwriting local variables - SET(TEMP ${${VARNAME}}) - LIST(APPEND TEMP ${ARGN}) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDFUNCTION() + set(TEMP ${${VARNAME}}) + list(APPEND TEMP ${ARGN}) + global_set(${VARNAME} ${TEMP}) +endfunction() diff --git a/lib/kokkos/cmake/gnu.cmake b/lib/kokkos/cmake/gnu.cmake index aa11fe87b1..e53b4a7bec 100644 --- a/lib/kokkos/cmake/gnu.cmake +++ b/lib/kokkos/cmake/gnu.cmake @@ -1,23 +1,21 @@ - -FUNCTION(kokkos_set_gnu_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_gnu_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified. 
- IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) + endif() +endfunction() diff --git a/lib/kokkos/cmake/intel.cmake b/lib/kokkos/cmake/intel.cmake index 7e6ee3358c..b7752caabd 100644 --- a/lib/kokkos/cmake/intel.cmake +++ b/lib/kokkos/cmake/intel.cmake @@ -1,18 +1,15 @@ - -FUNCTION(kokkos_set_intel_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_intel_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.18.1 and then modified. 
- IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) -ENDFUNCTION() - - + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + set(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) +endfunction() diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index 0b3d4044d0..ae45da806f 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -1,611 +1,732 @@ - -FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) +function(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) #all optimizations off by default - KOKKOS_DEPENDENT_OPTION(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) - SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - IF(KOKKOS_ARCH_${SUFFIX}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) - SET(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - + kokkos_dependent_option(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) + set(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + if(KOKKOS_ARCH_${SUFFIX}) + list(APPEND 
KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) + set(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) + endif() +endfunction() # Make sure devices and compiler ID are done -KOKKOS_CFG_DEPENDS(ARCH COMPILER_ID) -KOKKOS_CFG_DEPENDS(ARCH DEVICES) -KOKKOS_CFG_DEPENDS(ARCH OPTIONS) +kokkos_cfg_depends(ARCH COMPILER_ID) +kokkos_cfg_depends(ARCH DEVICES) +kokkos_cfg_depends(ARCH OPTIONS) -KOKKOS_CHECK_DEPRECATED_OPTIONS( - ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" - ARCH_RYZEN "Please replace RYZEN with ZEN or ZEN2, depending on your platform" +kokkos_check_deprecated_options( + ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" ARCH_RYZEN + "Please replace RYZEN with ZEN or ZEN2, depending on your platform" ) #------------------------------------------------------------------------------- # List of possible host architectures. #------------------------------------------------------------------------------- -SET(KOKKOS_ARCH_LIST) +set(KOKKOS_ARCH_LIST) include(CheckCXXCompilerFlag) -KOKKOS_DEPRECATED_LIST(ARCH ARCH) +kokkos_deprecated_list(ARCH ARCH) -SET(HOST_ARCH_ALREADY_SPECIFIED "") -MACRO(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) - KOKKOS_ARCH_OPTION(${ARCH} HOST "${LABEL}" TRUE) - IF(KOKKOS_ARCH_${ARCH}) - IF(HOST_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) - ENDIF() -ENDMACRO() +set(HOST_ARCH_ALREADY_SPECIFIED "") +macro(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) + kokkos_arch_option(${ARCH} HOST "${LABEL}" TRUE) + if(KOKKOS_ARCH_${ARCH}) + if(HOST_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. 
If you are re-running CMake, try clearing the cache and running again." + ) + endif() + set(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) + endif() +endmacro() -DECLARE_AND_CHECK_HOST_ARCH(NATIVE "local machine") -DECLARE_AND_CHECK_HOST_ARCH(AMDAVX "AMD chip") -DECLARE_AND_CHECK_HOST_ARCH(ARMV80 "ARMv8.0 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") -DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") -DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") -DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") -DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ICL "Intel Ice Lake Client CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(SKL "Intel Skylake Client CPUs") -DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") -DECLARE_AND_CHECK_HOST_ARCH(RISCV_SG2042 "SG2042 (RISC-V) CPUs") +declare_and_check_host_arch(NATIVE "local machine") +declare_and_check_host_arch(AMDAVX "AMD chip") +declare_and_check_host_arch(ARMV80 "ARMv8.0 Compatible CPU") +declare_and_check_host_arch(ARMV81 "ARMv8.1 Compatible CPU") +declare_and_check_host_arch(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") 
+declare_and_check_host_arch(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") +declare_and_check_host_arch(A64FX "ARMv8.2 with SVE Support") +declare_and_check_host_arch(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") +declare_and_check_host_arch(SNB "Intel Sandy/Ivy Bridge CPUs") +declare_and_check_host_arch(HSW "Intel Haswell CPUs") +declare_and_check_host_arch(BDW "Intel Broadwell Xeon E-class CPUs") +declare_and_check_host_arch(ICL "Intel Ice Lake Client CPUs (AVX512)") +declare_and_check_host_arch(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(SKL "Intel Skylake Client CPUs") +declare_and_check_host_arch(SKX "Intel Skylake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(KNC "Intel Knights Corner Xeon Phi") +declare_and_check_host_arch(KNL "Intel Knights Landing Xeon Phi") +declare_and_check_host_arch(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(POWER8 "IBM POWER8 CPUs") +declare_and_check_host_arch(POWER9 "IBM POWER9 CPUs") +declare_and_check_host_arch(ZEN "AMD Zen architecture") +declare_and_check_host_arch(ZEN2 "AMD Zen2 architecture") +declare_and_check_host_arch(ZEN3 "AMD Zen3 architecture") +declare_and_check_host_arch(RISCV_SG2042 "SG2042 (RISC-V) CPUs") +declare_and_check_host_arch(RISCV_RVA22V "RVA22V (RISC-V) CPUs") -IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_CUDA_ARCHS ON) -ENDIF() +if(Kokkos_ENABLE_CUDA + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_CUDA_ARCHS ON) +endif() -KOKKOS_ARCH_OPTION(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER32 GPU "NVIDIA Kepler generation CC 3.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") 
-KOKKOS_ARCH_OPTION(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER32 GPU "NVIDIA Kepler generation CC 3.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA70 GPU "NVIDIA 
Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") -IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_HIP_ARCHS ON) -ENDIF() +if(Kokkos_ENABLE_HIP + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_HIP_ARCHS ON) +endif() # AMD archs ordered in decreasing priority of autodetection -LIST(APPEND SUPPORTED_AMD_GPUS MI300 MI300) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX940) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx940) -LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) -LIST(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) -LIST(APPEND SUPPORTED_AMD_GPUS PHOENIX RX7900XTX V620/W6800 V620/W6800) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX1103 AMD_GFX1100 NAVI1030 AMD_GFX1030) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx1103 gfx1100 gfx1030 gfx1030) +list(APPEND SUPPORTED_AMD_GPUS MI300 MI300A MI300) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX942_APU AMD_GFX940) +list(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx942 gfx940) +list(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) +list(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) +list(APPEND 
CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) +list(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) +list(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) +list(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) +list(APPEND SUPPORTED_AMD_GPUS PHOENIX RX7900XTX V620/W6800 V620/W6800) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX1103 AMD_GFX1100 NAVI1030 AMD_GFX1030) +list(APPEND CORRESPONDING_AMD_FLAGS gfx1103 gfx1100 gfx1030 gfx1030) #FIXME CAN BE REPLACED WITH LIST_ZIP IN CMAKE 3.17 -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - KOKKOS_ARCH_OPTION(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + kokkos_arch_option(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") +endforeach() -IF(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) - SET(KOKKOS_SHOW_SYCL_ARCHS ON) -ENDIF() +if(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) + set(KOKKOS_SHOW_SYCL_ARCHS ON) +endif() -KOKKOS_ARCH_OPTION(INTEL_GEN GPU "SPIR64-based devices, e.g. Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN GPU "SPIR64-based devices, e.g. 
Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") -IF(KOKKOS_ENABLE_COMPILER_WARNINGS) - SET(COMMON_WARNINGS - "-Wall" "-Wextra" "-Wunused-parameter" "-Wshadow" "-pedantic" - "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") +if(KOKKOS_ENABLE_COMPILER_WARNINGS) + set(COMMON_WARNINGS + "-Wall" + "-Wextra" + "-Wunused-parameter" + "-Wshadow" + "-pedantic" + "-Wsign-compare" + "-Wtype-limits" + "-Wuninitialized" + "-Wsuggest-override" + ) # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH - IF(Kokkos_ENABLE_LIBQUADMATH) + if(Kokkos_ENABLE_LIBQUADMATH) # warning: non-standard suffix on floating constant [-Wpedantic] - LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic") - ENDIF() + list(REMOVE_ITEM COMMON_WARNINGS "-pedantic") + endif() # NVHPC compiler does not support -Wtype-limits. 
- IF(KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") - ENDIF() - ENDIF() + if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + list(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") + endif() + endif() - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - LIST(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") - ENDIF() + # ICPC doesn't support -Wsuggest-override + if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + list(REMOVE_ITEM COMMON_WARNINGS "-Wsuggest-override") + endif() - SET(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" - ${COMMON_WARNINGS}) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - LIST(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") - ENDIF() + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + list(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") + endif() + + set(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" ${COMMON_WARNINGS}) + if(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + list(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") + endif() # Not using COMPILER_SPECIFIC_FLAGS function so the warning flags are not passed downstream - IF(CMAKE_CXX_COMPILER_ID STREQUAL GNU) - STRING(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") - ELSEIF(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) + if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) + string(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) # FIXME_NVHPC - ELSE() - STRING(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") - ENDIF() - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") -ENDIF() - + else() + string(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") +endif() #------------------------------- KOKKOS_CUDA_OPTIONS --------------------------- #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) # Construct the Makefile options 
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-extended-lambda") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") -ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-extended-lambda") + global_append(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") +endif() -IF (KOKKOS_ENABLE_CUDA_CONSTEXPR) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") - ENDIF() -ENDIF() +if(KOKKOS_ENABLE_CUDA_CONSTEXPR) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") + endif() +endif() -IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - SET(CUDA_ARCH_FLAG "--cuda-gpu-arch") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + set(CUDA_ARCH_FLAG "--cuda-gpu-arch") + global_append(KOKKOS_CUDA_OPTIONS -x cuda) # Kokkos_CUDA_DIR has priority over CUDAToolkit_BIN_DIR - IF (Kokkos_CUDA_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) - ELSEIF(CUDAToolkit_BIN_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - SET(CUDA_ARCH_FLAG "-arch") -ENDIF() - -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - STRING(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) - IF (KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -lineinfo) - ENDIF() - UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -ENDIF() + if(Kokkos_CUDA_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) + elseif(CUDAToolkit_BIN_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) 
+ endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + set(CUDA_ARCH_FLAG "-arch") +endif() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + string(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) + if(KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + global_append(KOKKOS_CUDA_OPTIONS -lineinfo) + endif() + unset(_UPPERCASE_CMAKE_BUILD_TYPE) +endif() #------------------------------- KOKKOS_HIP_OPTIONS --------------------------- -KOKKOS_OPTION(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") -KOKKOS_OPTION(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") -MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_FLAGS) -MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_LINK) +kokkos_option(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") +kokkos_option(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") +mark_as_advanced(Kokkos_IMPL_AMDGPU_FLAGS) +mark_as_advanced(Kokkos_IMPL_AMDGPU_LINK) #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -IF(KOKKOS_ENABLE_HIP) - SET(AMDGPU_ARCH_FLAG "--offload-arch") - IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF (NOT CMAKE_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") - ENDIF() - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -xhip) - IF(DEFINED ENV{ROCM_PATH}) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) - ENDIF() - ENDIF() -ENDIF() +global_set(KOKKOS_AMDGPU_OPTIONS) +if(KOKKOS_ENABLE_HIP) + set(AMDGPU_ARCH_FLAG "--offload-arch") + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(NOT CMAKE_CXX_STANDARD) + message(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") + endif() + global_append(KOKKOS_AMDGPU_OPTIONS -xhip) + if(DEFINED ENV{ROCM_PATH}) + global_append(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) + endif() + endif() +endif() +if(KOKKOS_ARCH_NATIVE) + if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") + message(FATAL_ERROR "MSVC doesn't support 
ARCH_NATIVE!") + endif() -IF(KOKKOS_ARCH_NATIVE) - IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") - MESSAGE(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") - ENDIF() + string(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) + if(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") + set(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") + else() + set(KOKKOS_NATIVE_FLAGS "-mcpu=native") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID NVHPC -tp=native DEFAULT ${KOKKOS_NATIVE_FLAGS}) +endif() - STRING(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) - IF(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") - SET(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") - ELSE() - SET(KOKKOS_NATIVE_FLAGS "-mcpu=native") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - NVHPC -tp=native - DEFAULT ${KOKKOS_NATIVE_FLAGS} +if(KOKKOS_ARCH_ARMV80) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV80) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a +if(KOKKOS_ARCH_ARMV81) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.1-a ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV81) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.1-a +if(KOKKOS_ARCH_ARMV8_THUNDERX) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable + 
compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a + -mtune=thunderx ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV8_THUNDERX) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a -mtune=thunderx +if(KOKKOS_ARCH_ARMV8_THUNDERX2) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=thunderx2t99 + -mtune=thunderx2t99 ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV8_THUNDERX2) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99 +if(KOKKOS_ARCH_A64FX) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Clang + -march=armv8.2-a+sve + -msve-vector-bits=512 + GNU + -march=armv8.2-a+sve + -msve-vector-bits=512 + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.2-a+sve ) -ENDIF() +endif() -IF (KOKKOS_ARCH_A64FX) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Clang -march=armv8.2-a+sve -msve-vector-bits=512 - GNU -march=armv8.2-a+sve -msve-vector-bits=512 - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.2-a+sve - ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV9_GRACE) - SET(KOKKOS_ARCH_ARM_NEON ON) +if(KOKKOS_ARCH_ARMV9_GRACE) + set(KOKKOS_ARCH_ARM_NEON ON) check_cxx_compiler_flag("-mcpu=neoverse-n2" 
COMPILER_SUPPORTS_NEOVERSE_N2) check_cxx_compiler_flag("-msve-vector-bits=128" COMPILER_SUPPORTS_SVE_VECTOR_BITS) - IF (COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128 - ) - ELSE() - MESSAGE(WARNING "Compiler does not support ARMv9 Grace architecture") - ENDIF() -ENDIF() + if(COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128) + else() + message(WARNING "Compiler does not support ARMv9 Grace architecture") + endif() +endif() -IF (KOKKOS_ARCH_ZEN) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen - DEFAULT -march=znver1 -mtune=znver1 +if(KOKKOS_ARCH_ZEN) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen + DEFAULT + -march=znver1 + -mtune=znver1 ) - SET(KOKKOS_ARCH_AMD_ZEN ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() + set(KOKKOS_ARCH_AMD_ZEN ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() -IF (KOKKOS_ARCH_ZEN2) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver2 -mtune=znver2 +if(KOKKOS_ARCH_ZEN2) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver2 + -mtune=znver2 ) - SET(KOKKOS_ARCH_AMD_ZEN2 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() + set(KOKKOS_ARCH_AMD_ZEN2 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() -IF (KOKKOS_ARCH_ZEN3) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver3 -mtune=znver3 +if(KOKKOS_ARCH_ZEN3) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 
+ MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver3 + -mtune=znver3 ) - SET(KOKKOS_ARCH_AMD_ZEN3 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() + set(KOKKOS_ARCH_AMD_ZEN3 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() -IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) - SET(KOKKOS_ARCH_AVX ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -mavx - MSVC /arch:AVX - NVHPC -tp=sandybridge - DEFAULT -mavx +if(KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) + set(KOKKOS_ARCH_AVX ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -mavx + MSVC + /arch:AVX + NVHPC + -tp=sandybridge + DEFAULT + -mavx ) -ENDIF() +endif() -IF (KOKKOS_ARCH_HSW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 +if(KOKKOS_ARCH_HSW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 ) -ENDIF() +endif() -IF (KOKKOS_ARCH_RISCV_SG2042) - IF(NOT - (KOKKOS_CXX_COMPILER_ID STREQUAL GNU - AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) - OR - (KOKKOS_CXX_COMPILER_ID STREQUAL Clang - AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) +if(KOKKOS_ARCH_RISCV_SG2042) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) - MESSAGE(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -march=rv64imafdcv - ) -ENDIF() + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + 
endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -march=rv64imafdcv) +endif() - -IF (KOKKOS_ARCH_BDW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm +if(KOKKOS_ARCH_RISCV_RVA22V) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) -ENDIF() + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT + -march=rv64imafdcv_sscofpmf_sstc_svpbmt_zicbom_zicboz_zicbop_zihintpause + ) +endif() -IF (KOKKOS_ARCH_KNL) +if(KOKKOS_ARCH_BDW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 + -mrtm + ) +endif() + +if(KOKKOS_ARCH_KNL) #avx512-mic - SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xMIC-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=knl - DEFAULT -march=knl -mtune=knl + set(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xMIC-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=knl + DEFAULT + -march=knl + -mtune=knl ) -ENDIF() +endif() -IF (KOKKOS_ARCH_KNC) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - DEFAULT -mmic - ) -ENDIF() +if(KOKKOS_ARCH_KNC) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC NO-VALUE-SPECIFIED DEFAULT -mmic) +endif() -IF 
(KOKKOS_ARCH_SKL) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSKYLAKE - MSVC /arch:AVX2 - NVHPC -tp=skylake - DEFAULT -march=skylake -mtune=skylake +if(KOKKOS_ARCH_SKL) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xSKYLAKE + MSVC + /arch:AVX2 + NVHPC + -tp=skylake + DEFAULT + -march=skylake + -mtune=skylake ) -ENDIF() +endif() -IF (KOKKOS_ARCH_SKX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=skylake - DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 +if(KOKKOS_ARCH_SKX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=skylake + DEFAULT + -march=skylake-avx512 + -mtune=skylake-avx512 ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ICL) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-client -mtune=icelake-client +if(KOKKOS_ARCH_ICL) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-client + -mtune=icelake-client ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ICX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-server -mtune=icelake-server +if(KOKKOS_ARCH_ICX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-server + -mtune=icelake-server ) -ENDIF() +endif() -IF (KOKKOS_ARCH_SPR) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC 
/arch:AVX512 - DEFAULT -march=sapphirerapids -mtune=sapphirerapids +if(KOKKOS_ARCH_SPR) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=sapphirerapids + -mtune=sapphirerapids ) -ENDIF() +endif() -IF (KOKKOS_ARCH_POWER7) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=power7 -mtune=power7 +if(KOKKOS_ARCH_POWER7) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=power7 + -mtune=power7 ) -ENDIF() +endif() -IF (KOKKOS_ARCH_POWER8) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr8 - DEFAULT -mcpu=power8 -mtune=power8 +if(KOKKOS_ARCH_POWER8) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr8 + DEFAULT + -mcpu=power8 + -mtune=power8 ) -ENDIF() +endif() -IF (KOKKOS_ARCH_POWER9) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr9 - DEFAULT -mcpu=power9 -mtune=power9 +if(KOKKOS_ARCH_POWER9) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr9 + DEFAULT + -mcpu=power9 + -mtune=power9 ) -ENDIF() +endif() # If Kokkos_ARCH_NATIVE is enabled, we are trying to autodetect # the SIMD capabilities based on compiler macros. 
-IF (KOKKOS_ARCH_NATIVE) +if(KOKKOS_ARCH_NATIVE) # Make sure to rerun the checks if compile options have changed - IF(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") - SET(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") + if(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") + set(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") - SET(CMAKE_REQUIRED_QUIET ON) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) + set(CMAKE_REQUIRED_QUIET ON) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) - UNSET(KOKKOS_COMPILER_HAS_AVX512 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) - UNSET(KOKKOS_COMPILER_HAS_AVX2 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) - UNSET(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) - CHECK_CXX_SYMBOL_EXISTS(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) - UNSET(KOKKOS_COMPILER_HAS_AVX CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + unset(KOKKOS_COMPILER_HAS_AVX512 CACHE) + check_cxx_symbol_exists(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) + unset(KOKKOS_COMPILER_HAS_AVX2 CACHE) + check_cxx_symbol_exists(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) + unset(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) + check_cxx_symbol_exists(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) + unset(KOKKOS_COMPILER_HAS_AVX CACHE) + check_cxx_symbol_exists(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - UNSET(CMAKE_REQUIRED_QUIET) - UNSET(CMAKE_REQUIRED_FLAGS) - ENDIF() + unset(CMAKE_REQUIRED_QUIET) + unset(CMAKE_REQUIRED_FLAGS) + endif() # Only define one of these macros for now # to be uniform with what we are doing for other architectures. 
- IF(KOKKOS_COMPILER_HAS_AVX512) - MESSAGE(STATUS "SIMD: AVX512 detected") - SET(KOKKOS_ARCH_AVX512XEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX2) - MESSAGE(STATUS "SIMD: AVX2 detected") - SET(KOKKOS_ARCH_AVX2 ON) - ELSEIF(KOKKOS_COMPILER_HAS_ARM_NEON) - MESSAGE(STATUS "SIMD: ARM_NEON detected") - SET(KOKKOS_ARCH_ARM_NEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX) - MESSAGE(STATUS "SIMD: AVX detected") - SET(KOKKOS_ARCH_AVX ON) - ENDIF() -ENDIF() + if(KOKKOS_COMPILER_HAS_AVX512) + message(STATUS "SIMD: AVX512 detected") + set(KOKKOS_ARCH_AVX512XEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX2) + message(STATUS "SIMD: AVX2 detected") + set(KOKKOS_ARCH_AVX2 ON) + elseif(KOKKOS_COMPILER_HAS_ARM_NEON) + message(STATUS "SIMD: ARM_NEON detected") + set(KOKKOS_ARCH_ARM_NEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX) + message(STATUS "SIMD: AVX detected") + set(KOKKOS_ARCH_AVX ON) + endif() +endif() # FIXME_NVHPC nvc++ doesn't seem to support AVX512. -IF (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_ARCH_AVX512XEON OFF) -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) + set(KOKKOS_ARCH_AVX512XEON OFF) +endif() # FIXME_NVCC nvcc doesn't seem to support Arm Neon. 
-IF(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - UNSET(KOKKOS_ARCH_ARM_NEON) -ENDIF() +if(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + unset(KOKKOS_ARCH_ARM_NEON) +endif() -IF (NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - Clang -fcuda-rdc - NVIDIA --relocatable-device-code=true - ) - ENDIF() -ENDIF() +if(NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(Clang -fcuda-rdc NVIDIA --relocatable-device-code=true) + endif() +endif() # Clang needs mcx16 option enabled for Windows atomic functions -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) - COMPILER_SPECIFIC_OPTIONS( - Clang -mcx16 - ) -ENDIF() +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) + compiler_specific_options(Clang -mcx16) +endif() # MSVC ABI has many deprecation warnings, so ignore them -IF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - COMPILER_SPECIFIC_DEFS( - Clang _CRT_SECURE_NO_WARNINGS - ) -ENDIF() - +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + compiler_specific_defs(Clang _CRT_SECURE_NO_WARNINGS) +endif() #Right now we cannot get the compiler ID when cross-compiling, so just check #that HIP is enabled -IF (KOKKOS_ENABLE_HIP) - IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fgpu-rdc - ) - IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT --hip-link - ) - ENDIF() - ELSE() - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fno-gpu-rdc - ) - ENDIF() -ENDIF() +if(KOKKOS_ENABLE_HIP) + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(DEFAULT -fgpu-rdc) + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + compiler_specific_link_options(DEFAULT --hip-link) + endif() + else() 
+ compiler_specific_flags(DEFAULT -fno-gpu-rdc) + endif() +endif() -IF (KOKKOS_ENABLE_SYCL) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization - ) - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-unnamed-lambda - ) -ENDIF() +if(KOKKOS_ENABLE_SYCL) + compiler_specific_flags(DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization) + compiler_specific_options(DEFAULT -fsycl-unnamed-lambda) + if(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2024.1.0) + # Before oneAPI 2024.1.0 passing -fno-sycl didn't work properly + if(NOT KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + message(FATAL_ERROR "Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=OFF requires oneAPI 2024.1.0 or later") + endif() + elseif(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT -fsycl-rdc) + else() + compiler_specific_options(DEFAULT -fno-sycl-rdc) + endif() +endif() # Check support for device_global variables # FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device @@ -613,17 +734,18 @@ ENDIF() # implementation. Otherwise, the feature is not supported when building shared # libraries. Thus, we don't even check for support if shared libraries are # requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. 
-IF(KOKKOS_ENABLE_SYCL) - STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) - CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) +if(KOKKOS_ENABLE_SYCL) + string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + if(KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) # Use the non-separable compilation implementation to support shared libraries as well. - COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) - ELSEIF(NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) - CHECK_CXX_SOURCE_COMPILES(" + compiler_specific_flags(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + elseif(NOT BUILD_SHARED_LIBS AND KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " #include using namespace sycl::ext::oneapi::experimental; using namespace sycl; @@ -638,548 +760,617 @@ IF(KOKKOS_ENABLE_SYCL) int main(){ return 0; } " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED + ) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) # Only the separable compilation implementation is supported. 
- COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + compiler_specific_flags(DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + endif() + endif() + + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_GRAPH "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_GRAPH) +endif() + +set(CUDA_ARCH_ALREADY_SPECIFIED "") +function(CHECK_CUDA_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(CUDA_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." ) - ENDIF() - ENDIF() -ENDIF() - -SET(CUDA_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC) - MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. 
Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_CUDA) - STRING(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) - IF(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - SET(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) - ELSE() - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() - ENDIF() - LIST(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) - SET(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) - LIST(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) - SET(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) -ENDFUNCTION() - + endif() + set(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_CUDA + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_SYCL + AND NOT KOKKOS_ENABLE_OPENACC + ) + message( + WARNING + "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." 
+ ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_CUDA) + string(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) + endif() + set(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) + if(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + set(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) + else() + global_append(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() + endif() + list(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) + set(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) + list(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) + set(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) +endfunction() #These will define KOKKOS_CUDA_ARCH_FLAG #to the corresponding flag name if ON -CHECK_CUDA_ARCH(KEPLER30 sm_30) -CHECK_CUDA_ARCH(KEPLER32 sm_32) -CHECK_CUDA_ARCH(KEPLER35 sm_35) -CHECK_CUDA_ARCH(KEPLER37 sm_37) -CHECK_CUDA_ARCH(MAXWELL50 sm_50) -CHECK_CUDA_ARCH(MAXWELL52 sm_52) -CHECK_CUDA_ARCH(MAXWELL53 sm_53) -CHECK_CUDA_ARCH(PASCAL60 sm_60) -CHECK_CUDA_ARCH(PASCAL61 sm_61) -CHECK_CUDA_ARCH(VOLTA70 sm_70) -CHECK_CUDA_ARCH(VOLTA72 sm_72) -CHECK_CUDA_ARCH(TURING75 sm_75) -CHECK_CUDA_ARCH(AMPERE80 sm_80) -CHECK_CUDA_ARCH(AMPERE86 sm_86) -CHECK_CUDA_ARCH(ADA89 sm_89) -CHECK_CUDA_ARCH(HOPPER90 sm_90) +check_cuda_arch(KEPLER30 sm_30) +check_cuda_arch(KEPLER32 sm_32) +check_cuda_arch(KEPLER35 sm_35) +check_cuda_arch(KEPLER37 sm_37) +check_cuda_arch(MAXWELL50 sm_50) +check_cuda_arch(MAXWELL52 sm_52) +check_cuda_arch(MAXWELL53 sm_53) +check_cuda_arch(PASCAL60 sm_60) +check_cuda_arch(PASCAL61 sm_61) +check_cuda_arch(VOLTA70 sm_70) +check_cuda_arch(VOLTA72 sm_72) +check_cuda_arch(TURING75 sm_75) +check_cuda_arch(AMPERE80 sm_80) +check_cuda_arch(AMPERE86 sm_86) 
+check_cuda_arch(ADA89 sm_89) +check_cuda_arch(HOPPER90 sm_90) -SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_OPENACC AND NOT KOKKOS_ENABLE_SYCL) - MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_HIP) - SET(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) - ENDIF() - IF(NOT KOKKOS_IMPL_AMDGPU_FLAGS) - SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() +set(AMDGPU_ARCH_ALREADY_SPECIFIED "") +function(CHECK_AMDGPU_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(AMDGPU_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." + ) + endif() + set(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_HIP + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_OPENACC + AND NOT KOKKOS_ENABLE_SYCL + ) + message( + WARNING + "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." 
+ ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_HIP) + set(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) + endif() + if(NOT KOKKOS_IMPL_AMDGPU_FLAGS) + set(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) + global_append(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + global_append(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() +endfunction() #These will define KOKKOS_AMDGPU_ARCH_FLAG #to the corresponding flag name if ON -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + check_amdgpu_arch(${ARCH} ${FLAG}) +endforeach() -IF(KOKKOS_IMPL_AMDGPU_FLAGS) - IF (NOT AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. " - "Please explicitly set the GPU architecture.") - ENDIF() - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") -ENDIF() +if(KOKKOS_IMPL_AMDGPU_FLAGS) + if(NOT AMDGPU_ARCH_ALREADY_SPECIFIED) + message(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. " + "Please explicitly set the GPU architecture." 
+ ) + endif() + global_append(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") + global_append(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") +endif() -MACRO(SET_AND_CHECK_AMD_ARCH ARCH FLAG) - KOKKOS_SET_OPTION(ARCH_${ARCH} ON) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) -ENDMACRO() +macro(SET_AND_CHECK_AMD_ARCH ARCH FLAG) + kokkos_set_option(ARCH_${ARCH} ON) + check_amdgpu_arch(${ARCH} ${FLAG}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) +endmacro() -MACRO(CHECK_MULTIPLE_INTEL_ARCH) - IF(KOKKOS_ARCH_INTEL_GPU) - MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") - ENDIF() - SET(KOKKOS_ARCH_INTEL_GPU ON) -ENDMACRO() +macro(CHECK_MULTIPLE_INTEL_ARCH) + if(KOKKOS_ARCH_INTEL_GPU) + message(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") + endif() + set(KOKKOS_ARCH_INTEL_GPU ON) +endmacro() -IF(KOKKOS_ARCH_INTEL_GEN) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_DG1) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN9) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN11) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN12LP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_XEHP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_PVC) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() +if(KOKKOS_ARCH_INTEL_GEN) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_DG1) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN9) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN11) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN12LP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_XEHP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_PVC) + check_multiple_intel_arch() +endif() -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - IF (CLANG_CUDA_ARCH) - IF(KOKKOS_CLANG_IS_CRAY) - 
COMPILER_SPECIFIC_FLAGS( - Cray -fopenmp +if(KOKKOS_ENABLE_OPENMP) + compiler_specific_link_options(CrayClang -fopenmp) +endif() + +if(KOKKOS_ENABLE_OPENMPTARGET) + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + if(CLANG_CUDA_ARCH) + if(KOKKOS_CLANG_IS_CRAY) + compiler_specific_flags(Cray -fopenmp) + else() + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) + compiler_specific_flags( + Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 NVHPC -gpu=${NVHPC_CUDA_ARCH} ) - ELSE() - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 - NVHPC -gpu=${NVHPC_CUDA_ARCH} - ) - ENDIF() - ENDIF() - SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) - IF (CLANG_AMDGPU_ARCH) - COMPILER_SPECIFIC_FLAGS( + endif() + endif() + set(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) + if(CLANG_AMDGPU_ARCH) + compiler_specific_flags( Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa ) - ENDIF() - IF (KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ - ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__ - ) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM 
-fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" - ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" - ) - ENDIF() - ENDIF() -ENDIF() + endif() + if(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__) + else() + compiler_specific_options(IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__) + if(KOKKOS_ARCH_INTEL_GEN9) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7") + endif() + endif() +endif() -IF (KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CUDA_ARCH_FLAG) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc -gpu=${NVHPC_CUDA_ARCH} - Clang -Xopenmp-target=nvptx64-nvidia-cuda -march=${CLANG_CUDA_ARCH} - -fopenmp-targets=nvptx64-nvidia-cuda - ) - ELSEIF(KOKKOS_AMDGPU_ARCH_FLAG) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} - -fopenmp-targets=amdgcn-amd-amdhsa - ) - ELSE() - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - 
IF(CUDA_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} +if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CUDA_ARCH_FLAG) + if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." ) - ELSE() - MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(AMDGPU_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( + endif() + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + compiler_specific_flags( + NVHPC + -acc + -gpu=${NVHPC_CUDA_ARCH} + Clang + -Xopenmp-target=nvptx64-nvidia-cuda + -march=${CLANG_CUDA_ARCH} + -fopenmp-targets=nvptx64-nvidia-cuda + ) + if(DEFINED ENV{CUDA_PATH}) + compiler_specific_link_options(Clang -L$ENV{CUDA_PATH}/lib64) + endif() + compiler_specific_libs(Clang -lcudart NVHPC -cuda) + elseif(KOKKOS_AMDGPU_ARCH_FLAG) + if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." + ) + endif() + compiler_specific_flags( + Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} -fopenmp-targets=amdgcn-amd-amdhsa + ) + if(DEFINED ENV{ROCM_PATH}) + compiler_specific_flags(Clang -I$ENV{ROCM_PATH}/include) + compiler_specific_link_options(Clang -L$ENV{ROCM_PATH}/lib) + endif() + compiler_specific_libs(Clang -lamdhip64) + elseif(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + # Compile for kernel execution on the host. 
In that case, + # memory is shared between the OpenACC space and the host space. + compiler_specific_flags(NVHPC -acc=multicore) + else() + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. + compiler_specific_flags(NVHPC -acc=gpu,multicore) + message( + STATUS + "No OpenACC target device is specificed; the OpenACC backend will be executed in an automatic fallback mode." + ) + endif() +endif() + +if(KOKKOS_ENABLE_SYCL) + if(CUDA_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda + --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} + ) + else() + message( + SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" + ) + endif() + elseif(AMDGPU_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG} ) - ELSE() - MESSAGE(SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64 - ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen - ) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" + else() + message( + SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" 
) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen11" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.50.4" - ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7" - ) - ENDIF() - ENDIF() -ENDIF() + endif() + elseif(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(DEFAULT -fsycl-targets=spir64) + elseif(KOKKOS_ARCH_INTEL_GPU) + set(SYCL_TARGET_FLAG -fsycl-targets=spir64_gen) -IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ARCH_INTEL_GEN9) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.60.7") + endif() + + if(Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG}) + compiler_specific_link_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + else() + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + endif() + endif() +endif() + +if(KOKKOS_ENABLE_CUDA AND NOT 
CUDA_ARCH_ALREADY_SPECIFIED) # Try to autodetect the CUDA Compute Capability by asking the device - SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) - FILE(REMOVE_RECURSE ${_BINARY_TEST_DIR}) - FILE(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) + set(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) + file(REMOVE_RECURSE ${_BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + try_run(_RESULT _COMPILE_RESULT ${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) # if user is using kokkos_compiler_launcher, above will fail. - IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) + if(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) # check to see if CUDA is not already enabled (may happen when Kokkos is subproject) - GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + get_property(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough - IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) + if(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) # make sure the user knows that we aren't using CUDA compiler for anything else - MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture...") - INCLUDE(CheckLanguage) - CHECK_LANGUAGE(CUDA) - IF(CMAKE_CUDA_COMPILER) - ENABLE_LANGUAGE(CUDA) - ELSE() - MESSAGE(STATUS "CUDA language could not be enabled") - ENDIF() - ENDIF() + message( + STATUS + "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. 
Enabling CUDA language ONLY to auto-detect architecture..." + ) + include(CheckLanguage) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + else() + message(STATUS "CUDA language could not be enabled") + endif() + endif() # if CUDA was enabled, this will be defined - IF(CMAKE_CUDA_COMPILER) + if(CMAKE_CUDA_COMPILER) # copy our test to .cu so cmake compiles as CUDA - CONFIGURE_FILE( + configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COPYONLY + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COPYONLY ) # run test again - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) - ENDIF() - ENDIF() + try_run(_RESULT _COMPILE_RESULT ${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) + endif() + endif() - LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) - IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) - MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") - LIST(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) - KOKKOS_SET_OPTION(ARCH_${ARCHITECTURE} ON) - CHECK_CUDA_ARCH(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) - ELSE() - MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " - "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" - "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. 
" - "If you are cross-compiling, you should try to do this on a compute node.") - ENDIF() -ENDIF() + list(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) + if(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) + message(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") + list(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) + kokkos_set_option(ARCH_${ARCHITECTURE} ON) + check_cuda_arch(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) + else() + message( + SEND_ERROR + "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " + "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" + "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. " + "If you are cross-compiling, you should try to do this on a compute node." + ) + endif() +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_KEPLER30 OR KOKKOS_ARCH_KEPLER32 OR KOKKOS_ARCH_KEPLER35 OR KOKKOS_ARCH_KEPLER37) - SET(KOKKOS_ARCH_KEPLER ON) -ENDIF() +if(KOKKOS_ARCH_KEPLER30 + OR KOKKOS_ARCH_KEPLER32 + OR KOKKOS_ARCH_KEPLER35 + OR KOKKOS_ARCH_KEPLER37 +) + set(KOKKOS_ARCH_KEPLER ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) - SET(KOKKOS_ARCH_MAXWELL ON) -ENDIF() +if(KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) + set(KOKKOS_ARCH_MAXWELL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) - SET(KOKKOS_ARCH_PASCAL ON) -ENDIF() +if(KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) + set(KOKKOS_ARCH_PASCAL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF 
(KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) - SET(KOKKOS_ARCH_VOLTA ON) -ENDIF() +if(KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) + set(KOKKOS_ARCH_VOLTA ON) +endif() -IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) - SET(KOKKOS_ARCH_AMPERE ON) -ENDIF() +if(KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) + set(KOKKOS_ARCH_AMPERE ON) +endif() -IF (KOKKOS_ARCH_HOPPER90) - SET(KOKKOS_ARCH_HOPPER ON) -ENDIF() +if(KOKKOS_ARCH_HOPPER90) + set(KOKKOS_ARCH_HOPPER ON) +endif() + +function(CHECK_AMD_APU ARCH) + set(BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/AmdApuWorkdir) + file(REMOVE_RECURSE ${BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${BINARY_TEST_DIR}) + + try_run(RESULT COMPILE_RESULT ${BINARY_TEST_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/amd_apu.cc + RUN_OUTPUT_VARIABLE AMD_APU + ) + + if(NOT COMPILE_RESULT OR NOT RESULT EQUAL 0) + message(SEND_ERROR "Autodetection of AMD APU failed." + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + + if(AMD_APU) + set(${ARCH} AMD_GFX942_APU PARENT_SCOPE) + endif() +endfunction() #HIP detection of gpu arch -IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) - FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator) - IF(NOT ROCM_ENUMERATOR) - MESSAGE(FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " - "rocm_agent_enumerator could not be found. " - "Please specify an arch manually via -DKokkos_ARCH_{..}=ON") - ELSE() - EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) - STRING(LENGTH "${GPU_ARCHS}" len_str) +if(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + find_program(ROCM_ENUMERATOR rocm_agent_enumerator) + if(NOT ROCM_ENUMERATOR) + message( + FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " "rocm_agent_enumerator could not be found. 
" + "Please specify an arch manually via -DKokkos_ARCH_{..}=ON" + ) + else() + execute_process(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) + string(LENGTH "${GPU_ARCHS}" len_str) # enumerator always output gfx000 as the first line - IF(${len_str} LESS 8) - MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. " - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - # check for known gpu archs, otherwise error out - ELSE() - SET(AMD_ARCH_DETECTED "") - FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - STRING(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) - IF("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") - SET_AND_CHECK_AMD_ARCH(${ARCH} ${FLAG}) - SET(AMD_ARCH_DETECTED ${ARCH}) - BREAK() - ENDIF() - ENDFOREACH() - IF("${AMD_ARCH_DETECTED}" STREQUAL "") - MESSAGE(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " - "is supported. " - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - ENDIF() - ENDIF() - ENDIF() -ENDIF() + if(${len_str} LESS 8) + message(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." 
+ ) + # check for known gpu archs, otherwise error out + else() + set(AMD_ARCH_DETECTED "") + foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + string(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) + if("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") + # If we detected gfx942, we need to discriminate between APU and discrete GPU + if(FLAG STREQUAL "gfx942") + check_amd_apu(ARCH) + endif() + set_and_check_amd_arch(${ARCH} ${FLAG}) + set(AMD_ARCH_DETECTED ${ARCH}) + break() + endif() + endforeach() + if("${AMD_ARCH_DETECTED}" STREQUAL "") + message(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " "is supported. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + endif() + endif() +endif() -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - STRING(REGEX MATCH "90A" IS_90A ${ARCH}) - IF(IS_90A) - SET(KOKKOS_ARCH_AMD_GFX90A ON) - SET(KOKKOS_ARCH_VEGA90A ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "908" IS_908 ${ARCH}) - IF(IS_908) - SET(KOKKOS_ARCH_AMD_GFX908 ON) - SET(KOKKOS_ARCH_VEGA908 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "906" IS_906 ${ARCH}) - IF(IS_906) - SET(KOKKOS_ARCH_AMD_GFX906 ON) - SET(KOKKOS_ARCH_VEGA906 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1100" IS_1100 ${ARCH}) - IF(IS_1100) - SET(KOKKOS_ARCH_AMD_GFX1100 ON) - SET(KOKKOS_ARCH_NAVI1100 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1030" IS_1030 ${ARCH}) - IF(IS_1030) - SET(KOKKOS_ARCH_AMD_GFX1030 ON) - SET(KOKKOS_ARCH_NAVI1030 ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + string(REGEX MATCH "90A" IS_90A ${ARCH}) + if(IS_90A) + set(KOKKOS_ARCH_AMD_GFX90A ON) + set(KOKKOS_ARCH_VEGA90A ON) + break() + endif() + string(REGEX MATCH "908" IS_908 ${ARCH}) + if(IS_908) + set(KOKKOS_ARCH_AMD_GFX908 ON) + 
set(KOKKOS_ARCH_VEGA908 ON) + break() + endif() + string(REGEX MATCH "906" IS_906 ${ARCH}) + if(IS_906) + set(KOKKOS_ARCH_AMD_GFX906 ON) + set(KOKKOS_ARCH_VEGA906 ON) + break() + endif() + string(REGEX MATCH "1100" IS_1100 ${ARCH}) + if(IS_1100) + set(KOKKOS_ARCH_AMD_GFX1100 ON) + set(KOKKOS_ARCH_NAVI1100 ON) + break() + endif() + string(REGEX MATCH "1030" IS_1030 ${ARCH}) + if(IS_1030) + set(KOKKOS_ARCH_AMD_GFX1030 ON) + set(KOKKOS_ARCH_NAVI1030 ON) + break() + endif() + endif() +endforeach() #Regardless of version, make sure we define the general architecture name -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - SET(KOKKOS_ARCH_AMD_GPU ON) - STRING(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) - IF(IS_VEGA) - SET(KOKKOS_ARCH_VEGA ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "(NAVI)" IS_NAVI ${ARCH}) - IF(IS_NAVI) - SET(KOKKOS_ARCH_NAVI ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + set(KOKKOS_ARCH_AMD_GPU "${FLAG}") + string(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) + if(IS_VEGA) + set(KOKKOS_ARCH_VEGA ON) + break() + endif() + string(REGEX MATCH "(NAVI)" IS_NAVI ${ARCH}) + if(IS_NAVI) + set(KOKKOS_ARCH_NAVI ON) + break() + endif() + endif() +endforeach() #CMake verbose is kind of pointless #Let's just always print things -MESSAGE(STATUS "Built-in Execution Spaces:") +message(STATUS "Built-in Execution Spaces:") -FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_DEVICE_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple device parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_DEVICE_PARALLEL} is already enabled. 
" - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "Cuda") - IF(KOKKOS_ENABLE_CUDA_UVM) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead") - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}UVMSpace") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") - ENDIF() - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - ENDIF() - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSEIF(${_BACKEND} STREQUAL "HIP") - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::Experimental::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() -IF(NOT _DEVICE_PARALLEL) - SET(_DEVICE_PARALLEL "NoTypeDefined") - SET(_DEFAULT_DEVICE_MEMSPACE "NoTypeDefined") -ENDIF() -MESSAGE(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") +foreach(_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_DEVICE_PARALLEL) + message( + FATAL_ERROR + "Multiple device parallel execution spaces are not allowed! " + "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_DEVICE_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." 
+ ) + endif() + if(${_BACKEND} STREQUAL "Cuda") + if(KOKKOS_ENABLE_CUDA_UVM) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead" + ) + if(NOT KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") + endif() + endif() + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + elseif(${_BACKEND} STREQUAL "HIP" OR ${_BACKEND} STREQUAL "SYCL") + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + else() + set(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") + endif() + endif() +endforeach() +if(NOT _DEVICE_PARALLEL) + set(_DEVICE_PARALLEL "NoTypeDefined") +endif() +message(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") -FOREACH (_BACKEND OpenMP Threads HPX) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_HOST_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple host parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_HOST_PARALLEL} is already enabled. " - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "HPX") - SET(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ELSE() - SET(_HOST_PARALLEL "Kokkos::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() +foreach(_BACKEND OpenMP Threads HPX) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_HOST_PARALLEL) + message( + FATAL_ERROR + "Multiple host parallel execution spaces are not allowed! " "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_HOST_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." 
+ ) + endif() + if(${_BACKEND} STREQUAL "HPX") + set(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") + else() + set(_HOST_PARALLEL "Kokkos::${_BACKEND}") + endif() + endif() +endforeach() -IF(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "At least one host execution space must be enabled, " - "but no host parallel execution space was requested " - "and Kokkos_ENABLE_SERIAL=OFF.") -ENDIF() +if(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "At least one host execution space must be enabled, " + "but no host parallel execution space was requested " "and Kokkos_ENABLE_SERIAL=OFF." + ) +endif() -IF(_HOST_PARALLEL) -MESSAGE(STATUS " Host Parallel: ${_HOST_PARALLEL}") -ELSE() - SET(_HOST_PARALLEL "NoTypeDefined") - MESSAGE(STATUS " Host Parallel: NoTypeDefined") -ENDIF() +if(_HOST_PARALLEL) + message(STATUS " Host Parallel: ${_HOST_PARALLEL}") +else() + set(_HOST_PARALLEL "NoTypeDefined") + message(STATUS " Host Parallel: NoTypeDefined") +endif() -IF(KOKKOS_ENABLE_SERIAL) - MESSAGE(STATUS " Host Serial: SERIAL") -ELSE() - MESSAGE(STATUS " Host Serial: NONE") -ENDIF() +if(KOKKOS_ENABLE_SERIAL) + message(STATUS " Host Serial: SERIAL") +else() + message(STATUS " Host Serial: NONE") +endif() -MESSAGE(STATUS "") -MESSAGE(STATUS "Architectures:") -FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST}) - MESSAGE(STATUS " ${Arch}") -ENDFOREACH() +message(STATUS "") +message(STATUS "Architectures:") +foreach(Arch ${KOKKOS_ENABLED_ARCH_LIST}) + message(STATUS " ${Arch}") +endforeach() - -IF(KOKKOS_ENABLE_ATOMICS_BYPASS) - IF(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") - MESSAGE(FATAL_ERROR "Not allowed to disable atomics (via -DKokkos_ENABLE_AROMICS_BYPASS=ON) if neither a host parallel nor a device backend is enabled!") - ENDIF() - IF(NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "Implementation bug") # safeguard - ENDIF() - MESSAGE(STATUS "Atomics: **DISABLED**") -ENDIF() 
+if(KOKKOS_ENABLE_ATOMICS_BYPASS) + if(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") + message( + FATAL_ERROR + "Disabling atomics (via -DKokkos_ENABLE_ATOMICS_BYPASS=ON) is not allowed if a host parallel or a device backend is enabled!" + ) + endif() + if(NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "Implementation bug") # safeguard + endif() + message(STATUS "Atomics: **DISABLED**") +endif() diff --git a/lib/kokkos/cmake/kokkos_check_env.cmake b/lib/kokkos/cmake/kokkos_check_env.cmake index a455a403b9..f1a309ff85 100644 --- a/lib/kokkos/cmake/kokkos_check_env.cmake +++ b/lib/kokkos/cmake/kokkos_check_env.cmake @@ -1,12 +1,15 @@ -SET(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) -IF (CRAYPE_VERSION) - SET(KOKKOS_IS_CRAYPE TRUE) - SET(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) - IF (CRAYPE_LINK_TYPE) - IF (NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") - MESSAGE(WARNING "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() - ELSE() - MESSAGE(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() -ENDIF() +set(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) +if(CRAYPE_VERSION) + set(KOKKOS_IS_CRAYPE TRUE) + set(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) + if(CRAYPE_LINK_TYPE) + if(NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") + message( + WARNING + "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. Linking is likely to fail unless this is set to 'dynamic'" + ) + endif() + else() + message(WARNING "CRAYPE_LINK_TYPE is not set. 
Linking is likely to fail unless this is set to 'dynamic'") + endif() +endif() diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index e8bfadb64e..010ed33ede 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -1,262 +1,273 @@ -KOKKOS_CFG_DEPENDS(COMPILER_ID NONE) +kokkos_cfg_depends(COMPILER_ID NONE) -SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) -SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) -SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) +set(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) +set(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) +set(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) -MACRO(kokkos_internal_have_compiler_nvcc) +macro(kokkos_internal_have_compiler_nvcc) # Check if the compiler is nvcc (which really means nvcc_wrapper). - EXECUTE_PROCESS(COMMAND ${ARGN} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) - STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") - IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) - SET(INTERNAL_HAVE_COMPILER_NVCC true) - ELSE() - SET(INTERNAL_HAVE_COMPILER_NVCC false) - ENDIF() -ENDMACRO() + execute_process(COMMAND ${ARGN} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + string(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + if(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + set(INTERNAL_HAVE_COMPILER_NVCC true) + else() + 
set(INTERNAL_HAVE_COMPILER_NVCC false) + endif() +endmacro() -IF(Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # kokkos_enable_options is not yet called so use lower case here - IF(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) kokkos_internal_have_compiler_nvcc(${CMAKE_CUDA_COMPILER}) - ELSE() + else() # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) # Check if compiler was set to nvcc_wrapper kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) # If launcher was found and nvcc_wrapper was not specified as # compiler and `CMAKE_CXX_COMPILIER_LAUNCHER` is not set, set to use launcher. # Will ensure CMAKE_CXX_COMPILER is replaced by nvcc_wrapper - IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(CMAKE_CXX_COMPILER_LAUNCHER) - MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!") - ENDIF() + if(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(CMAKE_CXX_COMPILER_LAUNCHER) + message( + FATAL_ERROR + "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!" 
+ ) + endif() # the first argument to launcher is always the C++ compiler defined by cmake # if the second argument matches the C++ compiler, it forwards the rest of the # args to nvcc_wrapper kokkos_internal_have_compiler_nvcc( - ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) - SET(INTERNAL_USE_COMPILER_LAUNCHER true) - ENDIF() - ENDIF() -ENDIF() + ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} + -DKOKKOS_DEPENDENCE + ) + set(INTERNAL_USE_COMPILER_LAUNCHER true) + endif() + endif() +endif() -IF(INTERNAL_HAVE_COMPILER_NVCC) +if(INTERNAL_HAVE_COMPILER_NVCC) # Save the host compiler id before overwriting it. - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) # SET the compiler id to nvcc. We use the value used by CMake 3.8. - SET(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) + set(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) - STRING(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") - IF(INTERNAL_USE_COMPILER_LAUNCHER) - MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") + string(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + string(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") + if(INTERNAL_USE_COMPILER_LAUNCHER) + message(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is 
enabled...") kokkos_compilation(GLOBAL) - ENDIF() -ENDIF() + endif() +endif() -IF(Kokkos_ENABLE_HIP) +if(Kokkos_ENABLE_HIP) # get HIP version - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) - IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) - SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) - ENDIF() + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) + if(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) + set(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) + endif() - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) # The Cray compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c Cray - OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_CRAY_COMPILER) #not 
actually Clang - SET(KOKKOS_CLANG_IS_CRAY TRUE) - ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c Cray + OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_CRAY TRUE) + set(KOKKOS_CXX_COMPILER_ID CrayClang) + endif() # The clang based Intel compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c "DPC++\\|icpx" - OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_INTEL TRUE) - SET(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - ENDIF() -ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c "DPC++\\|icpx" + OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_INTEL TRUE) + set(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + endif() +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) # SET Cray's compiler version. 
- EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - IF (KOKKOS_CLANG_IS_CRAY) - SET(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) - ELSE() - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - ENDIF() -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + if(KOKKOS_CLANG_IS_CRAY) + set(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) + else() + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + endif() +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) # SET Fujitsus compiler version which is not detected by CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) +endif() # Enforce the minimum compilers supported by Kokkos. 
-IF(NOT CMAKE_CXX_STANDARD) - SET(CMAKE_CXX_STANDARD 17) -ENDIF() -IF(CMAKE_CXX_STANDARD EQUAL 17) - SET(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) - SET(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) - SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) - SET(KOKKOS_GCC_MINIMUM 8.2.0) - SET(KOKKOS_INTEL_MINIMUM 19.0.5) - SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) - SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) - SET(KOKKOS_NVCC_MINIMUM 11.0.0) - SET(KOKKOS_HIPCC_MINIMUM 5.2.0) - SET(KOKKOS_NVHPC_MINIMUM 22.3) - SET(KOKKOS_MSVC_MINIMUM 19.29) -ELSE() - SET(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) - SET(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) - SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) - SET(KOKKOS_GCC_MINIMUM 10.1.0) - SET(KOKKOS_INTEL_MINIMUM "not supported") - SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) - SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) - SET(KOKKOS_NVCC_MINIMUM 12.0.0) - SET(KOKKOS_HIPCC_MINIMUM 5.2.0) - SET(KOKKOS_NVHPC_MINIMUM 22.3) - SET(KOKKOS_MSVC_MINIMUM 19.30) -ENDIF() +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() +if(CMAKE_CXX_STANDARD EQUAL 17) + set(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + set(KOKKOS_GCC_MINIMUM 8.2.0) + set(KOKKOS_INTEL_MINIMUM 19.0.5) + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 11.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.29) +else() + set(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + set(KOKKOS_GCC_MINIMUM 10.1.0) + set(KOKKOS_INTEL_MINIMUM "not supported") + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 12.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.30) +endif() -SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos for 
C++${CMAKE_CXX_STANDARD}. Required minimum compiler versions:") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") +set(KOKKOS_MESSAGE_TEXT + "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. 
Required minimum compiler versions:" +) +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - IF((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) - MESSAGE(FATAL_ERROR 
"${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() - SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + if((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) + 
if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() + set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() # Treat PGI internally as NVHPC to simplify handling both compilers. # Before CMake 3.20 NVHPC was identified as PGI, nvc++ is # backward-compatible to pgc++. 
- SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ENDIF() + set(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +endif() -IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) -ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) - SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ENDIF() +if(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) +elseif(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) + set(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +endif() -STRING(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) -LIST(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) -LIST(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) -LIST(LENGTH VERSION_LIST LIST_LENGTH) +string(REPLACE "." 
";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) +list(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) +list(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) +list(LENGTH VERSION_LIST LIST_LENGTH) # On Android, the compiler doesn't have a patch version, just a major/minor -IF(LIST_LENGTH GREATER 2) - LIST(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) -ELSE() - SET(KOKKOS_COMPILER_VERSION_PATCH 0) -ENDIF() - +if(LIST_LENGTH GREATER 2) + list(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) +else() + set(KOKKOS_COMPILER_VERSION_PATCH 0) +endif() diff --git a/lib/kokkos/cmake/kokkos_configure_trilinos.cmake b/lib/kokkos/cmake/kokkos_configure_trilinos.cmake new file mode 100644 index 0000000000..5aeef61e7b --- /dev/null +++ b/lib/kokkos/cmake/kokkos_configure_trilinos.cmake @@ -0,0 +1,38 @@ +if(CMAKE_PROJECT_NAME STREQUAL "Trilinos") + set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "Whether to build Serial backend" FORCE) + + if(NOT ${Trilinos_ENABLE_OpenMP} STREQUAL "") + set(Kokkos_ENABLE_OPENMP ${Trilinos_ENABLE_OpenMP} CACHE BOOL "Whether to build OpenMP backend" FORCE) + else() + set(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "Whether to build OpenMP backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_CUDA} STREQUAL "") + set(Kokkos_ENABLE_CUDA ${TPL_ENABLE_CUDA} CACHE BOOL "Whether to build CUDA backend" FORCE) + else() + set(Kokkos_ENABLE_CUDA OFF CACHE BOOL "Whether to build CUDA backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_HPX} STREQUAL "") + set(Kokkos_ENABLE_HPX ${TPL_ENABLE_HPX} CACHE BOOL "Whether to build HPX backend" FORCE) + else() + set(Kokkos_ENABLE_HPX OFF CACHE BOOL "Whether to build HPX backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_quadmath} STREQUAL "") + set(Kokkos_ENABLE_LIBQUADMATH ${TPL_ENABLE_quadmath} CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + else() + set(Kokkos_ENABLE_LIBQUADMATH OFF CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + endif() + + if(NOT ${TPL_ENABLE_DLlib} STREQUAL "") + 
set(Kokkos_ENABLE_LIBDL ${TPL_ENABLE_DLlib} CACHE BOOL "Whether to enable the LIBDL library" FORCE) + else() + set(Kokkos_ENABLE_LIBDL OFF CACHE BOOL "Whether to enable the LIBDL library" FORCE) + endif() + + set(Kokkos_ENABLE_COMPLEX_ALIGN OFF CACHE BOOL "Whether to align Kokkos::complex to 2*alignof(RealType)") + + # FIXME_TRILINOS We run into problems when trying to use an external GTest in Trilinos CI + set(CMAKE_DISABLE_FIND_PACKAGE_GTest ON) +endif() diff --git a/lib/kokkos/cmake/kokkos_corner_cases.cmake b/lib/kokkos/cmake/kokkos_corner_cases.cmake index ede2b4e0ca..530e9e8fd8 100644 --- a/lib/kokkos/cmake/kokkos_corner_cases.cmake +++ b/lib/kokkos/cmake/kokkos_corner_cases.cmake @@ -1,4 +1,8 @@ -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.2) - MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. See https://github.com/kokkos/kokkos/issues/3496") -ENDIF() - +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 11.2 +) + message( + WARNING + "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. 
See https://github.com/kokkos/kokkos/issues/3496" + ) +endif() diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index c7d189285c..40c2d3ea8a 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -1,128 +1,132 @@ - -FUNCTION(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME}) - LIST(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) +function(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME}) + list(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) - IF (KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") - SET(KOKKOS_HAS_HOST ON PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + set(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) + if(KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") + set(KOKKOS_HAS_HOST ON PARENT_SCOPE) + endif() +endfunction() -KOKKOS_CFG_DEPENDS(DEVICES NONE) +kokkos_cfg_depends(DEVICES NONE) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(DEVICES ENABLE) +kokkos_deprecated_list(DEVICES ENABLE) - -KOKKOS_DEVICE_OPTION(THREADS OFF HOST "Whether to build C++ threads backend") +kokkos_device_option(THREADS OFF HOST "Whether to build C++ threads backend") # detect clang++ / cl / clang-cl clashes -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") # 
this specific test requires CMake >= 3.15 - IF ("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") + if("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") # use pure clang++ instead of clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC OFF) - ELSE() + set(KOKKOS_COMPILER_CLANG_MSVC OFF) + else() # it defaults to clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC ON) - ENDIF() -ENDIF() - -IF(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) - SET(OMP_DEFAULT ON) -ELSE() - SET(OMP_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") + set(KOKKOS_COMPILER_CLANG_MSVC ON) + endif() +endif() +if(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) + set(OMP_DEFAULT ON) +else() + set(OMP_DEFAULT OFF) +endif() +kokkos_device_option(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") # We want this to default to OFF for cache reasons, but if no # host space is given, then activate serial -IF (KOKKOS_HAS_TRILINOS) - #However, Trilinos always wants Serial ON - SET(SERIAL_DEFAULT ON) -ELSEIF (KOKKOS_HAS_HOST) - SET(SERIAL_DEFAULT OFF) -ELSE() - SET(SERIAL_DEFAULT ON) - IF (NOT DEFINED Kokkos_ENABLE_SERIAL) - MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") - ENDIF() -ENDIF() -KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") +if(KOKKOS_HAS_HOST) + set(SERIAL_DEFAULT OFF) +else() + set(SERIAL_DEFAULT ON) + if(NOT DEFINED Kokkos_ENABLE_SERIAL) + message( + STATUS + "SERIAL backend is being turned on to ensure there is at least one Host space. 
To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt" + ) + endif() +endif() +kokkos_device_option(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") -KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") +kokkos_device_option(HPX OFF HOST "Whether to build HPX backend (experimental)") # Device backends have to come after host backends for header include order reasons # Without this we can't make e.g. CudaSpace accessible by HostSpace -KOKKOS_DEVICE_OPTION(OPENACC OFF DEVICE "Whether to build the OpenACC backend") -IF (KOKKOS_ENABLE_OPENACC) - COMPILER_SPECIFIC_FLAGS( - Clang -fopenacc -fopenacc-fake-async-wait - -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version - -Wno-pass-failed +kokkos_device_option(OPENACC OFF DEVICE "Whether to build the OpenACC backend") +if(KOKKOS_ENABLE_OPENACC) + compiler_specific_flags( + Clang + -fopenacc + -fopenacc-fake-async-wait + -fopenacc-implicit-worker=vector + -Wno-openacc-and-cxx + -Wno-openmp-mapping + -Wno-unknown-cuda-version + -Wno-pass-failed ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) +endif() + +kokkos_device_option(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") +if(KOKKOS_ENABLE_OPENMPTARGET) + set(ClangOpenMPFlag -fopenmp=libomp) + if(KOKKOS_CLANG_IS_CRAY) + set(ClangOpenMPFlag -fopenmp) + endif() + + compiler_specific_flags( + Clang + ${ClangOpenMPFlag} + -Wno-openmp-mapping + IntelLLVM + -fiopenmp + -Wno-openmp-mapping + NVHPC + -mp=gpu + DEFAULT + -fopenmp ) -ENDIF() + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) + # Are there compilers which identify as Clang and need this library? 
+ # COMPILER_SPECIFIC_LIBS( + # Clang -lopenmptarget + # ) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") + endif() +endif() -KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(ClangOpenMPFlag -fopenmp=libomp) - IF(KOKKOS_CLANG_IS_CRAY) - SET(ClangOpenMPFlag -fopenmp) - ENDIF() +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) + set(CUDA_DEFAULT ON) +else() + set(CUDA_DEFAULT OFF) +endif() +kokkos_device_option(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") - COMPILER_SPECIFIC_FLAGS( - Clang ${ClangOpenMPFlag} -Wno-openmp-mapping - IntelLLVM -fiopenmp -Wno-openmp-mapping - NVHPC -mp=gpu - DEFAULT -fopenmp - ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG - ) -# Are there compilers which identify as Clang and need this library? -# COMPILER_SPECIFIC_LIBS( -# Clang -lopenmptarget -# ) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") - ENDIF() -ENDIF() +if(KOKKOS_ENABLE_CUDA) + global_set(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") + ## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros + list(APPEND DEVICE_SETUP_LIST Cuda) +endif() -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_DEFAULT ON) -ELSE() - SET(CUDA_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") - -IF (KOKKOS_ENABLE_CUDA) - GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") -## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros - LIST(APPEND DEVICE_SETUP_LIST Cuda) -ENDIF() - -KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") +kokkos_device_option(HIP OFF DEVICE "Whether to build HIP backend") ## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros -IF (KOKKOS_ENABLE_HIP) - LIST(APPEND DEVICE_SETUP_LIST HIP) -ENDIF() 
+if(KOKKOS_ENABLE_HIP) + list(APPEND DEVICE_SETUP_LIST HIP) +endif() -KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") +kokkos_device_option(SYCL OFF DEVICE "Whether to build SYCL backend") ## SYCL has extra setup requirements, turn on Kokkos_Setup_SYCL.hpp in macros -IF (KOKKOS_ENABLE_SYCL) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!") - ENDIF() - LIST(APPEND DEVICE_SETUP_LIST SYCL) -ENDIF() +if(KOKKOS_ENABLE_SYCL) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "SYCL backend requires C++17 or newer!") + endif() + list(APPEND DEVICE_SETUP_LIST SYCL) +endif() diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index 53764b0c68..a5d6fdfe4e 100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -1,198 +1,236 @@ ########################## NOTES ############################################### # List the options for configuring kokkos using CMake method of doing it. -# These options then get mapped onto KOKKOS_SETTINGS environment variable by -# kokkos_settings.cmake. It is separate to allow other packages to override -# these variables (e.g., TriBITS). 
########################## AVAILABLE OPTIONS ################################### # Use lists for documentation, verification, and programming convenience - -FUNCTION(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - LIST(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) +function(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) + list(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) -ENDFUNCTION() + set(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) +endfunction() # Certain defaults will depend on knowing the enabled devices -KOKKOS_CFG_DEPENDS(OPTIONS DEVICES) -KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID) +kokkos_cfg_depends(OPTIONS DEVICES) +kokkos_cfg_depends(OPTIONS COMPILER_ID) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE) +kokkos_deprecated_list(OPTIONS ENABLE) -KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") -KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") -KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") +kokkos_enable_option(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") +kokkos_enable_option(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") 
+kokkos_enable_option(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") # In contrast to other CUDA-dependent, options CUDA_LAMBDA is ON by default. # That is problematic when CUDA is not enabled because this not only yields a # bogus warning, but also exports the Kokkos_ENABLE_CUDA_LAMBDA variable and -# sets it to ON. This if-clause is a crutch that delays the refactoring of the -# way we declare all options until after we get rid of TriBITS. -IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSEIF (KOKKOS_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSE() - SET(CUDA_LAMBDA_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**") +# sets it to ON. +kokkos_enable_option( + CUDA_LAMBDA ${KOKKOS_ENABLE_CUDA} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**" +) -# May be used to disable our use of CudaMallocAsync. It had caused issues in -# the past when UCX was used as MPI communication layer. We expect it is -# resolved but we keep the option around a bit longer to be safe. -KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") -KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") +# As of 09/2024, cudaMallocAsync causes issues with ICP and older version of UCX +# as MPI communication layer. 
+kokkos_enable_option(IMPL_CUDA_MALLOC_ASYNC OFF "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") +kokkos_enable_option(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") +kokkos_enable_option(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) -KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) -KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") -KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") -KOKKOS_ENABLE_OPTION(BENCHMARKS OFF "Whether to build the benchmarks") -KOKKOS_ENABLE_OPTION(EXAMPLES OFF "Whether to build the examples") -STRING(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) -IF(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - KOKKOS_ENABLE_OPTION(DEBUG ON "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") -ELSE() - KOKKOS_ENABLE_OPTION(DEBUG OFF "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") -ENDIF() -UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -KOKKOS_ENABLE_OPTION(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") -KOKKOS_ENABLE_OPTION(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") -KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") -KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") -KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") -KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") -KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS 
OFF "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time") -KOKKOS_ENABLE_OPTION(IMPL_HIP_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for HIP") +kokkos_enable_option(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available") +kokkos_enable_option(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings") +kokkos_enable_option(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") + +# Disabling RDC only works properly since oneAPI 2024.1.0 +if(KOKKOS_ENABLE_SYCL AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 2024.1.0 +) + set(SYCL_RDC_DEFAULT ON) +else() + set(SYCL_RDC_DEFAULT OFF) +endif() +kokkos_enable_option( + SYCL_RELOCATABLE_DEVICE_CODE ${SYCL_RDC_DEFAULT} "Whether to enable relocatable device code (RDC) for SYCL" +) +kokkos_enable_option(TESTS OFF "Whether to build the unit tests") +kokkos_enable_option(BENCHMARKS OFF "Whether to build the benchmarks") +kokkos_enable_option(EXAMPLES OFF "Whether to build the examples") +string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) +if(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + kokkos_enable_option(DEBUG ON "Whether to activate extra debug features - may increase compile times") + kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") +else() + kokkos_enable_option(DEBUG OFF "Whether to activate extra debug features - may increase compile times") + kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") +endif() +unset(_UPPERCASE_CMAKE_BUILD_TYPE) +kokkos_enable_option(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") +kokkos_enable_option(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") +kokkos_enable_option(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") +kokkos_enable_option(TUNING OFF "Whether to 
create bindings for tuning tools") +kokkos_enable_option(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") +kokkos_enable_option(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") +kokkos_enable_option( + HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF + "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time" +) +kokkos_enable_option(IMPL_HIP_MALLOC_ASYNC OFF "Whether to enable hipMallocAsync") +kokkos_enable_option(OPENACC_FORCE_HOST_AS_DEVICE OFF "Whether to force to use host as a target device for OpenACC") # This option will go away eventually, but allows fallback to old implementation when needed. -KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") -KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") -KOKKOS_ENABLE_OPTION(IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting") +kokkos_enable_option(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +kokkos_enable_option( + ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases" +) +kokkos_enable_option( + IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting" +) mark_as_advanced(Kokkos_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY) -KOKKOS_ENABLE_OPTION(IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction.") +kokkos_enable_option( + IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF + "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction." 
+) mark_as_advanced(Kokkos_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND) -KOKKOS_ENABLE_OPTION(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") -KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") -KOKKOS_ENABLE_OPTION(IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan") +kokkos_enable_option(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") +kokkos_enable_option(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") +kokkos_enable_option( + IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan" +) mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN) mark_as_advanced(Kokkos_ENABLE_MDSPAN_EXTERNAL) mark_as_advanced(Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) -IF (Trilinos_ENABLE_Kokkos) - SET(COMPLEX_ALIGN_DEFAULT OFF) -ELSE() - SET(COMPLEX_ALIGN_DEFAULT ON) -ENDIF() -KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT} "Whether to align Kokkos::complex to 2*alignof(RealType)") +kokkos_enable_option(COMPLEX_ALIGN ON "Whether to align Kokkos::complex to 2*alignof(RealType)") -IF (KOKKOS_ENABLE_TESTS) - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) -ELSE() - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests") -IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) - MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. 
Option will be ignored.") -ENDIF() +if(KOKKOS_ENABLE_TESTS) + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) +else() + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) +endif() +kokkos_enable_option( + HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests" +) +if(NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) + message( + WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. Option will be ignored." + ) +endif() -IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) - SET(CUDA_CONSTEXPR_DEFAULT ON) -ELSE() - SET(CUDA_CONSTEXPR_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") +if(KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) + set(CUDA_CONSTEXPR_DEFAULT ON) +else() + set(CUDA_CONSTEXPR_DEFAULT OFF) +endif() +kokkos_enable_option( + CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions" +) -IF (KOKKOS_ENABLE_HPX) - SET(HPX_ASYNC_DISPATCH_DEFAULT ON) -ELSE() - SET(HPX_ASYNC_DISPATCH_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") +if(KOKKOS_ENABLE_HPX) + set(HPX_ASYNC_DISPATCH_DEFAULT ON) +else() + set(HPX_ASYNC_DISPATCH_DEFAULT OFF) +endif() +kokkos_enable_option(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") -Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") +kokkos_enable_option(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") -FUNCTION(check_device_specific_options) - CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN}) - IF(NOT KOKKOS_ENABLE_${SOME_DEVICE}) - FOREACH(OPTION ${SOME_OPTIONS}) - IF(NOT 
DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) - MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") - ENDIF() - IF(KOKKOS_ENABLE_${OPTION}) - MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored.") - UNSET(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() +function(check_device_specific_options) + cmake_parse_arguments(SOME "" "DEVICE" "OPTIONS" ${ARGN}) + if(NOT KOKKOS_ENABLE_${SOME_DEVICE}) + foreach(OPTION ${SOME_OPTIONS}) + if(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) + message(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") + endif() + if(KOKKOS_ENABLE_${OPTION}) + message( + WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored." + ) + unset(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) + endif() + endforeach() + endif() +endfunction() -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +check_device_specific_options( + DEVICE + CUDA + OPTIONS + CUDA_UVM + CUDA_RELOCATABLE_DEVICE_CODE + CUDA_LAMBDA + CUDA_CONSTEXPR + CUDA_LDG_INTRINSIC + IMPL_CUDA_MALLOC_ASYNC + IMPL_CUDA_UNIFIED_MEMORY +) +check_device_specific_options( + DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE HIP_MULTIPLE_KERNEL_INSTANTIATIONS IMPL_HIP_MALLOC_ASYNC +) +check_device_specific_options(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +check_device_specific_options(DEVICE OPENACC OPTIONS OPENACC_FORCE_HOST_AS_DEVICE) # Needed due to change from deprecated name to new header define name -IF 
(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) - SET(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) -ENDIF() +if(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) + set(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) +endif() # Force consistency of KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE # and CMAKE_CUDA_SEPARABLE_COMPILATION when we are compiling # using the CMake CUDA language support. # Either one being on will turn the other one on. -IF (KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - IF (NOT CMAKE_CUDA_SEPARABLE_COMPILATION) - MESSAGE(STATUS "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support") - SET(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - ELSE() - IF (CMAKE_CUDA_SEPARABLE_COMPILATION) - SET(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) - ENDIF() - ENDIF() -ENDIF() +if(KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + if(NOT CMAKE_CUDA_SEPARABLE_COMPILATION) + message( + STATUS + "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support" + ) + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + endif() + else() + if(CMAKE_CUDA_SEPARABLE_COMPILATION) + set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) + endif() + endif() +endif() # This is known to occur with Clang 9. 
We would need to use nvcc as the linker # http://lists.llvm.org/pipermail/cfe-dev/2018-June/058296.html # TODO: Through great effort we can use a different linker by hacking # CMAKE_CXX_LINK_EXECUTABLE in a future release -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - MESSAGE(FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC") -ENDIF() +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + message( + FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC" + ) +endif() -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) - MESSAGE(FATAL_ERROR "Relocatable device code requires static libraries.") -ENDIF() +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) + message(FATAL_ERROR "Relocatable device code requires static libraries.") +endif() -IF(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") - ENDIF() -ENDIF() -IF(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON") +if(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. 
LDG intrinsics are always enabled.") + endif() +endif() +if(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON" + ) set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Kokkos turned Cuda lambda support ON!" FORCE) set(KOKKOS_ENABLE_CUDA_LAMBDA ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") - ENDIF() -ENDIF() + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") + endif() +endif() - -IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) - MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") -ENDIF() +if(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) + message(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") +endif() diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index d1f1e0d7a7..38eedd8362 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -5,12 +5,8 @@ # Validate options are given with correct case and define an internal # upper-case version for use within -set(Kokkos_OPTIONS_NOT_TO_EXPORT - Kokkos_ENABLE_BENCHMARKS - Kokkos_ENABLE_EXAMPLES - Kokkos_ENABLE_TESTS - Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS - Kokkos_ENABLE_COMPILER_WARNINGS +set(Kokkos_OPTIONS_NOT_TO_EXPORT Kokkos_ENABLE_BENCHMARKS Kokkos_ENABLE_EXAMPLES Kokkos_ENABLE_TESTS + Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS Kokkos_ENABLE_COMPILER_WARNINGS ) # @@ -22,139 +18,122 @@ set(Kokkos_OPTIONS_NOT_TO_EXPORT # It attempts to print a helpful message about updating the options for the new CMake. 
# Kokkos_${SUFFIX} is the name of the option (like Kokkos_ARCH) being checked. # Kokkos_${PREFIX}_X is the name of new option to be defined from a list X,Y,Z,... -FUNCTION(kokkos_deprecated_list SUFFIX PREFIX) - SET(CAMEL_NAME Kokkos_${SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +function(kokkos_deprecated_list SUFFIX PREFIX) + set(CAMEL_NAME Kokkos_${SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) #I don't love doing it this way but better to be safe - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - STRING(REPLACE "," ";" optlist "${${opt}}") - SET(ERROR_MSG "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:") - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - STRING(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") - ENDFOREACH() - STRING(APPEND ERROR_MSG "\nRemove CMakeCache.txt and re-run. For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it).") - IF (KOKKOS_HAS_TRILINOS) - MESSAGE(WARNING ${ERROR_MSG}) - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - SET(${CAMEL_NAME}_${ENTRY_UC} ON CACHE BOOL "Deprecated Trilinos translation") - ENDFOREACH() - UNSET(${opt} CACHE) - ELSE() - MESSAGE(SEND_ERROR ${ERROR_MSG}) - ENDIF() - ENDIF() - ENDFOREACH() -ENDFUNCTION() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + string(REPLACE "," ";" optlist "${${opt}}") + set(ERROR_MSG + "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:" + ) + foreach(entry ${optlist}) + string(TOUPPER ${entry} ENTRY_UC) + string(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") + endforeach() + string( + APPEND + ERROR_MSG + "\nRemove CMakeCache.txt and re-run. 
For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it)." + ) + message(SEND_ERROR ${ERROR_MSG}) + endif() + endforeach() +endfunction() -FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +function(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES ${TYPE}) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES ${TYPE}) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) # Make sure this appears in the cache with the appropriate DOCSTRING - SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) - - IF (KOKKOS_HAS_TRILINOS) - IF (NOT CAMEL_NAME IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - TRIBITS_PKG_EXPORT_CACHE_VAR(${CAMEL_NAME}) - ENDIF() - ENDIF() + set(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... 
we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." + ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() -INCLUDE (CMakeDependentOption) -FUNCTION(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +include(CMakeDependentOption) +function(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES 
BOOL) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES BOOL) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - CMAKE_DEPENDENT_OPTION(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) + cmake_dependent_option(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." 
+ ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() -FUNCTION(kokkos_set_option CAMEL_SUFFIX VALUE) - LIST(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) - IF(OPTION_INDEX EQUAL -1) - MESSAGE(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") - ENDIF() - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +function(kokkos_set_option CAMEL_SUFFIX VALUE) + list(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) + if(OPTION_INDEX EQUAL -1) + message(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") + endif() + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) - LIST(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) - LIST(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) - SET(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) - MESSAGE(STATUS "Setting ${CAMEL_NAME}=${VALUE}") - SET(${UC_NAME} ${VALUE} PARENT_SCOPE) -ENDFUNCTION() + list(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) + list(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) + set(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) + message(STATUS "Setting ${CAMEL_NAME}=${VALUE}") + set(${UC_NAME} ${VALUE} PARENT_SCOPE) +endfunction() -FUNCTION(kokkos_append_config_line LINE) - GLOBAL_APPEND(KOKKOS_TPL_EXPORTS "${LINE}") -ENDFUNCTION() +function(kokkos_append_config_line LINE) + global_append(KOKKOS_TPL_EXPORTS "${LINE}") +endfunction() -MACRO(kokkos_export_cmake_tpl NAME) +macro(kokkos_export_cmake_tpl NAME) cmake_parse_arguments(KOKKOS_EXTRA_ARG "REQUIRED" "" "COMPONENTS" ${ARGN}) #CMake TPLs are located with a call to find_package @@ -163,91 +142,88 @@ 
MACRO(kokkos_export_cmake_tpl NAME) #If Kokkos was configured to find the TPL through a _DIR variable #make sure thar DIR variable is available to downstream packages - IF (DEFINED ${NAME}_DIR) + if(DEFINED ${NAME}_DIR) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_DIR)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_DIR ${${NAME}_DIR})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_DIR)") + kokkos_append_config_line(" SET(${NAME}_DIR ${${NAME}_DIR})") + kokkos_append_config_line("ENDIF()") + endif() - IF (DEFINED ${NAME}_ROOT) + if(DEFINED ${NAME}_ROOT) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_ROOT)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_ROOT ${${NAME}_ROOT})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - SET(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_ROOT)") + kokkos_append_config_line(" SET(${NAME}_ROOT ${${NAME}_ROOT})") + kokkos_append_config_line("ENDIF()") + endif() + set(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") - IF(KOKKOS_EXTRA_ARG_REQUIRED) - STRING(APPEND KOKKOS_CONFIG_STRING " REQUIRED") - ENDIF() - IF(KOKKOS_EXTRA_ARG_COMPONENTS) - STRING(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") - ENDIF() - STRING(APPEND KOKKOS_CONFIG_STRING ")") - KOKKOS_APPEND_CONFIG_LINE(${KOKKOS_CONFIG_STRING}) -ENDMACRO() + if(KOKKOS_EXTRA_ARG_REQUIRED) + string(APPEND KOKKOS_CONFIG_STRING " REQUIRED") + endif() + if(KOKKOS_EXTRA_ARG_COMPONENTS) + string(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") + endif() + string(APPEND 
KOKKOS_CONFIG_STRING ")") + kokkos_append_config_line(${KOKKOS_CONFIG_STRING}) +endmacro() -MACRO(kokkos_export_imported_tpl NAME) - IF (NOT KOKKOS_HAS_TRILINOS) - GET_TARGET_PROPERTY(LIB_IMPORTED ${NAME} IMPORTED) - IF (NOT LIB_IMPORTED) - # This is not an imported target - # This an interface library that we created - INSTALL( - TARGETS ${NAME} - EXPORT KokkosTargets - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - ) - ELSE() - #make sure this also gets "exported" in the config file - KOKKOS_APPEND_CONFIG_LINE("IF(NOT TARGET ${NAME})") +macro(kokkos_export_imported_tpl NAME) + get_target_property(LIB_IMPORTED ${NAME} IMPORTED) + if(NOT LIB_IMPORTED) + # This is not an imported target + # This an interface library that we created + install( + TARGETS ${NAME} + EXPORT KokkosTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + else() + #make sure this also gets "exported" in the config file + kokkos_append_config_line("IF(NOT TARGET ${NAME})") - GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE) - IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - ELSE() - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) - IF(TPL_LIBRARY) - KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") - ENDIF() - ENDIF() + get_target_property(LIB_TYPE ${NAME} TYPE) + if(${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") + kokkos_append_config_line("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + else() + kokkos_append_config_line("ADD_LIBRARY(${NAME} 
UNKNOWN IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + get_target_property(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) + if(TPL_LIBRARY) + kokkos_append_config_line("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") + endif() + endif() - GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) - IF(TPL_INCLUDES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") - ENDIF() + get_target_property(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) + if(TPL_INCLUDES) + kokkos_append_config_line("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") + endif() - GET_TARGET_PROPERTY(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) - IF(TPL_COMPILE_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") - ENDIF() + get_target_property(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) + if(TPL_COMPILE_OPTIONS) + kokkos_append_config_line("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") + endif() - SET(TPL_LINK_OPTIONS) - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) - IF(TPL_LINK_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") - ENDIF() - - GET_TARGET_PROPERTY(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) - IF(TPL_LINK_LIBRARIES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") - ENDIF() - KOKKOS_APPEND_CONFIG_LINE(")") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - ENDIF() -ENDMACRO() + set(TPL_LINK_OPTIONS) + get_target_property(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) + if(TPL_LINK_OPTIONS) + kokkos_append_config_line("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") + endif() + get_target_property(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) + if(TPL_LINK_LIBRARIES) + kokkos_append_config_line("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") + endif() + kokkos_append_config_line(")") + kokkos_append_config_line("ENDIF()") + endif() 
+endmacro() # # @MACRO: KOKKOS_IMPORT_TPL() @@ -271,57 +247,43 @@ ENDMACRO() # # If specified, this TPL will build an INTERFACE library rather than an # IMPORTED target -IF (KOKKOS_HAS_TRILINOS) -MACRO(kokkos_import_tpl NAME) - #do nothing -ENDMACRO() -ELSE() -MACRO(kokkos_import_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT;INTERFACE" - "" - "" - ${ARGN}) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() +macro(kokkos_import_tpl NAME) + cmake_parse_arguments(TPL "NO_EXPORT;INTERFACE" "" "" ${ARGN}) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() - IF (KOKKOS_ENABLE_${NAME}) + if(KOKKOS_ENABLE_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find - FIND_PACKAGE(TPL${NAME} REQUIRED MODULE) - IF(NOT TARGET ${TPL_IMPORTED_NAME}) - MESSAGE(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") - ENDIF() - IF(NOT TPL_NO_EXPORT) - GET_TARGET_PROPERTY(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) - IF (NOT TPL_ORIGINAL_NAME) - SET(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) - ENDIF() - KOKKOS_EXPORT_IMPORTED_TPL(${TPL_ORIGINAL_NAME}) - ENDIF() - LIST(APPEND KOKKOS_ENABLED_TPLS ${NAME}) - ENDIF() -ENDMACRO(kokkos_import_tpl) -ENDIF() + find_package(TPL${NAME} REQUIRED MODULE) + if(NOT TARGET ${TPL_IMPORTED_NAME}) + message(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") + endif() + if(NOT TPL_NO_EXPORT) + get_target_property(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) + if(NOT TPL_ORIGINAL_NAME) + set(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) + endif() + kokkos_export_imported_tpl(${TPL_ORIGINAL_NAME}) + endif() + list(APPEND KOKKOS_ENABLED_TPLS ${NAME}) + endif() +endmacro(kokkos_import_tpl) -MACRO(kokkos_import_cmake_tpl MODULE_NAME) +macro(kokkos_import_cmake_tpl MODULE_NAME) 
kokkos_import_tpl(${MODULE_NAME} ${ARGN} NO_EXPORT) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT" - "OPTION_NAME" - "" - ${ARGN}) + cmake_parse_arguments(TPL "NO_EXPORT" "OPTION_NAME" "" ${ARGN}) - IF (NOT TPL_OPTION_NAME) - SET(TPL_OPTION_NAME ${MODULE_NAME}) - ENDIF() + if(NOT TPL_OPTION_NAME) + set(TPL_OPTION_NAME ${MODULE_NAME}) + endif() - IF (NOT TPL_NO_EXPORT) - KOKKOS_EXPORT_CMAKE_TPL(${MODULE_NAME}) - ENDIF() -ENDMACRO() + if(NOT TPL_NO_EXPORT) + kokkos_export_cmake_tpl(${MODULE_NAME}) + endif() +endmacro() # # @MACRO: KOKKOS_CREATE_IMPORTED_TPL() @@ -368,68 +330,57 @@ ENDMACRO() # # If specified, this gives a list of linker flags that must be used # for using this library. -MACRO(kokkos_create_imported_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE" - "LIBRARY" - "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" - ${ARGN}) +macro(kokkos_create_imported_tpl NAME) + cmake_parse_arguments( + TPL "INTERFACE" "LIBRARY" "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN} + ) - - IF (KOKKOS_HAS_TRILINOS) - #TODO: we need to set a bunch of cache variables here - ELSEIF (TPL_INTERFACE) - ADD_LIBRARY(${NAME} INTERFACE) + if(TPL_INTERFACE) + add_library(${NAME} INTERFACE) #Give this an importy-looking name - ADD_LIBRARY(Kokkos::${NAME} ALIAS ${NAME}) - IF (TPL_LIBRARY) - MESSAGE(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") - ENDIF() + add_library(Kokkos::${NAME} ALIAS ${NAME}) + if(TPL_LIBRARY) + message(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) - ENDIF() - IF(TPL_INCLUDES) - TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES}) - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) - ENDIF() - 
IF(TPL_COMPILE_OPTIONS) - TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) - ENDIF() - IF(TPL_LINK_OPTIONS) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) - ENDIF() - ELSE() - ADD_LIBRARY(${NAME} UNKNOWN IMPORTED) - IF(TPL_LIBRARY) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - IMPORTED_LOCATION ${TPL_LIBRARY}) - ENDIF() + if(TPL_LINK_LIBRARIES) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) + endif() + if(TPL_INCLUDES) + target_include_directories(${NAME} INTERFACE ${TPL_INCLUDES}) + endif() + if(TPL_COMPILE_DEFINITIONS) + target_compile_definitions(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) + endif() + if(TPL_COMPILE_OPTIONS) + target_compile_options(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) + endif() + if(TPL_LINK_OPTIONS) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) + endif() + else() + add_library(${NAME} UNKNOWN IMPORTED) + if(TPL_LIBRARY) + set_target_properties(${NAME} PROPERTIES IMPORTED_LOCATION ${TPL_LIBRARY}) + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") - ENDIF() - IF(TPL_INCLUDES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") - ENDIF() - IF(TPL_COMPILE_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") - ENDIF() - IF(TPL_LINK_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") - ENDIF() - ENDIF() -ENDMACRO() + if(TPL_LINK_LIBRARIES) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") + endif() + if(TPL_INCLUDES) + set_target_properties(${NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") 
+ endif() + if(TPL_COMPILE_DEFINITIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") + endif() + if(TPL_COMPILE_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") + endif() + if(TPL_LINK_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") + endif() + endif() +endmacro() # # @MACRO: KOKKOS_FIND_HEADER @@ -479,37 +430,32 @@ ENDMACRO() # # Custom paths to search for the header # -MACRO(kokkos_find_header VAR_NAME HEADER TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS" - ${ARGN}) +macro(kokkos_find_header VAR_NAME HEADER TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS" ${ARGN}) - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_PATH(${VAR_NAME} ${HEADER} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_path( + ${VAR_NAME} ${HEADER} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} PATH_SUFFIXES include - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_PATH(${VAR_NAME} ${HEADER}) - ENDIF() + find_path(${VAR_NAME} ${HEADER}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_LIBRARY @@ -565,42 +511,36 @@ ENDMACRO() # Suffixes appended to PATHS when attempting to locate # the 
library. Defaults to {lib, lib64}. # -MACRO(kokkos_find_library VAR_NAME LIB TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS;SUFFIXES" - ${ARGN}) +macro(kokkos_find_library VAR_NAME LIB TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS;SUFFIXES" ${ARGN}) - IF(NOT TPL_SUFFIXES) - SET(TPL_SUFFIXES lib lib64) - ENDIF() + if(NOT TPL_SUFFIXES) + set(TPL_SUFFIXES lib lib64) + endif() - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_LIBRARY(${VAR_NAME} ${LIB} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} - PATH_SUFFIXES - ${TPL_SUFFIXES} - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_library( + ${VAR_NAME} ${LIB} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} + PATH_SUFFIXES ${TPL_SUFFIXES} + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_LIBRARY(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) - ENDIF() + find_library(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_IMPORTED @@ -683,111 +623,127 @@ ENDMACRO() # If specified, this gives a list of paths to search for the headers # If not given, _ROOT/include and _ROOT/include will be searched. 
# -MACRO(kokkos_find_imported NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" - "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" - "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" - ${ARGN}) +macro(kokkos_find_imported NAME) + cmake_parse_arguments( + TPL "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" + "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" ${ARGN} + ) - IF(NOT TPL_MODULE_NAME) - SET(TPL_MODULE_NAME TPL${NAME}) - ENDIF() + if(NOT TPL_MODULE_NAME) + set(TPL_MODULE_NAME TPL${NAME}) + endif() - IF (TPL_ALLOW_SYSTEM_PATH_FALLBACK) - SET(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) - ELSE() - SET(ALLOW_PATH_FALLBACK_OPT) - ENDIF() + if(TPL_ALLOW_SYSTEM_PATH_FALLBACK) + set(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) + else() + set(ALLOW_PATH_FALLBACK_OPT) + endif() - IF (NOT TPL_IMPORTED_NAME) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() - ENDIF() + if(NOT TPL_IMPORTED_NAME) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() + endif() - IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib) - IF(KOKKOS_IMPL_32BIT) - LIST(APPEND TPL_LIBRARY_SUFFIXES lib32) - ELSE() - LIST(APPEND TPL_LIBRARY_SUFFIXES lib64) - ENDIF() - ENDIF() + if(NOT TPL_LIBRARY_SUFFIXES) + set(TPL_LIBRARY_SUFFIXES lib) + if(KOKKOS_IMPL_32BIT) + list(APPEND TPL_LIBRARY_SUFFIXES lib32) + else() + list(APPEND TPL_LIBRARY_SUFFIXES lib64) + endif() + endif() - SET(${NAME}_INCLUDE_DIRS) - IF (TPL_HEADER) - KOKKOS_FIND_HEADER(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - ENDIF() + set(${NAME}_INCLUDE_DIRS) + if(TPL_HEADER) + kokkos_find_header(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + endif() - FOREACH(HEADER ${TPL_HEADERS}) - 
KOKKOS_FIND_HEADER(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - IF(HEADER_FIND_TEMP) - LIST(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) - ENDIF() - ENDFOREACH() + foreach(HEADER ${TPL_HEADERS}) + kokkos_find_header(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + if(HEADER_FIND_TEMP) + list(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) + endif() + endforeach() - SET(${NAME}_LIBRARY) - IF(TPL_LIBRARY) - KOKKOS_FIND_LIBRARY(${NAME}_LIBRARY ${TPL_LIBRARY} ${NAME} + set(${NAME}_LIBRARY) + if(TPL_LIBRARY) + kokkos_find_library( + ${NAME}_LIBRARY + ${TPL_LIBRARY} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - ENDIF() + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + endif() - SET(${NAME}_FOUND_LIBRARIES) - FOREACH(LIB ${TPL_LIBRARIES}) - KOKKOS_FIND_LIBRARY(${LIB}_LOCATION ${LIB} ${NAME} + set(${NAME}_FOUND_LIBRARIES) + foreach(LIB ${TPL_LIBRARIES}) + kokkos_find_library( + ${LIB}_LOCATION + ${LIB} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - IF(${LIB}_LOCATION) - LIST(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - ELSE() - SET(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - BREAK() - ENDIF() - ENDFOREACH() + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + if(${LIB}_LOCATION) + list(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + else() + set(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + break() + endif() + endforeach() - INCLUDE(FindPackageHandleStandardArgs) + include(FindPackageHandleStandardArgs) #Collect all the variables we need to be valid for #find_package to have succeeded - SET(TPL_VARS_NEEDED) - IF (TPL_LIBRARY) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) - ENDIF() - IF(TPL_HEADER) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) - ENDIF() - IF(TPL_LIBRARIES) - LIST(APPEND TPL_VARS_NEEDED 
${NAME}_FOUND_LIBRARIES) - ENDIF() - FIND_PACKAGE_HANDLE_STANDARD_ARGS(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) + set(TPL_VARS_NEEDED) + if(TPL_LIBRARY) + list(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) + endif() + if(TPL_HEADER) + list(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) + endif() + if(TPL_LIBRARIES) + list(APPEND TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) + endif() + find_package_handle_standard_args(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) - MARK_AS_ADVANCED(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) + mark_as_advanced(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) #this is so much fun on a Cray system #/usr/include should never be added as a -isystem include #this freaks out the compiler include search order - IF (KOKKOS_IS_CRAYPE) - LIST(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") - ENDIF() + if(KOKKOS_IS_CRAYPE) + list(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") + endif() - IF (${TPL_MODULE_NAME}_FOUND) - SET(IMPORT_TYPE) - IF (TPL_INTERFACE) - SET(IMPORT_TYPE "INTERFACE") - SET(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) - ENDIF() - KOKKOS_CREATE_IMPORTED_TPL(${TPL_IMPORTED_NAME} + if(${TPL_MODULE_NAME}_FOUND) + set(IMPORT_TYPE) + if(TPL_INTERFACE) + set(IMPORT_TYPE "INTERFACE") + set(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) + endif() + kokkos_create_imported_tpl( + ${TPL_IMPORTED_NAME} ${IMPORT_TYPE} - INCLUDES "${${NAME}_INCLUDE_DIRS}" - LIBRARY "${${NAME}_LIBRARY}" - LINK_LIBRARIES "${${NAME}_FOUND_LIBRARIES}") - ENDIF() -ENDMACRO(kokkos_find_imported) + INCLUDES + "${${NAME}_INCLUDE_DIRS}" + LIBRARY + "${${NAME}_LIBRARY}" + LINK_LIBRARIES + "${${NAME}_FOUND_LIBRARIES}" + ) + endif() +endmacro(kokkos_find_imported) # # @MACRO: KOKKOS_LINK_TPL() @@ -817,109 +773,114 @@ ENDMACRO(kokkos_find_imported) # If specified, this gives the exact name of the target to link against # target_link_libraries( ) # -FUNCTION(kokkos_link_tpl TARGET) - CMAKE_PARSE_ARGUMENTS(TPL - 
"PUBLIC;PRIVATE;INTERFACE" - "IMPORTED_NAME" - "" - ${ARGN}) +function(kokkos_link_tpl TARGET) + cmake_parse_arguments(TPL "PUBLIC;PRIVATE;INTERFACE" "IMPORTED_NAME" "" ${ARGN}) #the name of the TPL - SET(TPL ${TPL_UNPARSED_ARGUMENTS}) - IF (KOKKOS_HAS_TRILINOS) - #Do nothing, they will have already been linked - ELSE() - IF (NOT TPL_IMPORTED_NAME) - SET(TPL_IMPORTED_NAME Kokkos::${TPL}) - ENDIF() - IF (KOKKOS_ENABLE_${TPL}) - IF (TPL_PUBLIC) - TARGET_LINK_LIBRARIES(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_PRIVATE) - TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_INTERFACE) - TARGET_LINK_LIBRARIES(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) - ELSE() - TARGET_LINK_LIBRARIES(${TARGET} ${TPL_IMPORTED_NAME}) - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() + set(TPL ${TPL_UNPARSED_ARGUMENTS}) + if(NOT TPL_IMPORTED_NAME) + set(TPL_IMPORTED_NAME Kokkos::${TPL}) + endif() + if(KOKKOS_ENABLE_${TPL}) + if(TPL_PUBLIC) + target_link_libraries(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) + elseif(TPL_PRIVATE) + target_link_libraries(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) + elseif(TPL_INTERFACE) + target_link_libraries(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) + else() + target_link_libraries(${TARGET} ${TPL_IMPORTED_NAME}) + endif() + endif() +endfunction() -FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA NVHPC DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu MSVC) - CMAKE_PARSE_ARGUMENTS( - PARSE - "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" - "COMPILER_ID" - "${COMPILERS}" - ${ARGN}) - IF(PARSE_UNPARSED_ARGUMENTS) - MESSAGE(SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options") - ENDIF() +function(COMPILER_SPECIFIC_OPTIONS_HELPER) + set(COMPILERS + NVIDIA + NVHPC + DEFAULT + Cray + Intel + Clang + AppleClang + IntelLLVM + GNU + HIPCC + Fujitsu + MSVC + CrayClang + ) + cmake_parse_arguments( + PARSE 
"LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" "COMPILER_ID" "${COMPILERS}" ${ARGN} + ) + if(PARSE_UNPARSED_ARGUMENTS) + message( + SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options" + ) + endif() - IF(PARSE_COMPILER_ID) - SET(COMPILER ${${PARSE_COMPILER_ID}}) - ELSE() - SET(COMPILER ${KOKKOS_CXX_COMPILER_ID}) - ENDIF() + if(PARSE_COMPILER_ID) + set(COMPILER ${${PARSE_COMPILER_ID}}) + else() + set(COMPILER ${KOKKOS_CXX_COMPILER_ID}) + endif() - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) - FOREACH(COMP ${COMPILERS}) - IF (COMPILER STREQUAL "${COMP}") - IF (PARSE_${COMPILER}) - IF ("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") - SET(COMPILER_SPECIFIC_FLAGS_TMP "") - ELSE() - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) + foreach(COMP ${COMPILERS}) + if(COMPILER STREQUAL "${COMP}") + if(PARSE_${COMPILER}) + if("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") + set(COMPILER_SPECIFIC_FLAGS_TMP "") + else() + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) + endif() + endif() + endif() + endforeach() - IF (PARSE_COMPILE_OPTIONS) + if(PARSE_COMPILE_OPTIONS) # The funky logic here is for future handling of argument deduplication # If we naively pass multiple -Xcompiler flags to target_compile_options # -Xcompiler will get deduplicated and break the build - IF ("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) - LIST(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") - GLOBAL_APPEND(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ELSE() - GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - ENDIF() + if("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) + list(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") + global_append(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + else() + 
global_append(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + endif() - IF (PARSE_LINK_OPTIONS) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() + if(PARSE_LINK_OPTIONS) + global_append(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() - IF (PARSE_COMPILE_DEFINITIONS) - GLOBAL_APPEND(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() + if(PARSE_COMPILE_DEFINITIONS) + global_append(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() - IF (PARSE_LINK_LIBRARIES) - GLOBAL_APPEND(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) + if(PARSE_LINK_LIBRARIES) + global_append(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() +endfunction(COMPILER_SPECIFIC_OPTIONS_HELPER) -FUNCTION(COMPILER_SPECIFIC_FLAGS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_FLAGS) +function(COMPILER_SPECIFIC_FLAGS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_FLAGS) -FUNCTION(COMPILER_SPECIFIC_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS) +function(COMPILER_SPECIFIC_OPTIONS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS) +endfunction(COMPILER_SPECIFIC_OPTIONS) -FUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) +function(COMPILER_SPECIFIC_LINK_OPTIONS) + compiler_specific_options_helper(${ARGN} LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_LINK_OPTIONS) -FUNCTION(COMPILER_SPECIFIC_DEFS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_DEFINITIONS) -ENDFUNCTION(COMPILER_SPECIFIC_DEFS) +function(COMPILER_SPECIFIC_DEFS) + compiler_specific_options_helper(${ARGN} COMPILE_DEFINITIONS) +endfunction(COMPILER_SPECIFIC_DEFS) -FUNCTION(COMPILER_SPECIFIC_LIBS) - 
COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES) -ENDFUNCTION(COMPILER_SPECIFIC_LIBS) +function(COMPILER_SPECIFIC_LIBS) + compiler_specific_options_helper(${ARGN} LINK_LIBRARIES) +endfunction(COMPILER_SPECIFIC_LIBS) # Given a list of the form # key1;value1;key2;value2,... # Create a list of all keys in a variable named ${KEY_LIST_NAME} @@ -927,41 +888,42 @@ ENDFUNCTION(COMPILER_SPECIFIC_LIBS) # kokkos_key_value_map(ARCH ALL_ARCHES key1;value1;key2;value2) # would produce a list variable ALL_ARCHES=key1;key2 # and individual variables ARCHkey1=value1 and ARCHkey2=value2 -MACRO(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) - SET(PARSE_KEY ON) - SET(${KEY_LIST_NAME}) - FOREACH(ENTRY ${ARGN}) - IF(PARSE_KEY) - SET(CURRENT_KEY ${ENTRY}) - SET(PARSE_KEY OFF) - LIST(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) - ELSE() - SET(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) - SET(PARSE_KEY ON) - ENDIF() - ENDFOREACH() -ENDMACRO() +macro(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) + set(PARSE_KEY ON) + set(${KEY_LIST_NAME}) + foreach(ENTRY ${ARGN}) + if(PARSE_KEY) + set(CURRENT_KEY ${ENTRY}) + set(PARSE_KEY OFF) + list(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) + else() + set(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) + set(PARSE_KEY ON) + endif() + endforeach() +endmacro() -FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS) - KOKKOS_KEY_VALUE_MAP(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) - FOREACH(OPTION_SUFFIX ${DEPRECATED_LIST}) - SET(OPTION_NAME Kokkos_${OPTION_SUFFIX}) - SET(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) - IF(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off - MESSAGE(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. 
${OPT_MESSAGE}") - ENDIF() - ENDFOREACH() -ENDFUNCTION() +function(KOKKOS_CHECK_DEPRECATED_OPTIONS) + kokkos_key_value_map(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) + foreach(OPTION_SUFFIX ${DEPRECATED_LIST}) + set(OPTION_NAME Kokkos_${OPTION_SUFFIX}) + set(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) + if(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off + message(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. ${OPT_MESSAGE}") + endif() + endforeach() +endfunction() # this function checks whether the current CXX compiler supports building CUDA -FUNCTION(kokkos_cxx_compiler_cuda_test _VAR) - # don't run this test every time - IF(DEFINED ${_VAR}) - RETURN() - ENDIF() +function(kokkos_cxx_compiler_cuda_test _VAR) + # don't run this test every time + if(DEFINED ${_VAR}) + return() + endif() - FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp -" + file( + WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp + " #include #include @@ -985,14 +947,13 @@ int main() cudaDeviceSynchronize(); return EXIT_SUCCESS; } -") +" + ) - TRY_COMPILE(_RET - ${PROJECT_BINARY_DIR}/compile_tests - SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) + try_compile(_RET ${PROJECT_BINARY_DIR}/compile_tests SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) - SET(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") -ENDFUNCTION() + set(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") +endfunction() # this function is provided to easily select which files use nvcc_wrapper: # @@ -1005,58 +966,77 @@ ENDFUNCTION() # NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in. # This version explicitly uses nvcc_wrapper. 
# -FUNCTION(kokkos_compilation) - # check whether the compiler already supports building CUDA - KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) - # if CUDA compile test has already been performed, just return - IF(Kokkos_CXX_COMPILER_COMPILES_CUDA) - RETURN() - ENDIF() +function(kokkos_compilation) + # check whether the compiler already supports building CUDA + kokkos_cxx_compiler_cuda_test(Kokkos_CXX_COMPILER_COMPILES_CUDA) + # if CUDA compile test has already been performed, just return + if(Kokkos_CXX_COMPILER_COMPILES_CUDA) + return() + endif() - CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + cmake_parse_arguments(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) - # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + # find kokkos_launch_compiler + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") - ENDIF() + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR + "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'" + ) + endif() - # find nvcc_wrapper - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + # find nvcc_wrapper + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. 
Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'") - ENDIF() + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'" + ) + endif() - IF(COMP_GLOBAL) - # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ELSE() - FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) - # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) - IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) - LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) - UNSET(COMP_${_TYPE}) - ENDIF() - # set the properties if defined - IF(COMP_${_TYPE}) - # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() + if(COMP_GLOBAL) + # if global, don't bother setting others + set_property( + GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + else() + foreach(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) 
+ if("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + list(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + unset(COMP_${_TYPE}) + endif() + # set the properties if defined + if(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + endif() + endforeach() + endif() +endfunction() ## KOKKOS_CONFIG_HEADER - parse the data list which is a list of backend names ## and create output config header file...used for ## creating dynamic include files based on enabled backends @@ -1066,14 +1046,15 @@ ENDFUNCTION() ## HEADER_GUARD TEXT used with include header guard ## HEADER_PREFIX prefix used with include (i.e. fwd, decl, setup) ## DATA_LIST list of backends to include in generated file -FUNCTION(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) - SET(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") - CONFIGURE_FILE(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) - FOREACH( BACKEND_NAME ${DATA_LIST} ) - SET(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> -\@INCLUDE_NEXT_FILE\@") - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) - ENDFOREACH() - SET(INCLUDE_NEXT_FILE "" ) - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) -ENDFUNCTION() +function(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) + set(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") + configure_file(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) + foreach(BACKEND_NAME ${DATA_LIST}) + set(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> 
+\@INCLUDE_NEXT_FILE\@" + ) + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) + endforeach() + set(INCLUDE_NEXT_FILE "") + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) +endfunction() diff --git a/lib/kokkos/cmake/kokkos_install.cmake b/lib/kokkos/cmake/kokkos_install.cmake index f818dfa244..3ae7570ffe 100644 --- a/lib/kokkos/cmake/kokkos_install.cmake +++ b/lib/kokkos/cmake/kokkos_install.cmake @@ -1,57 +1,51 @@ -INCLUDE(CMakePackageConfigHelpers) -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - INCLUDE(GNUInstallDirs) +include(CMakePackageConfigHelpers) +if(NOT Kokkos_INSTALL_TESTING) + include(GNUInstallDirs) #Set all the variables needed for KokkosConfig.cmake - GET_PROPERTY(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - SET(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) + get_property(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + set(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) - INCLUDE(CMakePackageConfigHelpers) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfig.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + include(CMakePackageConfigHelpers) + configure_package_config_file( + cmake/KokkosConfig.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfigCommon.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + configure_package_config_file( + cmake/KokkosConfigCommon.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - WRITE_BASIC_PACKAGE_VERSION_FILE("${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + write_basic_package_version_file( + 
"${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) # Install the KokkosConfig*.cmake files - install(FILES - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) + install(FILES "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos + ) install(EXPORT KokkosTargets NAMESPACE Kokkos:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${Kokkos_BINARY_DIR}/KokkosTargets.cmake) # Required to be a TriBITS-compliant external package file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake - ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake - ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake) -ELSE() - CONFIGURE_FILE(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake KOKKOS_CONFIG_COMMON) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_CONFIG_COMMON}") - CONFIGURE_FILE(cmake/KokkosTrilinosConfig.cmake.in ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake KOKKOS_TRILINOS_CONFIG) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_TRILINOS_CONFIG}") + file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake + ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake DESTINATION 
${CMAKE_BINARY_DIR}/cmake_packages/Kokkos + ) + file(WRITE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake + "include(${Kokkos_BINARY_DIR}/KokkosTargets.cmake)" + ) +else() + configure_file(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) - WRITE_BASIC_PACKAGE_VERSION_FILE("${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/${PACKAGE_NAME}") -ENDIF() - -INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) + DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/Kokkos" + ) +endif() +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) diff --git a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake index ae14a10d53..0d31e6d131 100644 --- a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -1,20 +1,28 @@ # From CMake 3.10 documentation #This can run at any time -KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17") +kokkos_option( + CXX_STANDARD + "" + STRING + "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. 
If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17" +) # Set CXX standard flags -SET(KOKKOS_ENABLE_CXX17 OFF) -SET(KOKKOS_ENABLE_CXX20 OFF) -SET(KOKKOS_ENABLE_CXX23 OFF) -SET(KOKKOS_ENABLE_CXX26 OFF) -IF (KOKKOS_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") -ENDIF() +set(KOKKOS_ENABLE_CXX17 OFF) +set(KOKKOS_ENABLE_CXX20 OFF) +set(KOKKOS_ENABLE_CXX23 OFF) +set(KOKKOS_ENABLE_CXX26 OFF) +if(KOKKOS_CXX_STANDARD) + message( + FATAL_ERROR + "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead" + ) +endif() -IF (NOT CMAKE_CXX_STANDARD) - SET(KOKKOS_CXX_STANDARD "17") -ELSE() - SET(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -ENDIF() -MESSAGE(STATUS "Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") +if(NOT CMAKE_CXX_STANDARD) + set(KOKKOS_CXX_STANDARD "17") +else() + set(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) +endif() +message(STATUS "Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index 5b45674e05..a84e714064 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -1,101 +1,112 @@ -KOKKOS_CFG_DEPENDS(CXX_STD COMPILER_ID) +kokkos_cfg_depends(CXX_STD COMPILER_ID) -FUNCTION(kokkos_set_cxx_standard_feature standard) - SET(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) - SET(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) - SET(FEATURE_NAME cxx_std_${standard}) +function(kokkos_set_cxx_standard_feature standard) + set(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) + set(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) + set(FEATURE_NAME cxx_std_${standard}) #CMake's way of telling us that the standard (or extension) #flags are supported is 
the extension/standard variables - IF (NOT DEFINED CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSEIF(CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - MESSAGE(FATAL_ERROR "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. Must set CMAKE_CXX_EXTENSIONS=OFF to continue") - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSE() - #For trilinos, we need to make sure downstream projects - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ENDIF() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + global_set(KOKKOS_USE_CXX_EXTENSIONS OFF) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + elseif(CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + message( + FATAL_ERROR + "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. Must set CMAKE_CXX_EXTENSIONS=OFF to continue" + ) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + endif() - IF (KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) - MESSAGE(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) - MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") - IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang)) - IF(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) - SET(SUPPORTED_NVCC_FLAGS "-std=c++17") - ELSE() - SET(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") - ENDIF() - IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) - MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. 
Using a more recent host compiler or a more recent CMake version might help.") - ENDIF() - ENDIF() - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + if(KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) + message(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) + message(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU + OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang) + ) + if(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) + set(SUPPORTED_NVCC_FLAGS "-std=c++17") + else() + set(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") + endif() + if(NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) + message( + FATAL_ERROR + "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help." 
+ ) + endif() + endif() + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") #MSVC doesn't need a command line flag, that doesn't mean it has no support - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu")) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSE() + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu") + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + else() #nope, we can't do anything here - MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command.") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ENDIF() + message( + WARNING + "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. 
Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command." + ) + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + endif() - IF((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) - IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) - MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported") - ENDIF() - ENDIF() -ENDFUNCTION() + if((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + if(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) + message( + FATAL_ERROR + "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported" + ) + endif() + endif() +endfunction() -IF(KOKKOS_CXX_STANDARD STREQUAL "17") +if(KOKKOS_CXX_STANDARD STREQUAL "17") kokkos_set_cxx_standard_feature(17) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") - SET(KOKKOS_ENABLE_CXX17 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "20") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") + set(KOKKOS_ENABLE_CXX17 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "20") kokkos_set_cxx_standard_feature(20) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") - SET(KOKKOS_ENABLE_CXX20 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") + set(KOKKOS_ENABLE_CXX20 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") - SET(KOKKOS_ENABLE_CXX23 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "26") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") + set(KOKKOS_ENABLE_CXX23 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "26") kokkos_set_cxx_standard_feature(26) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") - SET(KOKKOS_ENABLE_CXX26 ON) -ELSE() - MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") 
-ENDIF() + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + set(KOKKOS_ENABLE_CXX26 ON) +else() + message(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") +endif() # Enforce that we can compile a simple C++17 program -TRY_COMPILE(CAN_COMPILE_CPP17 - ${KOKKOS_TOP_BUILD_DIR}/corner_cases - ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp - OUTPUT_VARIABLE ERROR_MESSAGE - CXX_STANDARD 17 +try_compile( + CAN_COMPILE_CPP17 ${KOKKOS_TOP_BUILD_DIR}/corner_cases ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp + OUTPUT_VARIABLE ERROR_MESSAGE CXX_STANDARD 17 ) -if (NOT CAN_COMPILE_CPP17) - UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}") -ENDIF() -UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - +if(NOT CAN_COMPILE_CPP17) + unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this + message( + FATAL_ERROR + "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}" + ) +endif() +unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # Enforce that extensions are turned off for nvcc_wrapper. # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's @@ -105,66 +116,70 @@ UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # that we can only use host compilers for CUDA builds that use those flags. # It also means that extensions (gnu++17) can't be turned on for CUDA builds. 
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() -ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") + endif() +endif() -IF(KOKKOS_ENABLE_CUDA) +if(KOKKOS_ENABLE_CUDA) # ENFORCE that the compiler can compile CUDA code. - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) - MESSAGE(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") - ENDIF() - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() - ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") - ENDIF() -ENDIF() + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) + message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") + endif() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message( + FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF" + ) + endif() + elseif(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + message( + FATAL_ERROR + "Invalid compiler for CUDA. 
The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}" + ) + endif() +endif() -IF (NOT KOKKOS_CXX_STANDARD_FEATURE) +if(NOT KOKKOS_CXX_STANDARD_FEATURE) #we need to pick the C++ flags ourselves - UNSET(CMAKE_CXX_STANDARD) - UNSET(CMAKE_CXX_STANDARD CACHE) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake) + unset(CMAKE_CXX_STANDARD) + unset(CMAKE_CXX_STANDARD CACHE) + if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) + include(${KOKKOS_SRC_PATH}/cmake/cray.cmake) kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + include(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/intel.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + include(${KOKKOS_SRC_PATH}/cmake/intel.cmake) kokkos_set_intel_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) + include(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) kokkos_set_msvc_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) + else() + include(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) kokkos_set_gnu_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ENDIF() + endif() #check that the compiler accepts the C++ standard flag - INCLUDE(CheckCXXCompilerFlag) - IF (DEFINED CXX_STD_FLAGS_ACCEPTED) - UNSET(CXX_STD_FLAGS_ACCEPTED CACHE) - ENDIF() - 
CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) - IF (NOT CXX_STD_FLAGS_ACCEPTED) - CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) - IF (NOT CXX_INT_STD_FLAGS_ACCEPTED) - MESSAGE(FATAL_ERROR "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}") - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) - ENDIF() - MESSAGE(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") -ENDIF() - - - - + include(CheckCXXCompilerFlag) + if(DEFINED CXX_STD_FLAGS_ACCEPTED) + unset(CXX_STD_FLAGS_ACCEPTED CACHE) + endif() + check_cxx_compiler_flag("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) + if(NOT CXX_STD_FLAGS_ACCEPTED) + check_cxx_compiler_flag("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) + if(NOT CXX_INT_STD_FLAGS_ACCEPTED) + message( + FATAL_ERROR + "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. 
You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}" + ) + endif() + set(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) + endif() + message(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") +endif() diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index cda9e0d600..f43aff4d1f 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -1,126 +1,120 @@ -KOKKOS_CFG_DEPENDS(TPLS OPTIONS) -KOKKOS_CFG_DEPENDS(TPLS DEVICES) -KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID) +kokkos_cfg_depends(TPLS OPTIONS) +kokkos_cfg_depends(TPLS DEVICES) +kokkos_cfg_depends(TPLS COMPILER_ID) -FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) - CMAKE_PARSE_ARGUMENTS(PARSED - "" - "TRIBITS" - "" - ${ARGN}) +function(KOKKOS_TPL_OPTION PKG DEFAULT) + cmake_parse_arguments(PARSED "" "TRIBITS" "" ${ARGN}) - IF (PARSED_TRIBITS) + if(PARSED_TRIBITS) #this is also a TPL option you can activate with Tribits - IF (NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") + if(NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") #Tribits brought its own default that should take precedence - SET(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) - ENDIF() - ENDIF() + set(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) + endif() + endif() - KOKKOS_ENABLE_OPTION(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") - KOKKOS_OPTION(${PKG}_DIR "" PATH "Location of ${PKG} library") - SET(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) - SET(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) + kokkos_enable_option(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") + kokkos_option(${PKG}_DIR "" PATH "Location of ${PKG} library") + set(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) + set(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) +endfunction() - IF (KOKKOS_HAS_TRILINOS - AND KOKKOS_ENABLE_${PKG} - AND NOT PARSED_TRIBITS) - #this 
TPL was enabled, but it is not valid to use inside of TriBITS - MESSAGE(FATAL_ERROR "Enabled TPL ${PKG} inside TriBITS build, " - "but this can only be enabled in a standalone build") - ENDIF() -ENDFUNCTION() - -KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT - KOKKOS_HAS_TRILINOS) - SET(ROCM_DEFAULT ON) -ELSE() - SET(ROCM_DEFAULT OFF) -ENDIF() -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_HAS_TRILINOS) - SET(ROCTHRUST_DEFAULT ON) -ELSE() - SET(ROCTHRUST_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) -KOKKOS_TPL_OPTION(ROCTHRUST ${ROCTHRUST_DEFAULT}) - -IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) - SET(ONEDPL_DEFAULT ON) -ELSE() - SET(ONEDPL_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ONEDPL ${ONEDPL_DEFAULT}) - -IF (WIN32) - SET(LIBDL_DEFAULT Off) -ELSE() - SET(LIBDL_DEFAULT On) -ENDIF() -KOKKOS_TPL_OPTION(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) - -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) -SET(HPX_DEFAULT ON) -ELSE() -SET(HPX_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(HPX ${HPX_DEFAULT}) - -KOKKOS_TPL_OPTION(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) - -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) - SET(LIBQUADMATH_DEFAULT ON) -ELSE() - SET(LIBQUADMATH_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) - -#Make sure we use our local FindKokkosCuda.cmake -KOKKOS_IMPORT_TPL(HPX INTERFACE) -KOKKOS_IMPORT_TPL(CUDA INTERFACE) -KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBDL) -IF (NOT WIN32) - KOKKOS_IMPORT_TPL(THREADS INTERFACE) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(ROCM INTERFACE) -ENDIF() -KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) -KOKKOS_IMPORT_TPL(LIBQUADMATH) -KOKKOS_IMPORT_TPL(ROCTHRUST) - -IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) - find_package(desul REQUIRED COMPONENTS atomics) - KOKKOS_EXPORT_CMAKE_TPL(desul REQUIRED COMPONENTS 
atomics) -ENDIF() - -if (Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) - find_package(mdspan REQUIRED) - KOKKOS_EXPORT_CMAKE_TPL(mdspan REQUIRED) +kokkos_tpl_option(HWLOC Off TRIBITS HWLOC) +kokkos_tpl_option(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) +if(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + set(ROCM_DEFAULT ON) +else() + set(ROCM_DEFAULT OFF) +endif() +if(KOKKOS_ENABLE_HIP) + set(ROCTHRUST_DEFAULT ON) +else() + set(ROCTHRUST_DEFAULT OFF) +endif() +kokkos_tpl_option(ROCM ${ROCM_DEFAULT}) +kokkos_tpl_option(ROCTHRUST ${ROCTHRUST_DEFAULT}) +if(Kokkos_ENABLE_ROCTHRUST) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " + #include + int main() { + static_assert(_GLIBCXX_RELEASE < 9); + return 0; + } + " + Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + ) endif() -IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED COMPONENTS CXX) - # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency - # so we just append the flags here instead of linking with the OpenMP target. 
- IF(KOKKOS_HAS_TRILINOS) - COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) - ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) - ENDIF() - IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) - ENDIF() - IF(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) - ENDIF() -ENDIF() +if(KOKKOS_ENABLE_SYCL) + set(ONEDPL_DEFAULT ON) +else() + set(ONEDPL_DEFAULT OFF) +endif() +kokkos_tpl_option(ONEDPL ${ONEDPL_DEFAULT}) + +if(WIN32) + set(LIBDL_DEFAULT Off) +else() + set(LIBDL_DEFAULT On) +endif() +kokkos_tpl_option(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) + +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) + set(HPX_DEFAULT ON) +else() + set(HPX_DEFAULT OFF) +endif() +kokkos_tpl_option(HPX ${HPX_DEFAULT}) + +kokkos_tpl_option(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) + +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) + set(LIBQUADMATH_DEFAULT ON) +else() + set(LIBQUADMATH_DEFAULT OFF) +endif() +kokkos_tpl_option(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) + +#Make sure we use our local FindKokkosCuda.cmake +kokkos_import_tpl(HPX INTERFACE) +kokkos_import_tpl(CUDA INTERFACE) +kokkos_import_tpl(HWLOC) +kokkos_import_tpl(LIBDL) +if(NOT WIN32) + kokkos_import_tpl(THREADS INTERFACE) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_import_tpl(ROCM INTERFACE) +endif() +kokkos_import_tpl(ONEDPL INTERFACE) +kokkos_import_tpl(LIBQUADMATH) +kokkos_import_tpl(ROCTHRUST) + +if(Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) + find_package(desul REQUIRED COMPONENTS atomics) + kokkos_export_cmake_tpl(desul REQUIRED COMPONENTS atomics) +endif() + +if(Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) + find_package(mdspan REQUIRED) + kokkos_export_cmake_tpl(mdspan REQUIRED) +endif() + +if(Kokkos_ENABLE_OPENMP) + find_package(OpenMP 3.0 REQUIRED COMPONENTS CXX) + kokkos_export_cmake_tpl(OpenMP 
REQUIRED COMPONENTS CXX) + if(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + global_append(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + endif() + if(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + global_append(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + endif() +endif() #Convert list to newlines (which CMake doesn't always like in cache variables) -STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") +string(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable -UNSET(KOKKOS_TPL_EXPORTS CACHE) -SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) +unset(KOKKOS_TPL_EXPORTS CACHE) +set(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index 6da543a2c8..2fda803b11 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -1,82 +1,47 @@ #These are tribits wrappers only ever called by Kokkos itself -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) -INCLUDE(GNUInstallDirs) +include(CMakeParseArguments) +include(CTest) +include(GNUInstallDirs) -MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") +message(STATUS "The project name is: ${PROJECT_NAME}") -IF(GTest_FOUND) - SET(KOKKOS_GTEST_LIB GTest::gtest) - MESSAGE(STATUS "Using gtest found in ${GTest_DIR}") -ELSE() # fallback to internal gtest - SET(KOKKOS_GTEST_LIB kokkos_gtest) - MESSAGE(STATUS "Using internal gtest for testing") -ENDIF() +if(GTest_FOUND) + set(KOKKOS_GTEST_LIB GTest::gtest) + message(STATUS "Using gtest found in ${GTest_DIR}") +else() # fallback to internal gtest + set(KOKKOS_GTEST_LIB kokkos_gtest) + message(STATUS "Using internal gtest for testing") +endif() -FUNCTION(VERIFY_EMPTY CONTEXT) +function(VERIFY_EMPTY CONTEXT) if(${ARGN}) - MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. 
Unhandled arguments in ${CONTEXT}:\n${ARGN}") + message(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") endif() -ENDFUNCTION() +endfunction() -#Leave this here for now - but only do for tribits -#This breaks the standalone CMake -IF (KOKKOS_HAS_TRILINOS) - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP) - SET(${PROJECT_NAME}_ENABLE_OpenMP OFF) - ENDIF() +macro(KOKKOS_PROCESS_SUBPACKAGES) + add_subdirectory(core) + add_subdirectory(containers) + add_subdirectory(algorithms) + add_subdirectory(simd) + add_subdirectory(example) + add_subdirectory(benchmarks) +endmacro() - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_HPX) - SET(${PROJECT_NAME}_ENABLE_HPX OFF) - ENDIF() +macro(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) + kokkos_lib_type(${LIBRARY_NAME} INCTYPE) + target_include_directories(${LIBRARY_NAME} ${INCTYPE} $) - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG) - SET(${PROJECT_NAME}_ENABLE_DEBUG OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_TESTS) - SET(${PROJECT_NAME}_ENABLE_TESTS OFF) - ENDIF() - - IF(NOT DEFINED TPL_ENABLE_Pthread) - SET(TPL_ENABLE_Pthread OFF) - ENDIF() -ENDIF() - -MACRO(KOKKOS_PROCESS_SUBPACKAGES) - ADD_SUBDIRECTORY(core) - ADD_SUBDIRECTORY(containers) - ADD_SUBDIRECTORY(algorithms) - ADD_SUBDIRECTORY(simd) - if (NOT KOKKOS_HAS_TRILINOS) - ADD_SUBDIRECTORY(example) - ADD_SUBDIRECTORY(benchmarks) - endif() -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_DEF) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_DEF() - else() - #do nothing - endif() -ENDMACRO() - -MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) - KOKKOS_LIB_TYPE(${LIBRARY_NAME} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${LIBRARY_NAME} ${INCTYPE} $) - - INSTALL( + install( TARGETS ${LIBRARY_NAME} EXPORT ${PROJECT_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT ${PACKAGE_NAME} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT 
${PACKAGE_NAME} ) - INSTALL( + install( TARGETS ${LIBRARY_NAME} EXPORT KokkosTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} @@ -84,157 +49,131 @@ MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) - VERIFY_EMPTY(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDMACRO() + verify_empty(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endmacro() -FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE(${ROOT_NAME} ${ARGN}) - else() - CMAKE_PARSE_ARGUMENTS(PARSE - "TESTONLY" - "" - "SOURCES;TESTONLYLIBS" - ${ARGN}) +function(KOKKOS_ADD_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "TESTONLY" "" "SOURCES;TESTONLYLIBS" ${ARGN}) - SET_SOURCE_FILES_PROPERTIES(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + set_source_files_properties(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) - #All executables must link to all the kokkos targets - #This is just private linkage because exe is final - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkos) + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + add_executable(${EXE_NAME} ${PARSE_SOURCES}) + if(PARSE_TESTONLYLIBS) + target_link_libraries(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) endif() -ENDFUNCTION() + verify_empty(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) + #All executables must link to all the kokkos targets + #This is just private linkage because exe is final + target_link_libraries(${EXE_NAME} PRIVATE Kokkos::kokkos) +endfunction() -FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES;ARGS" - ${ARGN}) - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST 
${PARSE_UNPARSED_ARGUMENTS}) +function(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES;CATEGORIES;ARGS" ${ARGN}) + verify_empty(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) - IF (KOKKOS_HAS_TRILINOS) - IF(DEFINED PARSE_ARGS) - STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") - ENDIF() - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - NUM_MPI_PROCS 1 - COMM serial mpi - ARGS ${PARSE_ARGS} - CATEGORIES ${PARSE_CATEGORIES} - SOURCES ${PARSE_SOURCES} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) - ELSE() - KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ) - IF (PARSE_ARGS) - SET(TEST_NUMBER 0) - FOREACH (ARG_STR ${PARSE_ARGS}) - # This is passed as a single string blob to match TriBITS behavior - # We need this to be turned into a list - STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) - LIST(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") - MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") - KOKKOS_ADD_TEST(NAME ${TEST_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${ARG_STR_LIST} - ) - ENDFOREACH() - ELSE() - KOKKOS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ) - ENDIF() - ENDIF() - # We noticed problems with -fvisibility=hidden for inline static variables - # if Kokkos was built as shared library. 
- IF(BUILD_SHARED_LIBS) - SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) - SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) - ENDIF() -ENDFUNCTION() + kokkos_add_test_executable(${ROOT_NAME} SOURCES ${PARSE_SOURCES}) + if(PARSE_ARGS) + set(TEST_NUMBER 0) + foreach(ARG_STR ${PARSE_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + string(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + list(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") + math(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + kokkos_add_test( + NAME + ${TEST_NAME} + EXE + ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION + " FAILED " + ARGS + ${ARG_STR_LIST} + ) + endforeach() + else() + kokkos_add_test(NAME ${ROOT_NAME} EXE ${ROOT_NAME} FAIL_REGULAR_EXPRESSION " FAILED ") + endif() + # We noticed problems with -fvisibility=hidden for inline static variables + # if Kokkos was built as shared library. 
+ if(BUILD_SHARED_LIBS AND NOT ${TEST_NAME}_DISABLE) + set_property(TARGET ${EXE_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) + set_property(TARGET ${EXE_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) + endif() + if(NOT + (Kokkos_INSTALL_TESTING + OR Kokkos_ENABLE_SYCL + OR Kokkos_ENABLE_HPX + OR Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "Intel" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2021.2.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.3.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC")) + ) + if(MSVC) + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "/GR-") + else() + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "-fno-rtti") + endif() + endif() +endfunction() -FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) - SET(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - IF (NOT TARGET ${TARGET_NAME}) - MESSAGE(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") - ENDIF() - SET_PROPERTY(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) -ENDFUNCTION() +function(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) + set(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + if(NOT TARGET ${TARGET_NAME}) + message(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") + endif() + set_property(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) +endfunction() -MACRO(KOKKOS_SETUP_BUILD_ENVIRONMENT) +macro(KOKKOS_SETUP_BUILD_ENVIRONMENT) # This is needed for both regular build and install tests - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) #set an internal option, if not already set - SET(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") - IF (Kokkos_INSTALL_TESTING) - SET(KOKKOS_ENABLE_TESTS ON) - SET(KOKKOS_ENABLE_BENCHMARKS ON) - SET(KOKKOS_ENABLE_EXAMPLES ON) + set(Kokkos_INSTALL_TESTING OFF CACHE 
INTERNAL "Whether to build tests and examples against installation") + if(Kokkos_INSTALL_TESTING) + set(KOKKOS_ENABLE_TESTS ON) + set(KOKKOS_ENABLE_BENCHMARKS ON) + set(KOKKOS_ENABLE_EXAMPLES ON) # This looks a little weird, but what we are doing # is to NOT build Kokkos but instead look for an # installed Kokkos - then build examples and tests # against that installed Kokkos - FIND_PACKAGE(Kokkos REQUIRED) + find_package(Kokkos REQUIRED) # Just grab the configuration from the installation - FOREACH(DEV ${Kokkos_DEVICES}) - SET(KOKKOS_ENABLE_${DEV} ON) - ENDFOREACH() - FOREACH(OPT ${Kokkos_OPTIONS}) - SET(KOKKOS_ENABLE_${OPT} ON) - ENDFOREACH() - FOREACH(TPL ${Kokkos_TPLS}) - SET(KOKKOS_ENABLE_${TPL} ON) - ENDFOREACH() - FOREACH(ARCH ${Kokkos_ARCH}) - SET(KOKKOS_ARCH_${ARCH} ON) - ENDFOREACH() - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) - IF (NOT KOKKOS_HAS_TRILINOS) - SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") - ENDIF() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES" - ${ARGN}) - KOKKOS_ADD_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ${PARSE_UNPARSED_ARGUMENTS} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - ) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_POSTPROCESS) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_POSTPROCESS() + foreach(DEV ${Kokkos_DEVICES}) + set(KOKKOS_ENABLE_${DEV} ON) + endforeach() + foreach(OPT ${Kokkos_OPTIONS}) + set(KOKKOS_ENABLE_${OPT} ON) + endforeach() + foreach(TPL ${Kokkos_TPLS}) + set(KOKKOS_ENABLE_${TPL} ON) + endforeach() + foreach(ARCH ${Kokkos_ARCH}) + 
set(KOKKOS_ARCH_${ARCH} ON) + endforeach() + else() + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) + set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") + include(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) endif() -ENDMACRO() +endmacro() + +macro(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES" ${ARGN}) + # Don't do anything if the user disabled the test + if(NOT ${PACKAGE_NAME}_${ROOT_NAME}_DISABLE) + kokkos_add_executable( + ${ROOT_NAME} SOURCES ${PARSE_SOURCES} ${PARSE_UNPARSED_ARGUMENTS} TESTONLYLIBS ${KOKKOS_GTEST_LIB} + ) + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + endif() +endmacro() ## KOKKOS_CONFIGURE_CORE Configure/Generate header files for core content based ## on enabled backends. 
@@ -242,265 +181,214 @@ ENDMACRO() ## KOKKOS_SETUP is included in Kokkos_Macros.hpp and include prefix includes/defines ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp -MACRO(KOKKOS_CONFIGURE_CORE) - MESSAGE(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") - CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) -ENDMACRO() +macro(KOKKOS_CONFIGURE_CORE) + message(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" + "${KOKKOS_ENABLED_DEVICES}" + ) + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" + "${DEVICE_SETUP_LIST}" + ) + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" + "${KOKKOS_ENABLED_DEVICES}" + ) + configure_file(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) +endmacro() ## KOKKOS_INSTALL_ADDITIONAL_FILES - instruct cmake to install files in target destination. ## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, ## as well as other files provided through plugins. 
-MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) +macro(KOKKOS_INSTALL_ADDITIONAL_FILES) # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to original kokkos compiler # if nvcc_wrapper was not used as CMAKE_CXX_COMPILER, configure the original compiler into kokkos_launch_compiler - IF(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") - ELSE() - IF(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") - ENDIF() - ENDIF() + if(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") + set(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") + else() + if(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") + set(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") + endif() + endif() - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler - ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler - @ONLY) + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler @ONLY + ) - INSTALL(PROGRAMS - "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" - "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" - "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" - DESTINATION ${CMAKE_INSTALL_BINDIR}) - INSTALL(FILES - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" + install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" + "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - DESTINATION ${KOKKOS_HEADER_DIR}) -ENDMACRO() + DESTINATION ${KOKKOS_HEADER_DIR} + ) +endmacro() 
+function(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) + cmake_parse_arguments(PARSE "PLAIN_STYLE" "" "" ${ARGN}) -FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "PLAIN_STYLE" - "" - "" - ${ARGN}) - - IF((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) + if((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) #I can use link options #check for CXX linkage using the simple 3.18 way - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_LINK_OPTIONS}> - ) - ELSE() + target_link_options(${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_LINK_OPTIONS}>) + else() #I can use link options #just assume CXX linkage - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} - ) - ENDIF() + target_link_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS}) + endif() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_COMPILE_OPTIONS}> + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_COMPILE_OPTIONS}> ) - TARGET_COMPILE_DEFINITIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_COMPILE_DEFINITIONS}> + target_compile_definitions( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_COMPILE_DEFINITIONS}> ) - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES} - ) + target_link_libraries(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES}) - IF (KOKKOS_ENABLE_CUDA) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${KOKKOS_CUDA_OPTIONS}> + if(KOKKOS_ENABLE_CUDA) + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_CUDA_OPTIONS}> ) - SET(NODEDUP_CUDAFE_OPTIONS) - FOREACH(OPT ${KOKKOS_CUDAFE_OPTIONS}) - LIST(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${NODEDUP_CUDAFE_OPTIONS}> + set(NODEDUP_CUDAFE_OPTIONS) + foreach(OPT ${KOKKOS_CUDAFE_OPTIONS}) + list(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) + endforeach() + 
target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${NODEDUP_CUDAFE_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_ENABLE_HIP) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${KOKKOS_AMDGPU_OPTIONS}> + if(KOKKOS_ENABLE_HIP) + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_AMDGPU_OPTIONS}> ) - ENDIF() + endif() - LIST(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) - IF (XOPT_LENGTH GREATER 1) - MESSAGE(FATAL_ERROR "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12") - ENDIF() - IF(KOKKOS_XCOMPILER_OPTIONS) - SET(NODEDUP_XCOMPILER_OPTIONS) - FOREACH(OPT ${KOKKOS_XCOMPILER_OPTIONS}) + list(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) + if(XOPT_LENGTH GREATER 1) + message( + FATAL_ERROR + "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12" + ) + endif() + if(KOKKOS_XCOMPILER_OPTIONS) + set(NODEDUP_XCOMPILER_OPTIONS) + foreach(OPT ${KOKKOS_XCOMPILER_OPTIONS}) #I have to do this for now because we can't guarantee 3.12 support #I really should do this with the shell option - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${NODEDUP_XCOMPILER_OPTIONS}> + list(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) + list(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${NODEDUP_XCOMPILER_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_CXX_STANDARD_FEATURE) + if(KOKKOS_CXX_STANDARD_FEATURE) #GREAT! 
I can do this the right way - TARGET_COMPILE_FEATURES(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) - IF (NOT KOKKOS_USE_CXX_EXTENSIONS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) - ENDIF() - ELSE() + target_compile_features(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) + if(NOT KOKKOS_USE_CXX_EXTENSIONS) + set_target_properties(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) + endif() + else() #OH, well, no choice but the wrong way - TARGET_COMPILE_OPTIONS(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) - ENDIF() -ENDFUNCTION() + target_compile_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) + endif() +endfunction() +function(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES" ${ARGN}) -FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES" - ${ARGN}) - - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - FOREACH(source ${PARSE_SOURCES}) + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) + endif() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + foreach(source ${PARSE_SOURCES}) set_source_files_properties(${source} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - ENDFOREACH() + endforeach() - IF(PARSE_STATIC) - SET(LINK_TYPE STATIC) - ENDIF() + if(PARSE_STATIC) + set(LINK_TYPE STATIC) + endif() - IF(PARSE_SHARED) - SET(LINK_TYPE SHARED) - ENDIF() + if(PARSE_SHARED) + set(LINK_TYPE SHARED) + endif() # MSVC and other platforms want to have # the headers included as source files # for better dependency detection - ADD_LIBRARY( - ${LIBRARY_NAME} - ${LINK_TYPE} - ${PARSE_HEADERS} - ${PARSE_SOURCES} - ) + add_library(${LIBRARY_NAME} ${LINK_TYPE} ${PARSE_HEADERS} ${PARSE_SOURCES}) - IF(PARSE_SHARED OR BUILD_SHARED_LIBS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} 
PROPERTIES - VERSION ${Kokkos_VERSION} - SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} + if(PARSE_SHARED OR BUILD_SHARED_LIBS) + set_target_properties( + ${LIBRARY_NAME} PROPERTIES VERSION ${Kokkos_VERSION} SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} ) - ENDIF() + endif() - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME}) + kokkos_internal_add_library_install(${LIBRARY_NAME}) #In case we are building in-tree, add an alias name #that matches the install Kokkos:: name - ADD_LIBRARY(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) -ENDFUNCTION() + add_library(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) +endfunction() -FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "ADD_BUILD_OPTIONS" - "" - "HEADERS" - ${ARGN} - ) - IF (KOKKOS_HAS_TRILINOS) - # We do not pass headers to trilinos. They would get installed - # to the default include folder, but we want headers installed - # preserving the directory structure, e.g. impl - # If headers got installed in both locations, it breaks some - # downstream packages - TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} - ADDED_LIB_TARGET_NAME_OUT ${LIBRARY_NAME}_TARGET_NAME ) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${${LIBRARY_NAME}_TARGET_NAME}) - ENDIF() - ELSE() - # Forward the headers, we want to know about all headers - # to make sure they appear correctly in IDEs - KOKKOS_INTERNAL_ADD_LIBRARY( - ${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME}) - ENDIF() - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_ADD_INTERFACE_LIBRARY NAME) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN}) - ELSE() - ADD_LIBRARY(${NAME} INTERFACE) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME}) - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - FOREACH(DIR ${ARGN}) - 
TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} $) - ENDFOREACH() -ENDFUNCTION() - -FUNCTION(KOKKOS_LIB_COMPILE_OPTIONS TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - KOKKOS_TARGET_COMPILE_OPTIONS(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) -ENDFUNCTION() - -MACRO(KOKKOS_ADD_TEST_DIRECTORIES) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_TEST_DIRECTORIES(${ARGN}) - ELSE() - IF(KOKKOS_ENABLE_TESTS) - FOREACH(TEST_DIR ${ARGN}) - ADD_SUBDIRECTORY(${TEST_DIR}) - ENDFOREACH() - ENDIF() - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_EXAMPLE_DIRECTORIES) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXAMPLE_DIRECTORIES(${ARGN}) - else() - IF(KOKKOS_ENABLE_EXAMPLES) - FOREACH(EXAMPLE_DIR ${ARGN}) - ADD_SUBDIRECTORY(${EXAMPLE_DIR}) - ENDFOREACH() - ENDIF() +function(KOKKOS_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "ADD_BUILD_OPTIONS" "" "HEADERS" ${ARGN}) + # Forward the headers, we want to know about all headers + # to make sure they appear correctly in IDEs + kokkos_internal_add_library(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) + if(PARSE_ADD_BUILD_OPTIONS) + kokkos_set_library_properties(${LIBRARY_NAME}) endif() -ENDMACRO() +endfunction() -MACRO(KOKKOS_ADD_BENCHMARK_DIRECTORIES) - IF(KOKKOS_ENABLE_BENCHMARKS) - FOREACH(BENCHMARK_DIR ${ARGN}) - ADD_SUBDIRECTORY(${BENCHMARK_DIR}) - ENDFOREACH() - ENDIF() -ENDMACRO() +function(KOKKOS_ADD_INTERFACE_LIBRARY NAME) + add_library(${NAME} INTERFACE) + kokkos_internal_add_library_install(${NAME}) +endfunction() + +function(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + foreach(DIR ${ARGN}) + target_include_directories(${TARGET} ${INCTYPE} $) + endforeach() +endfunction() + +function(KOKKOS_LIB_COMPILE_OPTIONS TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + target_compile_options(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) +endfunction() + +macro(KOKKOS_ADD_TEST_DIRECTORIES) + if(KOKKOS_ENABLE_TESTS) + foreach(TEST_DIR ${ARGN}) 
+ add_subdirectory(${TEST_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_EXAMPLE_DIRECTORIES) + if(KOKKOS_ENABLE_EXAMPLES) + foreach(EXAMPLE_DIR ${ARGN}) + add_subdirectory(${EXAMPLE_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_BENCHMARK_DIRECTORIES) + if(KOKKOS_ENABLE_BENCHMARKS) + foreach(BENCHMARK_DIR ${ARGN}) + add_subdirectory(${BENCHMARK_DIR}) + endforeach() + endif() +endmacro() diff --git a/lib/kokkos/cmake/msvc.cmake b/lib/kokkos/cmake/msvc.cmake index 85421bdbaa..1de13585c7 100644 --- a/lib/kokkos/cmake/msvc.cmake +++ b/lib/kokkos/cmake/msvc.cmake @@ -1,11 +1,9 @@ - -FUNCTION(kokkos_set_msvc_flags full_standard int_standard) - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - +function(kokkos_set_msvc_flags full_standard int_standard) + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + endif() +endfunction() diff --git a/lib/kokkos/cmake/pgi.cmake b/lib/kokkos/cmake/pgi.cmake index e98e849558..45f59dcd10 100644 --- a/lib/kokkos/cmake/pgi.cmake +++ b/lib/kokkos/cmake/pgi.cmake @@ -1,8 +1,6 @@ - function(kokkos_set_pgi_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) + 
set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake index 4e05d22534..52d8368d04 100644 --- a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. # @@ -26,8 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) - +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake index 3d5b03805d..f51bce5d64 100644 --- a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake @@ -15,29 +15,26 @@ # ************************************************************************ # @HEADER -SET(USE_THREADS FALSE) +set(USE_THREADS FALSE) -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. 
- IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake index 8560ec60f1..b449f45135 100644 --- a/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +tribits_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/lib/kokkos/containers/CMakeLists.txt b/lib/kokkos/containers/CMakeLists.txt index 0857d7007b..8ee8bb41a2 100644 --- a/lib/kokkos/containers/CMakeLists.txt +++ b/lib/kokkos/containers/CMakeLists.txt @@ -1,9 +1,9 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT KOKKOS_ENABLE_OPENACC) -KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) 
-KOKKOS_ADD_TEST_DIRECTORIES(performance_tests) -ENDIF() +if(NOT KOKKOS_ENABLE_OPENACC) + kokkos_add_test_directories(unit_tests) + kokkos_add_test_directories(performance_tests) +endif() diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt index e325e45e85..8d4d605b08 100644 --- a/lib/kokkos/containers/performance_tests/CMakeLists.txt +++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt @@ -1,7 +1,6 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) string(TOUPPER ${Tag} DEVICE) @@ -10,14 +9,8 @@ foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) if(Kokkos_ENABLE_${DEVICE}) message(STATUS "Sources Test${Tag}.cpp") - set(SOURCES - TestMain.cpp - Test${Tag}.cpp - ) + set(SOURCES TestMain.cpp Test${Tag}.cpp) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - ContainersPerformanceTest_${Tag} - SOURCES ${SOURCES} - ) + kokkos_add_executable_and_test(ContainersPerformanceTest_${Tag} SOURCES ${SOURCES}) endif() endforeach() diff --git a/lib/kokkos/containers/performance_tests/TestScatterView.hpp b/lib/kokkos/containers/performance_tests/TestScatterView.hpp index a74f833b9f..953b8bff6e 100644 --- a/lib/kokkos/containers/performance_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/performance_tests/TestScatterView.hpp @@ -25,8 +25,8 @@ namespace Perf { template void test_scatter_view(int m, int n) { - Kokkos::View original_view("original_view", - n); + Kokkos::View original_view("original_view", + n); { auto scatter_view = Kokkos::Experimental::create_scatter_view< 
Kokkos::Experimental::ScatterSum, Duplication, Contribution>( @@ -40,8 +40,8 @@ void test_scatter_view(int m, int n) { { auto num_threads = unique_token.size(); std::cout << "num_threads " << num_threads << '\n'; - Kokkos::View - hand_coded_duplicate_view("hand_coded_duplicate", num_threads, n); + Kokkos::View hand_coded_duplicate_view( + "hand_coded_duplicate", num_threads, n); auto f2 = KOKKOS_LAMBDA(int i) { auto thread_id = unique_token.acquire(); for (int j = 0; j < 10; ++j) { diff --git a/lib/kokkos/containers/src/CMakeLists.txt b/lib/kokkos/containers/src/CMakeLists.txt index b7d85ebf11..b386fbe675 100644 --- a/lib/kokkos/containers/src/CMakeLists.txt +++ b/lib/kokkos/containers/src/CMakeLists.txt @@ -1,33 +1,27 @@ #need these here for now -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -SET(KOKKOS_CONTAINERS_SRCS) -APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CONTAINER_HEADERS) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +set(KOKKOS_CONTAINERS_SRCS) +append_glob(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CONTAINER_HEADERS) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) - -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) -KOKKOS_ADD_LIBRARY( - kokkoscontainers - SOURCES ${KOKKOS_CONTAINERS_SRCS} - HEADERS ${KOKKOS_CONTAINERS_HEADERS} -) +kokkos_add_library(kokkoscontainers SOURCES ${KOKKOS_CONTAINERS_SRCS} 
HEADERS ${KOKKOS_CONTAINERS_HEADERS}) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscontainers ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore) +kokkos_link_internal_library(kokkoscontainers kokkoscore) #----------------------------------------------------------------------------- diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp index f50ab0a0f7..409260f021 100644 --- a/lib/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp @@ -271,7 +271,7 @@ class Bitset { offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask; - block = Impl::rotate_right(block, offset); + block = Impl::rotate_right(block, offset); return (((!(scan_direction & BIT_SCAN_REVERSE) ? 
Impl::bit_scan_forward(block) : Impl::int_log2(block)) + diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index a37a2bdceb..6a2e6f73a1 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -275,14 +275,29 @@ class DualView : public ViewTraits { const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : modified_flags(t_modified_flags("DualView::modified_flags")), - d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7) { - // without UVM, host View mirrors - if constexpr (Kokkos::Impl::has_type::value) - h_view = Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); - else - h_view = Kokkos::create_mirror_view(d_view); + : modified_flags(t_modified_flags("DualView::modified_flags")) { + if constexpr (Impl::ViewCtorProp::sequential_host_init) { + h_view = t_host(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + static_assert(Impl::ViewCtorProp::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!Impl::ViewCtorProp::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); + + d_view = Kokkos::create_mirror_view_and_copy( + typename traits::memory_space{}, h_view); + } else { + d_view = t_dev(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + + // without UVM, host View mirrors + if constexpr (Kokkos::Impl::has_type::value) + h_view = + Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); + else + h_view = Kokkos::create_mirror_view(d_view); + } } //! 
Copy constructor (shallow copy) @@ -338,23 +353,21 @@ class DualView : public ViewTraits { // does the DualView have only one device struct impl_dualview_is_single_device { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; // does the given device match the device of t_dev? template struct impl_device_matches_tdev_device { - enum : bool { - value = std::is_same::value - }; + enum : bool { value = std::is_same_v }; }; // does the given device match the device of t_host? template struct impl_device_matches_thost_device { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -362,7 +375,7 @@ class DualView : public ViewTraits { template struct impl_device_matches_thost_exec { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -370,7 +383,7 @@ class DualView : public ViewTraits { template struct impl_device_matches_tdev_exec { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -378,8 +391,8 @@ class DualView : public ViewTraits { template struct impl_device_matches_tdev_memory_space { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -389,11 +402,6 @@ class DualView : public ViewTraits { /// \brief Return a View on a specific device \c Device. /// - /// Please don't be afraid of the nested if_c expressions in the return - /// value's type. That just tells the method what the return type - /// should be: t_dev if the \c Device template parameter matches - /// this DualView's device type, else t_host. 
- /// /// For example, suppose you create a DualView on Cuda, like this: /// \code /// using dual_view_type = @@ -410,56 +418,47 @@ class DualView : public ViewTraits { /// typename dual_view_type::t_host hostView = DV.view (); /// \endcode template - KOKKOS_INLINE_FUNCTION const typename std::conditional_t< - impl_device_matches_tdev_device::value, t_dev, - typename std::conditional_t< - impl_device_matches_thost_device::value, t_host, - typename std::conditional_t< - impl_device_matches_thost_exec::value, t_host, - typename std::conditional_t< - impl_device_matches_tdev_exec::value, t_dev, - typename std::conditional_t< - impl_device_matches_tdev_memory_space::value, - t_dev, t_host>>>>> - view() const { - constexpr bool device_is_memspace = - std::is_same::value; - constexpr bool device_is_execspace = - std::is_same::value; - constexpr bool device_exec_is_t_dev_exec = - std::is_same::value; - constexpr bool device_mem_is_t_dev_mem = - std::is_same::value; - constexpr bool device_exec_is_t_host_exec = - std::is_same::value; - constexpr bool device_mem_is_t_host_mem = - std::is_same::value; - constexpr bool device_is_t_host_device = - std::is_same::value; - constexpr bool device_is_t_dev_device = - std::is_same::value; - - static_assert( - device_is_t_dev_device || device_is_t_host_device || - (device_is_memspace && - (device_mem_is_t_dev_mem || device_mem_is_t_host_mem)) || - (device_is_execspace && - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec)) || - ((!device_is_execspace && !device_is_memspace) && - ((device_mem_is_t_dev_mem || device_mem_is_t_host_mem) || - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec))), - "Template parameter to .view() must exactly match one of the " - "DualView's device types or one of the execution or memory spaces"); - - return Impl::if_c::value, - t_dev, t_host>::select(d_view, h_view); + KOKKOS_FUNCTION auto view() const { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return d_view; + } 
else { + static_assert(std::is_same_v, + "The template argument is a memory space but doesn't " + "match either of DualView's memory spaces!"); + return h_view; + } + } else { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return d_view; + } else { + static_assert(std::is_same_v, + "The template argument is an execution space but " + "doesn't match either of DualView's execution spaces!"); + return h_view; + } + } else { + static_assert(std::is_same_v, + "The template argument is neither a memory space, " + "execution space, or device!"); + if constexpr (std::is_same_v) + return d_view; + else { + static_assert(std::is_same_v, + "The template argument is a device but " + "doesn't match either of DualView's devices!"); + return h_view; + } + } + } +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } KOKKOS_INLINE_FUNCTION @@ -475,27 +474,27 @@ class DualView : public ViewTraits { template static int get_device_side() { constexpr bool device_is_memspace = - std::is_same::value; + std::is_same_v; constexpr bool device_is_execspace = - std::is_same::value; + std::is_same_v; constexpr bool device_exec_is_t_dev_exec = - std::is_same::value; + std::is_same_v; constexpr bool device_mem_is_t_dev_mem = - std::is_same::value; + std::is_same_v; constexpr bool device_exec_is_t_host_exec = - std::is_same::value; + std::is_same_v; constexpr bool device_mem_is_t_host_mem = - std::is_same::value; + std::is_same_v; constexpr bool device_is_t_host_device = - std::is_same::value; + std::is_same_v; constexpr bool device_is_t_dev_device = - std::is_same::value; + std::is_same_v; static_assert( device_is_t_dev_device || device_is_t_host_device || @@ -627,9 +626,9 @@ class DualView : public ViewTraits { template void sync(const std::enable_if_t< - (std::is_same::value) || - (std::is_same::value), + (std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::true_type{}); } @@ -637,9 +636,9 @@ class DualView : public ViewTraits { template void 
sync(const ExecutionSpace& exec, const std::enable_if_t< - (std::is_same::value) || - (std::is_same::value), + (std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::true_type{}, exec); } @@ -669,18 +668,18 @@ class DualView : public ViewTraits { template void sync(const std::enable_if_t< - (!std::is_same::value) || - (std::is_same::value), + (!std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::false_type{}); } template void sync(const ExecutionSpace& exec, const std::enable_if_t< - (!std::is_same::value) || - (std::is_same::value), + (!std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::false_type{}, exec); } @@ -943,12 +942,21 @@ class DualView : public ViewTraits { Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents); if (sizeMismatch) { - ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if constexpr (alloc_prop_input::initialize) { - h_view = create_mirror_view(typename t_host::memory_space(), d_view); + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + ::Kokkos::realloc(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); } else { - h_view = create_mirror_view(Kokkos::WithoutInitializing, - typename t_host::memory_space(), d_view); + ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); + if constexpr (alloc_prop_input::initialize) { + h_view = create_mirror_view(typename t_host::memory_space(), d_view); + } else { + h_view = create_mirror_view(Kokkos::WithoutInitializing, + typename t_host::memory_space(), d_view); + } } } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { @@ -1062,9 +1070,22 @@ class DualView : public ViewTraits { } }; - constexpr bool has_execution_space = 
alloc_prop_input::has_execution_space; + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!alloc_prop_input::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); - if constexpr (has_execution_space) { + if (sizeMismatch) { + sync(); + ::Kokkos::resize(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); + } + return; + } else if constexpr (alloc_prop_input::has_execution_space) { using ExecSpace = typename alloc_prop_input::execution_space; const auto& exec_space = Impl::get_property(arg_prop); @@ -1182,15 +1203,15 @@ class DualView : public ViewTraits { } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> extent(const iType& r) const { return d_view.extent(r); } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + int> extent_int(const iType& r) const { return static_cast(d_view.extent(r)); } diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 5f7fcaf69e..2f2f4433e7 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -38,6 +38,23 @@ class DynRankView; // forward declare namespace Impl { +template +struct ViewDataTypeFromRank { + using type = typename ViewDataTypeFromRank::type*; +}; + +template +struct ViewDataTypeFromRank { + using type = T; +}; + +template +KOKKOS_FUNCTION View::type, Args...> +as_view_of_rank_n( + DynRankView v, + std::enable_if_t::specialize, + void>>* = nullptr); + template struct DynRankDimTraits { enum : size_t 
{ unspecified = KOKKOS_INVALID_INDEX }; @@ -91,54 +108,59 @@ struct DynRankDimTraits { } // Create the layout for the rank-7 view. + // Because the underlying View is rank-7, preserve "unspecified" for + // dimension 8. + // Non-strided Layout template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.dimension[2] != unspecified ? layout.dimension[2] : 1, - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.dimension[7] != unspecified ? layout.dimension[7] : 1); + Layout new_layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified); + new_layout.stride = layout.stride; + return new_layout; } // LayoutStride template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value), Layout> + (std::is_same_v), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.stride[0], - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.stride[1], - layout.dimension[2] != unspecified ? 
layout.dimension[2] : 1, - layout.stride[2], - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.stride[3], - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.stride[4], - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.stride[5], - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.stride[6], - layout.dimension[7] != unspecified ? layout.dimension[7] : 1, - layout.stride[7]); + return Layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.stride[0], + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.stride[1], + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.stride[2], + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.stride[3], + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.stride[4], + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.stride[5], + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.stride[6], + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified, + layout.stride[7]); } // Extra overload to match that for specialize types template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v || + std::is_same_v), typename Traits::array_layout> createLayout(const Kokkos::Impl::ViewCtorProp& /* prop */, const typename Traits::array_layout& layout) { @@ -164,9 +186,8 @@ struct DynRankDimTraits { // Non-strided Layout template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value) && - std::is_integral::value, + (std::is_same_v || + std::is_same_v)&&std::is_integral_v, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? 
layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -182,8 +203,7 @@ reconstructLayout(const Layout& layout, iType dynrank) { // LayoutStride template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value) && - std::is_integral::value, + (std::is_same_v)&&std::is_integral_v, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -284,40 +304,43 @@ namespace Impl { template class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<(std::is_same::value && - std::is_void::value && - std::is_void::value && - (std::is_same::value || - ((std::is_same::value || - std::is_same::value || - std::is_same::value) && - (std::is_same::value || - std::is_same::value || - std::is_same::value)))), - Kokkos::Impl::ViewToDynRankViewTag>> { + std::enable_if_t< + (std::is_same_v && + std::is_void_v && + std::is_void_v && + (std::is_same_v || + ((std::is_same_v || + std::is_same_v || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>)))), + Kokkos::Impl::ViewToDynRankViewTag>> { private: enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { is_assignable_layout = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; public: @@ -345,7 +368,7 @@ class ViewMapping< src.layout()); // Check this for integer input1 for padding, etc dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle::assign( src.m_map.m_impl_handle, src.m_track.m_tracker); - dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed); + dst.m_track.m_tracker.assign(src.m_track.m_tracker, DstTraits::is_managed); dst.m_rank = Kokkos::View::rank(); } }; @@ -378,10 +401,11 @@ struct is_dyn_rank_view> : public 
std::true_type { template inline constexpr bool is_dyn_rank_view_v = is_dyn_rank_view::value; +// Inherit privately from View, this way we don't import anything funky +// for example the rank member vs the rank() function of DynRankView template -class DynRankView : public ViewTraits { - static_assert(!std::is_array::value && - !std::is_pointer::value, +class DynRankView : private View { + static_assert(!std::is_array_v && !std::is_pointer_v, "Cannot template DynRankView with array or pointer datatype - " "must be pod"); @@ -391,28 +415,66 @@ class DynRankView : public ViewTraits { template friend class Kokkos::Impl::ViewMapping; + size_t m_rank{}; + public: using drvtraits = ViewTraits; using view_type = View; - using traits = ViewTraits; - private: - using map_type = - Kokkos::Impl::ViewMapping; - using track_type = Kokkos::Impl::SharedAllocationTracker; - - track_type m_track; - map_type m_map; - unsigned m_rank; + using drdtraits = Impl::DynRankDimTraits; public: - KOKKOS_INLINE_FUNCTION + // typedefs from ViewTraits, overriden + using data_type = typename drvtraits::data_type; + using const_data_type = typename drvtraits::const_data_type; + using non_const_data_type = typename drvtraits::non_const_data_type; + + // typedefs from ViewTraits not overriden + using value_type = typename view_type::value_type; + using const_value_type = typename view_type::const_value_type; + using non_const_value_type = typename view_type::non_const_value_type; + using traits = typename view_type::traits; + using array_layout = typename view_type::array_layout; + + using execution_space = typename view_type::execution_space; + using memory_space = typename view_type::memory_space; + using device_type = typename view_type::device_type; + + using memory_traits = typename view_type::memory_traits; + using host_mirror_space = typename view_type::host_mirror_space; + using size_type = typename view_type::size_type; + + using reference_type = typename view_type::reference_type; + using 
pointer_type = typename view_type::pointer_type; + + using scalar_array_type = value_type; + using const_scalar_array_type = const_value_type; + using non_const_scalar_array_type = non_const_value_type; + using specialize = typename view_type::specialize; + + // typedefs in View for mdspan compatibility + // cause issues with MSVC+CUDA + // using layout_type = typename view_type::layout_type; + using index_type = typename view_type::index_type; + using element_type = typename view_type::element_type; + using rank_type = typename view_type::rank_type; + using reference = reference_type; + using data_handle_type = pointer_type; + + KOKKOS_FUNCTION view_type& DownCast() const { return (view_type&)(*this); } - KOKKOS_INLINE_FUNCTION + + // FIXME: this function make NO sense, the above one already is marked const + // Maybe one would want to get back a view of const?? + KOKKOS_FUNCTION const view_type& ConstDownCast() const { return (const view_type&)(*this); } + // FIXME: deprecate DownCast in favor of to_view + // KOKKOS_FUNCTION + // view_type to_view() const { return *this; } + // Types below - at least the HostMirror requires the value_type, NOT the rank // 7 data_type of the traits @@ -436,113 +498,32 @@ class DynRankView : public ViewTraits { typename drvtraits::array_layout, typename drvtraits::host_mirror_space>; + using host_mirror_type = HostMirror; //---------------------------------------- // Domain rank and extents // enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the // enum? 
- template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - extent(const iType& r) const { - return m_map.extent(r); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> - extent_int(const iType& r) const { - return static_cast(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() const; - //---------------------------------------- /* Deprecate all 'dimension' functions in favor of * ISO/C++ vocabulary 'extent'. */ - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.extent(0) * m_map.extent(1) * m_map.extent(2) * - m_map.extent(3) * m_map.extent(4) * m_map.extent(5) * - m_map.extent(6) * m_map.extent(7); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return (m_map.data() != nullptr); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping& - impl_map() const { - return m_map; - } - //---------------------------------------- private: enum { is_layout_left = - std::is_same::value, + std::is_same_v, is_layout_right = - std::is_same::value, + std::is_same_v, - is_layout_stride = std::is_same::value, + is_layout_stride = + std::is_same_v, - is_default_map = std::is_void::value && + is_default_map = std::is_void_v && (is_layout_left || is_layout_right || is_layout_stride) }; @@ -570,476 +551,150 @@ class DynRankView : public ViewTraits { #endif public: - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION constexpr unsigned rank() const { return m_rank; } - // operators () - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type operator()() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); + using view_type::data; + using view_type::extent; + using view_type::extent_int; // FIXME: not tested + using view_type::impl_map; // FIXME: not tested + using view_type::is_allocated; + using view_type::label; + using view_type::size; + using view_type::span; + using view_type::span_is_contiguous; // FIXME: not tested + using view_type::stride; // FIXME: not tested + using 
view_type::stride_0; // FIXME: not tested + using view_type::stride_1; // FIXME: not tested + using view_type::stride_2; // FIXME: not tested + using view_type::stride_3; // FIXME: not tested + using view_type::stride_4; // FIXME: not tested + using view_type::stride_5; // FIXME: not tested + using view_type::stride_6; // FIXME: not tested + using view_type::stride_7; // FIXME: not tested + using view_type::use_count; + + KOKKOS_FUNCTION reference_type + operator()(index_type i0 = 0, index_type i1 = 0, index_type i2 = 0, + index_type i3 = 0, index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } - // Rank 1 - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding...) - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - std::is_same::value && - std::is_integral::value, - reference_type> - operator[](const iType& i0) const { - // Phalanx is violating this, since they use the operator to access ALL - // elements in the allocation KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , - // this->rank(), m_track, m_map) ) - return data()[i0]; +// This is an accomodation for Phalanx, that is usint the operator[] to access +// all elements in a linear fashion even when the rank is not 1 +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { + if constexpr (std::is_same_v) { + return view_type::data()[i0]; + } else { + const size_t dim_scalar = view_type::impl_map().dimension_scalar(); + const size_t bytes = view_type::span() / dim_scalar; + + using tmp_view_type = + Kokkos::View>; + tmp_view_type rankone_view(view_type::data(), bytes, dim_scalar); + return rankone_view(i0); + } } - - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding... 
AND a Trilinos/Sacado scalar type ) - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - !std::is_same::value && - std::is_integral::value, - reference_type> - operator[](const iType& i0) const { - // auto map = impl_map(); - const size_t dim_scalar = m_map.dimension_scalar(); - const size_t bytes = this->span() / dim_scalar; - - using tmp_view_type = Kokkos::View< - DataType*, typename traits::array_layout, typename traits::device_type, - Kokkos::MemoryTraits>; - tmp_view_type rankone_view(this->data(), bytes, dim_scalar); - return rankone_view(i0); +#else + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { +#ifdef KOKKOS_ENABLE_DEBUG + if (rank() != 1u) + Kokkos::abort("DynRankView operator[] can only be used for rank-1"); +#endif + return view_type::operator()(i0, 0, 0, 0, 0, 0, 0); } +#endif - // Rank 1 parenthesis - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void::value && - std::is_integral::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); + KOKKOS_FUNCTION reference_type access(index_type i0 = 0, index_type i1 = 0, + index_type i2 = 0, index_type i3 = 0, + index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); - } - - template - 
KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && 
std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); - } - - // Rank 7 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, 
const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5, - const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type access() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); - } - - // Rank 1 - // Rank 1 parenthesis - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void::value && - std::is_integral::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( 
- (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), 
m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); - } - - // Rank 7 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5, const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - //---------------------------------------- // Standard constructor, destructor, and assignment operators... 
KOKKOS_DEFAULTED_FUNCTION ~DynRankView() = default; - KOKKOS_INLINE_FUNCTION - DynRankView() : m_track(), m_map(), m_rank() {} // Default ctor - - KOKKOS_INLINE_FUNCTION - DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} - - KOKKOS_INLINE_FUNCTION - DynRankView(DynRankView&& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} - - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(const DynRankView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; - return *this; - } - - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(DynRankView&& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; - return *this; - } + KOKKOS_DEFAULTED_FUNCTION DynRankView() = default; //---------------------------------------- // Compatible view copy constructor and assignment // may assign unmanaged from managed. + // Make this conditionally explicit? template - KOKKOS_INLINE_FUNCTION DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track, traits::is_managed), m_map(), m_rank(rhs.m_rank) { - using SrcTraits = typename DynRankView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); - } + KOKKOS_FUNCTION DynRankView(const DynRankView& rhs) + : view_type(rhs), m_rank(rhs.m_rank) {} template - KOKKOS_INLINE_FUNCTION DynRankView& operator=( - const DynRankView& rhs) { - using SrcTraits = typename DynRankView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); - m_track.assign(rhs.m_track, traits::is_managed); - m_rank = rhs.rank(); + KOKKOS_FUNCTION DynRankView& operator=(const DynRankView& rhs) { + view_type::operator=(rhs); + m_rank = rhs.m_rank; return *this; } +#if 0 // TODO: this will later be 
swapped in depending on whether the new View + // impl is active + private: + template + KOKKOS_FUNCTION typename view_type::extents_type create_rank7_extents( + const Ext& ext) { + return typename view_type::extents_type( + ext.rank() > 0 ? ext.extent(0) : 1, ext.rank() > 1 ? ext.extent(1) : 1, + ext.rank() > 2 ? ext.extent(2) : 1, ext.rank() > 3 ? ext.extent(3) : 1, + ext.rank() > 4 ? ext.extent(4) : 1, ext.rank() > 5 ? ext.extent(5) : 1, + ext.rank() > 6 ? ext.extent(6) : 1); + } + + public: // Copy/Assign View to DynRankView template - KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs) - : m_track(), m_map(), m_rank(View::rank()) { + KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs, + size_t new_rank) + : view_type(rhs.data_handle(), drdtraits::createLayout(rhs.layout())), + m_rank(new_rank) { + if (new_rank > rhs.rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); + } + + template + KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View& rhs) { + view_type::operator=(view_type( + rhs.data_handle(), + typename view_type::mapping_type(create_rank7_extents(rhs.extents())), + rhs.accessor())); + m_rank = rhs.rank(); + return *this; + } +#else + template + KOKKOS_FUNCTION DynRankView(const View& rhs, size_t new_rank) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping; static_assert(Mapping::is_assignable, - "Incompatible View to DynRankView copy construction"); + "Incompatible View to DynRankView copy assignment"); + if (new_rank > View::rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); Mapping::assign(*this, rhs); + m_rank = new_rank; } template - KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View& rhs) { + KOKKOS_FUNCTION DynRankView& operator=(const View& rhs) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping { 
static_assert(Mapping::is_assignable, "Incompatible View to DynRankView copy assignment"); Mapping::assign(*this, rhs); + m_rank = View::rank(); return *this; } +#endif + + template + KOKKOS_FUNCTION DynRankView(const View& rhs) + : DynRankView(rhs, View::rank()) {} //---------------------------------------- // Allocation tracking properties - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.use_count(); } - - inline const std::string label() const { - return m_track.template get_label(); - } - //---------------------------------------- // Allocation according to allocation properties and array layout // unused arg_layout dimensions must be set to KOKKOS_INVALID_INDEX so that // rank deduction can properly take place + // We need two variants to avoid calling host function from host device + // function warnings template - explicit inline DynRankView( - const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), - m_map(), - m_rank(Impl::DynRankDimTraits:: - template computeRank( - arg_prop, arg_layout)) { - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing DynRankView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, - Impl::DynRankDimTraits:: - template createLayout(arg_prop, arg_layout), - Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - } - - // Wrappers - template - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, - Impl::DynRankDimTraits:: - template createLayout(arg_prop, arg_layout)), - m_rank(Impl::DynRankDimTraits:: - template computeRank( - arg_prop, arg_layout)) { - static_assert( - std::is_same::pointer_type>::value, - "Constructing DynRankView to wrap user memory must supply matching " - "pointer type"); - } + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} + + template + explicit DynRankView( + const Kokkos::Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} //---------------------------------------- // Constructor(s) // Simple dimension-only layout + // We need two variants to avoid calling host function from host device + // function warnings template - explicit inline DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = 
KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} template - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} @@ -1188,16 +809,20 @@ class DynRankView : public ViewTraits { //---------------------------------------- // Memory span 
required to wrap these dimensions. + // FIXME: this function needs to be tested static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, + [[maybe_unused]] const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + // FIXME: check that arg_N7 is not set by user (in debug mode) + return view_type::required_allocation_size(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_INVALID_INDEX, + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, const size_t arg_N2 = KOKKOS_INVALID_INDEX, const size_t arg_N3 = KOKKOS_INVALID_INDEX, @@ -1205,55 +830,38 @@ class DynRankView : public ViewTraits { const size_t arg_N5 = KOKKOS_INVALID_INDEX, const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView(Kokkos::Impl::ViewCtorProp(arg_ptr), arg_N0, - arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} + : DynRankView( + Kokkos::Impl::ViewCtorProp( + arg_ptr), + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, typename traits::array_layout& arg_layout) - : DynRankView(Kokkos::Impl::ViewCtorProp(arg_ptr), - arg_layout) {} + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + typename 
traits::array_layout& arg_layout) + : DynRankView( + Kokkos::Impl::ViewCtorProp( + arg_ptr), + arg_layout) {} //---------------------------------------- // Shared scratch memory constructor - static inline size_t shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - const size_t num_passed_args = - (arg_N0 != KOKKOS_INVALID_INDEX) + (arg_N1 != KOKKOS_INVALID_INDEX) + - (arg_N2 != KOKKOS_INVALID_INDEX) + (arg_N3 != KOKKOS_INVALID_INDEX) + - (arg_N4 != KOKKOS_INVALID_INDEX) + (arg_N5 != KOKKOS_INVALID_INDEX) + - (arg_N6 != KOKKOS_INVALID_INDEX) + (arg_N7 != KOKKOS_INVALID_INDEX); - - if (std::is_void::value && - num_passed_args != traits::rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - {} - - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + // Note: We must pass 7 valid args since view_type is rank 7 + static inline size_t shmem_size( + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + return view_type::shmem_size(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, + arg_N6, arg_N7); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const typename traits::array_layout& arg_layout) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - reinterpret_cast( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits:: - 
createLayout(arg_layout) // is this correct? - )))), - arg_layout) {} + : view_type(arg_space, drdtraits::createLayout(arg_layout)), + m_rank(drdtraits::computeRank(arg_layout)) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, @@ -1264,21 +872,38 @@ class DynRankView : public ViewTraits { const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - reinterpret_cast( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits:: - createLayout(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, - arg_N6, arg_N7)))))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) {} + : DynRankView(arg_space, typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, + arg_N5, arg_N6, arg_N7)) {} + + KOKKOS_FUNCTION constexpr auto layout() const { + switch (rank()) { + case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); + case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); + case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); + case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); + case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); + case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); + case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); + case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); + default: + KOKKOS_IF_ON_HOST( + Kokkos::abort( + std::string( + "Calling DynRankView::layout on DRV of unexpected rank " + + std::to_string(rank())) + .c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "Calling DynRankView::layout on DRV of unexpected rank");) + } + // control flow should never reach here + return view_type::layout(); + } }; template 
-KOKKOS_INLINE_FUNCTION constexpr unsigned rank( - const DynRankView& DRV) { +KOKKOS_FUNCTION constexpr unsigned rank(const DynRankView& DRV) { return DRV.rank(); } // needed for transition to common constexpr method in view and dynrankview // to return rank @@ -1293,181 +918,46 @@ struct DynRankSubviewTag {}; } // namespace Impl -namespace Impl { - -template -class ViewMapping< - std::enable_if_t<(std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value)), - Kokkos::Impl::DynRankSubviewTag>, - SrcTraits, Args...> { - private: - enum { - RZ = false, - R0 = bool(is_integral_extent<0, Args...>::value), - R1 = bool(is_integral_extent<1, Args...>::value), - R2 = bool(is_integral_extent<2, Args...>::value), - R3 = bool(is_integral_extent<3, Args...>::value), - R4 = bool(is_integral_extent<4, Args...>::value), - R5 = bool(is_integral_extent<5, Args...>::value), - R6 = bool(is_integral_extent<6, Args...>::value) - }; - - enum { - rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + - unsigned(R4) + unsigned(R5) + unsigned(R6) - }; - - using array_layout = Kokkos::LayoutStride; - - using value_type = typename SrcTraits::value_type; - - using data_type = value_type*******; - - public: - using traits_type = Kokkos::ViewTraits; - - using type = - Kokkos::View; - - template - struct apply { - static_assert(Kokkos::is_memory_traits::value); - - using traits_type = - Kokkos::ViewTraits; - - using type = Kokkos::View; - }; - - using dimension = typename SrcTraits::dimension; - - template - struct ExtentGenerator { - KOKKOS_INLINE_FUNCTION - static SubviewExtents<7, rank> generator( - const dimension& dim, Arg0 arg0 = Arg0(), Arg1 arg1 = Arg1(), - Arg2 arg2 = Arg2(), Arg3 arg3 = Arg3(), Arg4 arg4 = Arg4(), - Arg5 arg5 = Arg5(), Arg6 arg6 = Arg6()) { - return SubviewExtents<7, rank>(dim, arg0, arg1, arg2, arg3, arg4, arg5, - arg6); - } - }; - - using ret_type = Kokkos::DynRankView; - - template - KOKKOS_INLINE_FUNCTION static 
ret_type subview( - const unsigned src_rank, Kokkos::DynRankView const& src, - Args... args) { - using DstType = ViewMapping; - - using DstDimType = std::conditional_t< - (rank == 0), ViewDimension<>, - std::conditional_t< - (rank == 1), ViewDimension<0>, - std::conditional_t< - (rank == 2), ViewDimension<0, 0>, - std::conditional_t< - (rank == 3), ViewDimension<0, 0, 0>, - std::conditional_t< - (rank == 4), ViewDimension<0, 0, 0, 0>, - std::conditional_t< - (rank == 5), ViewDimension<0, 0, 0, 0, 0>, - std::conditional_t< - (rank == 6), ViewDimension<0, 0, 0, 0, 0, 0>, - ViewDimension<0, 0, 0, 0, 0, 0, 0>>>>>>>>; - - using dst_offset_type = ViewOffset; - using dst_handle_type = typename DstType::handle_type; - - ret_type dst; - - const SubviewExtents<7, rank> extents = ExtentGenerator::generator( - src.m_map.m_impl_offset.m_dim, args...); - - dst_offset_type tempdst(src.m_map.m_impl_offset, extents); - - dst.m_track = src.m_track; - - dst.m_map.m_impl_offset.m_dim.N0 = tempdst.m_dim.N0; - dst.m_map.m_impl_offset.m_dim.N1 = tempdst.m_dim.N1; - dst.m_map.m_impl_offset.m_dim.N2 = tempdst.m_dim.N2; - dst.m_map.m_impl_offset.m_dim.N3 = tempdst.m_dim.N3; - dst.m_map.m_impl_offset.m_dim.N4 = tempdst.m_dim.N4; - dst.m_map.m_impl_offset.m_dim.N5 = tempdst.m_dim.N5; - dst.m_map.m_impl_offset.m_dim.N6 = tempdst.m_dim.N6; - - dst.m_map.m_impl_offset.m_stride.S0 = tempdst.m_stride.S0; - dst.m_map.m_impl_offset.m_stride.S1 = tempdst.m_stride.S1; - dst.m_map.m_impl_offset.m_stride.S2 = tempdst.m_stride.S2; - dst.m_map.m_impl_offset.m_stride.S3 = tempdst.m_stride.S3; - dst.m_map.m_impl_offset.m_stride.S4 = tempdst.m_stride.S4; - dst.m_map.m_impl_offset.m_stride.S5 = tempdst.m_stride.S5; - dst.m_map.m_impl_offset.m_stride.S6 = tempdst.m_stride.S6; - - dst.m_map.m_impl_handle = - dst_handle_type(src.m_map.m_impl_handle + - src.m_map.m_impl_offset( - extents.domain_offset(0), extents.domain_offset(1), - extents.domain_offset(2), extents.domain_offset(3), - extents.domain_offset(4), 
extents.domain_offset(5), - extents.domain_offset(6))); - - dst.m_rank = - (src_rank > 0 ? unsigned(R0) : 0) + (src_rank > 1 ? unsigned(R1) : 0) + - (src_rank > 2 ? unsigned(R2) : 0) + (src_rank > 3 ? unsigned(R3) : 0) + - (src_rank > 4 ? unsigned(R4) : 0) + (src_rank > 5 ? unsigned(R5) : 0) + - (src_rank > 6 ? unsigned(R6) : 0); - - return dst; - } -}; - -} // namespace Impl - template using Subdynrankview = typename Kokkos::Impl::ViewMapping::ret_type; -template -KOKKOS_INLINE_FUNCTION Subdynrankview, Args...> -subdynrankview(const Kokkos::DynRankView& src, Args... args) { - if (src.rank() > sizeof...(Args)) // allow sizeof...(Args) >= src.rank(), - // ignore the remaining args - { - Kokkos::abort( - "subdynrankview: num of args must be >= rank of the source " - "DynRankView"); - } +template +KOKKOS_INLINE_FUNCTION auto subdynrankview( + const DynRankView& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + auto sub = subview(drv.DownCast(), arg0, arg1, arg2, arg3, arg4, arg5, arg6); + using sub_t = decltype(sub); + size_t new_rank = (drv.rank() > 0 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 1 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 2 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 3 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 4 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 5 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 6 && !std::is_integral_v ? 1 : 0); - using metafcn = - Kokkos::Impl::ViewMapping, Args...>; - - return metafcn::subview(src.rank(), src, args...); + using return_type = + DynRankView; + return static_cast( + DynRankView( + sub, new_rank)); } - -// Wrapper to allow subview function name -template -KOKKOS_INLINE_FUNCTION Subdynrankview, Args...> -subview(const Kokkos::DynRankView& src, Args... 
args) { - return subdynrankview(src, args...); +template +KOKKOS_INLINE_FUNCTION auto subview( + const DynRankView& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + return subdynrankview(drv, arg0, arg1, arg2, arg3, arg4, arg5, arg6); } } // namespace Kokkos @@ -1482,12 +972,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const DynRankView& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && lhs.rank() == rhs.rank() && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && lhs.extent(2) == rhs.extent(2) && @@ -1638,11 +1128,11 @@ namespace Impl { underlying memory, to facilitate implementation of deep_copy() and other routines that are defined on View */ template -KOKKOS_FUNCTION auto as_view_of_rank_n( +KOKKOS_FUNCTION View::type, Args...> +as_view_of_rank_n( DynRankView v, - typename std::enable_if::specialize, void>::value>::type* = - nullptr) { + std::enable_if_t< + std::is_same_v::specialize, void>>*) { if (v.rank() != N) { KOKKOS_IF_ON_HOST( const std::string message = @@ -1653,7 +1143,7 @@ KOKKOS_FUNCTION auto as_view_of_rank_n( Kokkos::abort("Converting DynRankView to a View of mis-matched rank!");) } - auto layout = v.impl_map().layout(); + auto layout = v.DownCast().layout(); if constexpr (std::is_same_v || std::is_same_v || @@ -1691,43 +1181,16 @@ void apply_to_view_of_static_rank(Function&& f, DynRankView a) { } // namespace Impl -template -KOKKOS_INLINE_FUNCTION constexpr auto DynRankView::layout() const -> - typename traits::array_layout { - switch (rank()) { - case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); - case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); - 
case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); - case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); - case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); - case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); - case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); - case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); - default: - KOKKOS_IF_ON_HOST( - Kokkos::abort( - std::string( - "Calling DynRankView::layout on DRV of unexpected rank " + - std::to_string(rank())) - .c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "Calling DynRankView::layout on DRV of unexpected rank");) - } - // control flow should never reach here - return m_map.layout(); -} - /** \brief Deep copy a value from Host memory into a view. */ template inline void deep_copy( const ExecSpace& e, const DynRankView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::non_const_value_type, - typename ViewTraits::value_type>::value, + std::is_same_v::non_const_value_type, + typename ViewTraits::value_type>, "deep_copy requires non-const type"); Impl::apply_to_view_of_static_rank( @@ -1738,8 +1201,8 @@ template inline void deep_copy( const DynRankView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { Impl::apply_to_view_of_static_rank([=](auto view) { deep_copy(view, value); }, dst); } @@ -1750,8 +1213,8 @@ inline void deep_copy( const ExecSpace& e, typename ViewTraits::non_const_value_type& dst, const DynRankView& src, - std::enable_if_t::specialize, - void>::value>* = 0) { + std::enable_if_t::specialize, + void>>* = 0) { deep_copy(e, dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1759,8 +1222,8 @@ template inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const DynRankView& 
src, - std::enable_if_t::specialize, - void>::value>* = 0) { + std::enable_if_t::specialize, + void>>* = 0) { deep_copy(dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1773,15 +1236,13 @@ inline void deep_copy( template inline void deep_copy( const ExecSpace& exec_space, const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void::value && - std::is_void::value && - (Kokkos::is_dyn_rank_view::value || - Kokkos::is_dyn_rank_view::value))>* = nullptr) { - static_assert( - std::is_same::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v && + std::is_void_v && + (Kokkos::is_dyn_rank_view::value || + Kokkos::is_dyn_rank_view::value))>* = nullptr) { + static_assert(std::is_same_v, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1826,15 +1287,13 @@ inline void deep_copy( template inline void deep_copy( const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void::value && - std::is_void::value && - (Kokkos::is_dyn_rank_view::value || - Kokkos::is_dyn_rank_view::value))>* = nullptr) { - static_assert( - std::is_same::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v && + std::is_void_v && + (Kokkos::is_dyn_rank_view::value || + Kokkos::is_dyn_rank_view::value))>* = nullptr) { + static_assert(std::is_same_v, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1894,7 +1353,7 @@ struct MirrorDRViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1909,26 +1368,6 @@ struct MirrorDRViewType { std::conditional_t; }; -template -struct MirrorDRVType { - // The incoming view_type - using src_view_type = typename Kokkos::DynRankView; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is 
the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::DynRankView; -}; - } // namespace Impl namespace Impl { @@ -1945,10 +1384,9 @@ inline auto create_mirror(const DynRankView& src, arg_prop, std::string(src.label()).append("_mirror")); if constexpr (Impl::ViewCtorProp::has_memory_space) { - using dst_type = typename Impl::MirrorDRVType< + using dst_type = typename Impl::MirrorDRViewType< typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type; - + P...>::dest_view_type; return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); } else { @@ -1989,7 +1427,8 @@ template ::value && std::is_void_v::specialize>>> -auto create_mirror(const Space&, const Kokkos::DynRankView& src) { +inline auto create_mirror(const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } @@ -1999,8 +1438,8 @@ template ::value && std::is_void_v::specialize>>> -auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::DynRankView& src) { +inline auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } @@ -2026,12 +1465,12 @@ inline auto create_mirror_view( [[maybe_unused]] const typename Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename DynRankView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename DynRankView< - T, 
P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename DynRankView< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename DynRankView::data_type, + typename DynRankView::HostMirror::data_type>) { return typename DynRankView::HostMirror(src); } else { return Kokkos::Impl::choose_create_mirror(src, arg_prop); @@ -2102,7 +1541,7 @@ inline auto create_mirror_view( // view_alloc template ::specialize>::value>> + std::is_void_v::specialize>>> auto create_mirror_view_and_copy( [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::DynRankView& src) { diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index a4b74e246e..caae3f791f 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -40,10 +40,10 @@ struct ChunkedArrayManager { using pointer_type = ValueType*; using track_type = Kokkos::Impl::SharedAllocationTracker; - ChunkedArrayManager() = default; - ChunkedArrayManager(ChunkedArrayManager const&) = default; - ChunkedArrayManager(ChunkedArrayManager&&) = default; - ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; + ChunkedArrayManager() = default; + ChunkedArrayManager(ChunkedArrayManager const&) = default; + ChunkedArrayManager(ChunkedArrayManager&&) = default; + ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default; template @@ -129,10 +129,10 @@ struct ChunkedArrayManager { /// allocation template struct Destroy { - Destroy() = default; - Destroy(Destroy&&) = default; - Destroy(const Destroy&) = default; - Destroy& operator=(Destroy&&) = default; + Destroy() = default; + Destroy(Destroy&&) = default; + Destroy(const Destroy&) = default; + Destroy& operator=(Destroy&&) = default; Destroy& operator=(const Destroy&) = default; Destroy(std::string label, value_type** arg_chunk, @@ 
-250,7 +250,7 @@ class DynamicView : public Kokkos::ViewTraits { // It is assumed that the value_type is trivially copyable; // when this is not the case, potential problems can occur. - static_assert(std::is_void::value, + static_assert(std::is_void_v, "DynamicView only implemented for non-specialized View type"); private: @@ -363,7 +363,7 @@ class DynamicView : public Kokkos::ViewTraits { enum { reference_type_is_lvalue_reference = - std::is_lvalue_reference::value + std::is_lvalue_reference_v }; KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { @@ -463,11 +463,11 @@ class DynamicView : public Kokkos::ViewTraits { //---------------------------------------------------------------------- - ~DynamicView() = default; - DynamicView() = default; - DynamicView(DynamicView&&) = default; - DynamicView(const DynamicView&) = default; - DynamicView& operator=(DynamicView&&) = default; + ~DynamicView() = default; + DynamicView() = default; + DynamicView(DynamicView&&) = default; + DynamicView(const DynamicView&) = default; + DynamicView& operator=(DynamicView&&) = default; DynamicView& operator=(const DynamicView&) = default; template @@ -572,7 +572,7 @@ struct MirrorDynamicViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -665,9 +665,9 @@ template ::value && std::is_void_v::specialize>>> -typename Kokkos::Impl::MirrorDynamicViewType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::DynamicView& src) { +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } @@ -693,14 +693,14 @@ inline auto create_mirror_view( const Kokkos::Experimental::DynamicView& src, [[maybe_unused]] const 
Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space> && + std::is_same_v::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>) { return typename Kokkos::Experimental::DynamicView::HostMirror(src); } else { @@ -835,21 +835,17 @@ inline void deep_copy(const View& dst, using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. 
+ Kokkos::Impl::ViewRemap(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } template @@ -861,21 +857,17 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView& dst, using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. 
+ Kokkos::Impl::ViewRemap(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } namespace Impl { @@ -964,7 +956,7 @@ struct ViewCopy, // view_alloc template ::specialize>::value>> + std::is_void_v::specialize>>> auto create_mirror_view_and_copy( [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::Experimental::DynamicView& src) { diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 3adc70b190..cf23c25b86 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -50,9 +50,9 @@ inline constexpr bool is_offset_view_v = is_offset_view::value; #define KOKKOS_INVALID_INDEX_RANGE \ { KOKKOS_INVALID_OFFSET, KOKKOS_INVALID_OFFSET } -template ::value && - std::is_signed::value, - iType> = 0> +template && std::is_signed_v, + iType> = 0> using IndexRange = Kokkos::Array; using index_list_type = std::initializer_list; @@ -118,11 +118,11 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( (enum {LEN = 1024}; char buffer[LEN]; const std::string label = tracker.template get_label(); int n = snprintf(buffer, LEN, - "OffsetView bounds error of view labeled %s (", - label.c_str()); + "OffsetView bounds error of view labeled %s (", + label.c_str()); offsetview_error_operator_bounds<0>(buffer + n, LEN - n, map, begins, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE( (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) @@ -180,44 +180,40 @@ void runtime_check_rank_device(const size_t rank_dynamic, const size_t rank, } // namespace Impl template -class OffsetView : public ViewTraits { - public: - using traits = ViewTraits; - +class OffsetView : public View { private: template friend class OffsetView; - template - friend class View; // FIXME delete this line - template - friend class Kokkos::Impl::ViewMapping; - using map_type = 
Kokkos::Impl::ViewMapping; - using track_type = Kokkos::Impl::SharedAllocationTracker; + using base_t = View; public: - enum { Rank = map_type::Rank }; - using begins_type = Kokkos::Array; + // typedefs to reduce typing base_t:: further down + using traits = typename base_t::traits; + // FIXME: should be base_t::index_type after refactor + using index_type = typename base_t::memory_space::size_type; + using pointer_type = typename base_t::pointer_type; + + using begins_type = Kokkos::Array; template ::value, iType> = 0> + std::enable_if_t, iType> = 0> KOKKOS_FUNCTION int64_t begin(const iType local_dimension) const { - return local_dimension < Rank ? m_begins[local_dimension] - : KOKKOS_INVALID_OFFSET; + return static_cast(local_dimension) < base_t::rank() + ? m_begins[local_dimension] + : KOKKOS_INVALID_OFFSET; } KOKKOS_FUNCTION begins_type begins() const { return m_begins; } template ::value, iType> = 0> + std::enable_if_t, iType> = 0> KOKKOS_FUNCTION int64_t end(const iType local_dimension) const { - return begin(local_dimension) + m_map.extent(local_dimension); + return begin(local_dimension) + base_t::extent(local_dimension); } private: - track_type m_track; - map_type m_map; begins_type m_begins; public: @@ -245,529 +241,60 @@ class OffsetView : public ViewTraits { typename traits::array_layout, typename traits::host_mirror_space>; - //---------------------------------------- - // Domain rank and extents - - /** \brief rank() to be implemented - */ - // KOKKOS_FUNCTION - // static - // constexpr unsigned rank() { return map_type::Rank; } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - size_t> - extent(const iType& r) const { - return m_map.extent(r); + template + KOKKOS_FUNCTION typename base_t::reference_type offset_operator( + std::integer_sequence, OtherIndexTypes... 
indices) const { + return base_t::operator()((indices - m_begins[I])...); } - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - int> - extent_int(const iType& r) const { - return static_cast(m_map.extent(r)); - } - - KOKKOS_FUNCTION constexpr typename traits::array_layout layout() const { - return m_map.layout(); - } - - KOKKOS_FUNCTION constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } - KOKKOS_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); } - KOKKOS_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); } - KOKKOS_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); } - KOKKOS_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); } - KOKKOS_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); } - KOKKOS_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); } - KOKKOS_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); - } - - template - KOKKOS_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_FUNCTION constexpr pointer_type data() const { return m_map.data(); } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_FUNCTION - const Kokkos::Impl::ViewMapping& implementation_map() const { - return m_map; - } - - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same::value; - - static constexpr bool is_layout_right = - std::is_same::value; - - static constexpr bool is_layout_stride = - std::is_same::value; - - static constexpr bool is_default_map = - std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); \ - Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ - typename traits::memory_space> \ - ARG; - -#else - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); - + template +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_convertible_v && + std::is_nothrow_constructible_v && + (base_t::rank() == 1)) #endif - public: - //------------------------------ - // Rank 0 operator() - - 
KOKKOS_FORCEINLINE_FUNCTION - reference_type operator()() const { return m_map.reference(); } - //------------------------------ - // Rank 1 operator() - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (1 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - //------------------------------ - // Rank 1 operator[] - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (1 == Rank) && !is_default_map), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template - 
KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - - //------------------------------ - // Rank 2 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (2 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.reference(j0, j1); - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (2 == Rank) && - is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - if constexpr (is_layout_left) { - if constexpr (traits::rank_dynamic == 0) - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_dim.N0 * j1]; - else - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_stride * j1]; - } else if constexpr (is_layout_right) { - if constexpr (traits::rank_dynamic == 0) - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_dim.N1 * j0]; - else - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_stride * j0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[j0 * m_map.m_impl_offset.m_stride.S0 + - j1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined(KOKKOS_COMPILER_INTEL) - __builtin_unreachable(); + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator[]( + const 
OtherIndexType& idx) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert(std::is_convertible_v && + std::is_nothrow_constructible_v && + (base_t::rank() == 1)); #endif + return base_t::operator[](idx - m_begins[0]); } - //------------------------------ - // Rank 3 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (3 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2)]; + template +#ifndef KOKKOS_ENABLE_CXX17 + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == base_t::rank())) +#endif + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator()( + OtherIndexTypes... indices) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert( + (std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && ...) && + (sizeof...(OtherIndexTypes) == base_t::rank())); +#endif + return offset_operator(std::make_index_sequence(), + indices...); } - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (3 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.reference(j0, j1, j2); - } + template + KOKKOS_FUNCTION constexpr typename base_t::reference_type access( + OtherIndexTypes... 
args) const = delete; - //------------------------------ - // Rank 4 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (4 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (4 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.reference(j0, j1, j2, j3); - } - - //------------------------------ - // Rank 5 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (5 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (5 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - 
const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.reference(j0, j1, j2, j3, j4); - } - - //------------------------------ - // Rank 6 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (6 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (6 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.reference(j0, j1, j2, j3, j4, j5); - } - - //------------------------------ - // Rank 7 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (7 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const 
I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (7 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6); - } - - //------------------------------ - // Rank 8 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (8 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map - .m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, 
j6, j7)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (8 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6, j7); - } - -#undef KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY + //---------------------------------------- //---------------------------------------- // Standard destructor, constructors, and assignment operators - KOKKOS_DEFAULTED_FUNCTION - ~OffsetView() = default; - KOKKOS_FUNCTION - OffsetView() : m_track(), m_map() { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = KOKKOS_INVALID_OFFSET; - } - - KOKKOS_FUNCTION - OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(rhs.m_map), - m_begins(rhs.m_begins) {} - - KOKKOS_FUNCTION - OffsetView(OffsetView&& rhs) - : m_track(std::move(rhs.m_track)), - m_map(std::move(rhs.m_map)), - m_begins(std::move(rhs.m_begins)) {} - - KOKKOS_FUNCTION - OffsetView& operator=(const OffsetView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_begins = rhs.m_begins; - return *this; - } - - KOKKOS_FUNCTION - OffsetView& operator=(OffsetView&& rhs) { - m_track = std::move(rhs.m_track); - m_map = std::move(rhs.m_map); - m_begins = std::move(rhs.m_begins); - return *this; + OffsetView() : base_t() { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = KOKKOS_INVALID_OFFSET; } // interoperability with View @@ -778,20 +305,10 @@ class 
OffsetView : public ViewTraits { public: KOKKOS_FUNCTION - view_type view() const { - view_type v(m_track, m_map); - return v; - } + view_type view() const { return *this; } template - KOKKOS_FUNCTION OffsetView(const View& aview) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - + KOKKOS_FUNCTION OffsetView(const View& aview) : base_t(aview) { for (size_t i = 0; i < View::rank(); ++i) { m_begins[i] = 0; } @@ -800,19 +317,14 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION OffsetView(const View& aview, const index_list_type& minIndices) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(aview) { + KOKKOS_IF_ON_HOST( + (Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, aview.label());)) + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -820,27 +332,13 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION OffsetView(const View& aview, const begins_type& beg) - : m_track(aview.impl_track()), m_map(), m_begins(beg) { - using SrcTraits = typename OffsetView::traits; - using Mapping = 
Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - } + : base_t(aview), m_begins(beg) {} // may assign unmanaged from managed. template KOKKOS_FUNCTION OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(), - m_begins(rhs.m_begins) { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); // swb what about assign? - } + : base_t(rhs.view()), m_begins(rhs.m_begins) {} private: enum class subtraction_failure { @@ -879,7 +377,7 @@ class OffsetView : public ViewTraits { static subtraction_failure runtime_check_begins_ends_host(const B& begins, const E& ends) { std::string message; - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) message += "begins.size() " "(" + @@ -887,19 +385,19 @@ class OffsetView : public ViewTraits { ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) message += "ends.size() " "(" + - std::to_string(begins.size()) + + std::to_string(ends.size()) + ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; @@ -941,7 +439,7 @@ class OffsetView : public ViewTraits { message = "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView\n" + message; - Kokkos::Impl::throw_runtime_exception(message); + Kokkos::abort(message.c_str()); } return subtraction_failure::none; @@ -951,11 +449,11 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION static subtraction_failure runtime_check_begins_ends_device( const B& begins, const E& ends) { - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: 
for unmanaged " "OffsetView: begins has bad Rank"); - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: ends has bad Rank"); @@ -993,20 +491,25 @@ class OffsetView : public ViewTraits { // Precondition: begins.size() == ends.size() == m_begins.size() == Rank template KOKKOS_FUNCTION OffsetView(const pointer_type& p, const B& begins_, - const E& ends_, - subtraction_failure) - : m_track() // no tracking - , - m_map(Kokkos::Impl::ViewCtorProp(p), - typename traits::array_layout( - Rank > 0 ? at(ends_, 0) - at(begins_, 0) : 0, - Rank > 1 ? at(ends_, 1) - at(begins_, 1) : 0, - Rank > 2 ? at(ends_, 2) - at(begins_, 2) : 0, - Rank > 3 ? at(ends_, 3) - at(begins_, 3) : 0, - Rank > 4 ? at(ends_, 4) - at(begins_, 4) : 0, - Rank > 5 ? at(ends_, 5) - at(begins_, 5) : 0, - Rank > 6 ? at(ends_, 6) - at(begins_, 6) : 0, - Rank > 7 ? at(ends_, 7) - at(begins_, 7) : 0)) { + const E& ends_, subtraction_failure) + : base_t(Kokkos::view_wrap(p), + typename traits::array_layout( + base_t::rank() > 0 ? at(ends_, 0) - at(begins_, 0) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 1 ? at(ends_, 1) - at(begins_, 1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 2 ? at(ends_, 2) - at(begins_, 2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 3 ? at(ends_, 3) - at(begins_, 3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 4 ? at(ends_, 4) - at(begins_, 4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 5 ? at(ends_, 5) - at(begins_, 5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 6 ? at(ends_, 6) - at(begins_, 6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 7 ? 
at(ends_, 7) - at(begins_, 7) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG)) { for (size_t i = 0; i != m_begins.size(); ++i) { m_begins[i] = at(begins_, i); }; @@ -1040,15 +543,6 @@ class OffsetView : public ViewTraits { : OffsetView(p, begins_, ends_, runtime_check_begins_ends(begins_, ends_)) {} - //---------------------------------------- - // Allocation tracking properties - KOKKOS_FUNCTION - int use_count() const { return m_track.use_count(); } - - const std::string label() const { - return m_track.template get_label(); - } - // Choosing std::pair as type for the arguments allows constructing an // OffsetView using list initialization syntax, e.g., // OffsetView dummy("dummy", {-1, 3}, {-2,2}); @@ -1070,18 +564,34 @@ class OffsetView : public ViewTraits { const std::pair range7 = KOKKOS_INVALID_INDEX_RANGE ) - : OffsetView( - Kokkos::Impl::ViewCtorProp(arg_label), - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(Kokkos::Impl::ViewCtorProp(arg_label), + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG - 1 + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template explicit OffsetView( @@ -1094,18 +604,34 @@ class OffsetView : public ViewTraits { const std::pair range5 = KOKKOS_INVALID_INDEX_RANGE, const std::pair range6 = KOKKOS_INVALID_INDEX_RANGE, const std::pair range7 = KOKKOS_INVALID_INDEX_RANGE) - : OffsetView( - arg_prop, - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(arg_prop, + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template explicit KOKKOS_FUNCTION OffsetView( @@ -1113,9 +639,14 @@ class OffsetView : public ViewTraits { std::enable_if_t::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { + : base_t(arg_prop, arg_layout) { + KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, + base_t::label());)) + + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -1132,42 +663,9 @@ class OffsetView : public ViewTraits { std::enable_if_t::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track(), - m_map() - - { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = minIndices.begin()[i]; - - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Kokkos::Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "OffsetView allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing OffsetView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, - Kokkos::Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(arg_prop, arg_layout) { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = minIndices.begin()[i]; } }; @@ -1177,7 +675,7 @@ class OffsetView : public ViewTraits { */ template KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView& V) { - return V.Rank; + return V.rank(); } // Temporary until added to view //---------------------------------------------------------------------------- @@ -1185,8 +683,8 @@ KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView& V) { namespace Impl { template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> -shift_input(const T arg, const int64_t offset) { +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> shift_input( + const T arg, const int64_t offset) { return arg - offset; } @@ -1197,13 +695,13 @@ Kokkos::ALL_t shift_input(const Kokkos::ALL_t arg, const int64_t /*offset*/) { template KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, Kokkos::pair> + std::enable_if_t, Kokkos::pair> shift_input(const Kokkos::pair arg, const int64_t offset) { return Kokkos::make_pair(arg.first - offset, arg.second - offset); } template -inline std::enable_if_t::value, std::pair> -shift_input(const std::pair arg, const int64_t offset) { +inline std::enable_if_t, std::pair> shift_input( + const std::pair arg, const int64_t offset) { return std::make_pair(arg.first - 
offset, arg.second - offset); } @@ -1212,7 +710,7 @@ KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin( const size_t i, Kokkos::Array& subviewBegins, std::enable_if_t shiftedArg, const Arg arg, const A viewBegins, size_t& counter) { - if (!std::is_integral::value) { + if (!std::is_integral_v) { subviewBegins[counter] = shiftedArg == arg ? viewBegins[i] : 0; counter++; } @@ -1621,7 +1119,7 @@ KOKKOS_INLINE_FUNCTION ViewTraits, Args...>::type>::type subview(const OffsetView& src, Args... args) { static_assert( - OffsetView::Rank == sizeof...(Args), + OffsetView::rank() == sizeof...(Args), "subview requires one argument for each source OffsetView rank"); return Kokkos::Experimental::Impl::subview_offset(src, args...); @@ -1641,12 +1139,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1672,12 +1170,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1704,11 +1202,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::non_const_value_type, - typename 
ViewTraits::value_type>::value, + std::is_same_v::non_const_value_type, + typename ViewTraits::value_type>, "deep_copy requires non-const type"); auto dstView = dst.view(); @@ -1719,11 +1217,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, const Experimental::OffsetView& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1733,11 +1231,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, const View& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1748,11 +1246,11 @@ template inline void deep_copy( const View& dst, const Experimental::OffsetView& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); Kokkos::deep_copy(dst, value.view()); @@ -1770,7 +1268,7 @@ struct MirrorOffsetViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1786,27 +1284,6 @@ struct MirrorOffsetViewType { std::conditional_t; }; -template 
-struct MirrorOffsetType { - // The incoming view_type - using src_view_type = typename Kokkos::Experimental::OffsetView; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it.) - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = - Kokkos::Experimental::OffsetView; -}; - } // namespace Impl namespace Impl { @@ -1825,10 +1302,12 @@ inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return typename Kokkos::Impl::MirrorOffsetType::view_type( - prop_copy, src.layout(), - {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), - src.begin(5), src.begin(6), src.begin(7)}); + return typename Kokkos::Impl::MirrorOffsetViewType< + Space, T, P...>::dest_view_type(prop_copy, src.layout(), + {src.begin(0), src.begin(1), + src.begin(2), src.begin(3), + src.begin(4), src.begin(5), + src.begin(6), src.begin(7)}); } else { return typename Kokkos::Experimental::OffsetView::HostMirror( Kokkos::create_mirror(arg_prop, src.view()), src.begins()); @@ -1877,9 +1356,9 @@ template ::value && std::is_void_v::specialize>>> -typename Kokkos::Impl::MirrorOffsetType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::OffsetView& src) { +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::OffsetView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } @@ -1905,14 +1384,14 
@@ inline auto create_mirror_view( const Kokkos::Experimental::OffsetView& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space> && + std::is_same_v::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>) { return typename Kokkos::Experimental::OffsetView::HostMirror(src); } else { diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp index 9d04cf6acd..52af567c61 100644 --- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -184,16 +184,16 @@ struct DefaultContribution -struct DefaultDuplication { +struct DefaultDuplication { using type = Kokkos::Experimental::ScatterNonDuplicated; }; template <> -struct DefaultContribution { using type = Kokkos::Experimental::ScatterAtomic; }; template <> -struct DefaultContribution { using type = Kokkos::Experimental::ScatterAtomic; }; @@ -532,32 +532,56 @@ void args_to_array(size_t* array, int pos, T dim0, Dims... dims) { subview where the index specified is the largest-stride one. */ template struct Slice { - using next = Slice; - using value_type = typename next::value_type; - - static value_type get(V const& src, const size_t i, Args... args) { + using next = Slice; + static auto get(V const& src, const size_t i, Args... args) { return next::get(src, i, Kokkos::ALL, args...); } }; template struct Slice { - using value_type = - typename Kokkos::Impl::ViewMapping::type; - static value_type get(V const& src, const size_t i, Args... 
args) { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, i, args...); } }; template struct Slice { - using value_type = - typename Kokkos::Impl::ViewMapping::type; - static value_type get(V const& src, const size_t i, Args... args) { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, args..., i); } }; +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +template +struct Slice { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; + +template +struct Slice, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; +#endif + template struct ReduceDuplicates; @@ -905,7 +929,7 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral::value, + std::is_integral_v && view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(arg); @@ -1028,10 +1052,7 @@ class ScatterView::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutRight, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1233,8 +1254,8 @@ class ScatterView::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutLeft, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1460,7 +1478,7 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral::value, + std::is_integral_v && 
view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(thread_id, arg); @@ -1470,9 +1488,9 @@ class ScatterAccess::array_layout, typename ViewTraits::device_type, Op, std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, Duplication>, std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultContribution< typename ViewTraits::execution_space, typename std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, Duplication>>::type, diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 8ce868cac2..ec1b8905c7 100644 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -190,7 +190,7 @@ struct GraphRowViewConst { const typename GraphType::entries_type& colidx_in, const ordinal_type& stride, const ordinal_type& count, const OffsetType& idx, - const std::enable_if_t::value, int>& = 0) + const std::enable_if_t, int>& = 0) : colidx_(&colidx_in(idx)), stride_(stride), length(count) {} /// \brief Number of entries in the row. diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index c3a8b67df8..4f47051a5c 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -34,7 +34,7 @@ #include #include -#include +#include #include @@ -746,7 +746,7 @@ class UnorderedMap { /// 'const value_type' via Cuda texture fetch must return by value. 
template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_void::value, // !is_set + !std::is_void_v, // !is_set std::conditional_t> value_at(size_type i) const { KOKKOS_EXPECTS(i < capacity()); @@ -808,8 +808,8 @@ class UnorderedMap { // Re-allocate the views of the calling UnorderedMap according to src // capacity, and deep copy the src data. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> create_copy_view( UnorderedMap const &src) { if (m_hash_lists.data() != src.m_hash_lists.data()) { @@ -821,8 +821,8 @@ class UnorderedMap { // Allocate views of the calling UnorderedMap with the same capacity as the // src. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> allocate_view( UnorderedMap const &src) { insertable_map_type tmp; @@ -852,8 +852,8 @@ class UnorderedMap { // Deep copy view data from src. This requires that the src capacity is // identical to the capacity of the calling UnorderedMap. 
template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> deep_copy_view( UnorderedMap const &src) { #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp index 88109fb0ba..83ccfbf630 100644 --- a/lib/kokkos/containers/src/Kokkos_Vector.hpp +++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp @@ -172,9 +172,8 @@ class KOKKOS_DEPRECATED vector private: template - struct impl_is_input_iterator - : /* TODO replace this */ std::bool_constant< - !std::is_convertible::value> {}; + struct impl_is_input_iterator : /* TODO replace this */ std::bool_constant< + !std::is_convertible_v> {}; public: // TODO: can use detection idiom to generate better error message here later diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt index e69e46bb6a..6255a86c46 100644 --- a/lib/kokkos/containers/unit_tests/CMakeLists.txt +++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt @@ -1,8 +1,7 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) string(TOUPPER ${Tag} DEVICE) @@ -12,57 +11,49 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) set(UnitTestSources UnitTestMain.cpp) set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir}) file(MAKE_DIRECTORY ${dir}) - foreach(Name - 
Bitset - DualView - DynamicView - DynViewAPI_generic - DynViewAPI_rank12345 - DynViewAPI_rank67 - ErrorReporter - OffsetView - ScatterView - StaticCrsGraph - WithoutInitializing - UnorderedMap - Vector - ViewCtorPropEmbeddedDim - ) + foreach( + Name + Bitset + DualView + DynamicView + DynViewAPI_generic + DynViewAPI_rank12345 + DynViewAPI_rank67 + DynRankView_TeamScratch + ErrorReporter + OffsetView + ScatterView + StaticCrsGraph + WithoutInitializing + UnorderedMap + Vector + ViewCtorPropEmbeddedDim + ) if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 AND Name STREQUAL "Vector") continue() # skip Kokkos::vector test if deprecated code 4 is not enabled endif() # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. set(file ${dir}/Test${Tag}_${Name}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include \n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include \n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND UnitTestSources ${file}) endforeach() #fatal error C1128: number of sections exceeded object file format limit: compile with /bigobj if(KOKKOS_ENABLE_CUDA AND WIN32) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) endif() # FIXME_NVHPC: NVC++-S-0000-Internal compiler error. 
extractor: bad opc 0 if(KOKKOS_ENABLE_CUDA AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) endif() - KOKKOS_ADD_EXECUTABLE_AND_TEST(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) + kokkos_add_executable_and_test(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() -SET(COMPILE_ONLY_SOURCES - TestCreateMirror.cpp - TestDualViewParameterPack.cpp - TestIsViewTrait.cpp -) -KOKKOS_ADD_EXECUTABLE( - ContainersTestCompileOnly - SOURCES - TestCompileMain.cpp - ${COMPILE_ONLY_SOURCES} +set(COMPILE_ONLY_SOURCES TestCreateMirror.cpp TestDualViewParameterPack.cpp TestIsViewTrait.cpp + TestDynRankViewTypedefs.cpp ) +kokkos_add_executable(ContainersTestCompileOnly SOURCES TestCompileMain.cpp ${COMPILE_ONLY_SOURCES}) diff --git a/lib/kokkos/containers/unit_tests/TestBitset.hpp b/lib/kokkos/containers/unit_tests/TestBitset.hpp index 9923453f72..91dc1710e5 100644 --- a/lib/kokkos/containers/unit_tests/TestBitset.hpp +++ b/lib/kokkos/containers/unit_tests/TestBitset.hpp @@ -39,7 +39,7 @@ struct TestBitset { TestBitset(bitset_type const& bitset) : m_bitset(bitset) {} - unsigned testit(unsigned collisions) { + unsigned testit(unsigned long long collisions) { execution_space().fence(); unsigned count = 0; diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp index 2512cb5c49..5d03e6202a 100644 --- a/lib/kokkos/containers/unit_tests/TestDualView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -71,7 +71,7 @@ struct test_dualview_copy_construction_and_assignment { using SrcViewType = Kokkos::DualView; using DstViewType = - Kokkos::DualView; + Kokkos::DualView; SrcViewType a("A", n, m); @@ -520,58 +520,26 @@ namespace { * that we keep the semantics of UVM DualViews intact. 
*/ // modify if we have other UVM enabled backends -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) // OR other UVM builds -#define UVM_ENABLED_BUILD -#endif -#ifdef UVM_ENABLED_BUILD -template -struct UVMSpaceFor; -#endif - -#ifdef KOKKOS_ENABLE_CUDA // specific to CUDA -template <> -struct UVMSpaceFor { - using type = Kokkos::CudaUVMSpace; -}; -#endif - -#ifdef KOKKOS_ENABLE_SYCL // specific to SYCL -template <> -struct UVMSpaceFor { - using type = Kokkos::Experimental::SYCLSharedUSMSpace; -}; -#endif - -#ifdef KOKKOS_ENABLE_HIP // specific to HIP -template <> -struct UVMSpaceFor { - using type = Kokkos::HIPManagedSpace; -}; -#endif - -#ifdef UVM_ENABLED_BUILD -template <> -struct UVMSpaceFor { - using type = typename UVMSpaceFor::type; -}; +#ifdef KOKKOS_HAS_SHARED_SPACE +template +using TestSharedSpace = Kokkos::SharedSpace; #else -template -struct UVMSpaceFor { - using type = typename ExecSpace::memory_space; -}; +template +using TestSharedSpace = typename ExecutionSpace::memory_space; #endif using ExecSpace = Kokkos::DefaultExecutionSpace; -using MemSpace = typename UVMSpaceFor::type; +using MemSpace = TestSharedSpace; using DeviceType = Kokkos::Device; using DualViewType = Kokkos::DualView; -using d_device = DeviceType; -using h_device = Kokkos::Device< - Kokkos::DefaultHostExecutionSpace, - typename UVMSpaceFor::type>; +using ConstDualViewType = + Kokkos::DualView; +using d_device = DeviceType; +using h_device = + Kokkos::Device>; TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) { DualViewType dv("myView", 100); @@ -635,14 +603,69 @@ TEST(TEST_CATEGORY, dualview_template_views_return_correct_executionspace_views) { DualViewType dv("myView", 100); dv.clear_sync_state(); - using hvt = decltype(dv.view()); - using dvt = decltype(dv.view()); + using hvt = decltype(dv.view()); + using dvt = decltype(dv.view()); ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(), 
dvt::device_type::execution_space::name()); ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(), hvt::device_type::execution_space::name()); } +TEST(TEST_CATEGORY, + dualview_template_views_return_correct_views_from_const_dual_view) { + DualViewType dv("myView", 100); + ConstDualViewType const_dv = dv; + dv.clear_sync_state(); + ASSERT_EQ(dv.view(), + const_dv.view()); + ASSERT_EQ(dv.view(), + const_dv.view()); +} + +// User-defined types with a View data member, only host-constructible +template +class S { + V v_; + + public: + template + S(std::string label, Extents... extents) : v_(std::move(label), extents...) {} + S() : v_("v", 10) {} +}; + +template +auto initialize_view_of_views() { + Kokkos::DualView dv_v( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 3u); + + V v("v", 2); + V w("w", 2); + dv_v.h_view(0) = v; + dv_v.h_view(1) = w; + + dv_v.modify_host(); + dv_v.sync_device(); + + return dv_v; +} + +TEST(TEST_CATEGORY, dualview_sequential_host_init) { + auto dv_v = initialize_view_of_views>(); + dv_v.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv_v.d_view.size(), 2u); + ASSERT_EQ(dv_v.h_view.size(), 2u); + + initialize_view_of_views>>(); + + Kokkos::DualView dv( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 1u); + dv.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv.d_view.size(), 2u); + ASSERT_EQ(dv.h_view.size(), 2u); + dv.realloc(Kokkos::view_alloc(Kokkos::SequentialHostInit), 3u); + ASSERT_EQ(dv.d_view.size(), 3u); + ASSERT_EQ(dv.h_view.size(), 3u); +} } // anonymous namespace } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp b/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp new file mode 100644 index 0000000000..95117a22e6 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp @@ -0,0 +1,260 @@ +//@HEADER +// ************************************************************************ +// +// 
Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +// clang-format off +template +struct data_analysis { + using data_type = DataType; + using const_data_type = const DataType; + using runtime_data_type = DataType; + using runtime_const_data_type = const DataType; + using non_const_data_type = std::remove_const_t; +}; + +template +struct data_analysis { + using data_type = typename data_analysis::data_type*; + using const_data_type = typename data_analysis::const_data_type*; + using runtime_data_type = typename data_analysis::runtime_data_type*; + using runtime_const_data_type = typename data_analysis::runtime_const_data_type*; + using non_const_data_type = typename data_analysis::non_const_data_type*; +}; + +template +struct data_analysis { + using data_type = typename data_analysis::data_type[N]; + using const_data_type = typename data_analysis::const_data_type[N]; + using runtime_data_type = typename data_analysis::runtime_data_type*; + using runtime_const_data_type = typename data_analysis::runtime_const_data_type*; + using non_const_data_type = typename data_analysis::non_const_data_type[N]; +}; + +template +constexpr bool test_view_typedefs_impl() { + // ======================== + // inherited from ViewTraits + // ======================== + static_assert(std::is_same_v); + static_assert(std::is_same_v::const_data_type>); + static_assert(std::is_same_v::non_const_data_type>); + + // FIXME: these should be deprecated and for proper testing (I.e. 
where this is different from data_type) + // we would need ensemble types which use the hidden View dimension facility of View (i.e. which make + // "specialize" not void) + static_assert(std::is_same_v); + static_assert(std::is_same_v::const_data_type>); + static_assert(std::is_same_v::non_const_data_type>); + static_assert(std::is_same_v); + + // FIXME: value_type definition conflicts with mdspan value_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + + // FIXME: should maybe be deprecated + static_assert(std::is_same_v); + + // FIXME: should be deprecated and is some complicated impl type + // static_assert(!std::is_void_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // FIXME: should be deprecated in favor of reference + static_assert(std::is_same_v); + // FIXME: should be deprecated in favor of data_handle_type + static_assert(std::is_same_v); + + // ========================================= + // in Legacy View: some helper View variants + // ========================================= + + // FIXME: in contrast to View, hooks_policy is not propagated + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + +/* FIXME: these don't exist in DynRankView, should they? + using uniform_layout_type = std::conditional_t), + Kokkos::LayoutLeft, Layout>; + + // Uhm uniformtype removes all memorytraits? 
+ static_assert(std::is_same_v>>); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v::runtime_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v::runtime_const_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + + using anonymous_device_type = Kokkos::Device; + static_assert(std::is_same_v>>); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v::runtime_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v::runtime_const_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); +*/ + + // ================================== + // mdspan compatibility + // ================================== + + // FIXME: This typedef caused some weird issue with MSVC+NVCC + // static_assert(std::is_same_v); + // FIXME: Not supported yet + // static_assert(std::is_same_v); + // static_assert(std::is_same_v); + // static_assert(std::is_same_v); + + static_assert(std::is_same_v); + // FIXME: should be remove_const_t + static_assert(std::is_same_v); + // FIXME: should be extents_type::index_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // FIXME: should come from accessor_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + return true; +} + +// Helper function to unpack data type and other args from the View, and pass them on +template +struct ViewParams {}; + +template +constexpr bool test_view_typedefs(ViewParams) { + return test_view_typedefs_impl, Kokkos::ViewTraits, + T, L, S, M, HostMirrorSpace, ValueType, ReferenceType>(); +} + + +constexpr bool is_host_exec = std::is_same_v; + +#if defined(KOKKOS_ENABLE_CUDA_UVM) || defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) || defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +constexpr bool has_unified_mem_space = true; +#else +constexpr bool has_unified_mem_space = 
false; +#endif + +// The test take explicit template arguments for: LayoutType, Space, MemoryTraits, HostMirrorSpace, ValueType, ReferenceType +// The ViewParams is just a type pack for the View template arguments + +// Kokkos::View +namespace TestInt { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t>>; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View +namespace TestIntDefaultExecutionSpace { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, it is HostSpace (note difference from View ...) + using host_mirror_space = std::conditional_t>; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View +namespace TestFloatPPHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::HostSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View> +namespace TestFloatPPDeviceDefaultHostExecHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::Device; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs( + ViewParams>{})); +} + +// Kokkos::View> +namespace TestIntAtomic { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t>>; + 
static_assert(test_view_typedefs>>>( + ViewParams>{})); +} +// clang-format on +} // namespace diff --git a/lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp b/lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp new file mode 100644 index 0000000000..e5f8860de7 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp @@ -0,0 +1,72 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +void test_dyn_rank_view_team_scratch() { + using execution_space = TEST_EXECSPACE; + using memory_space = execution_space::scratch_memory_space; + using drv_type = Kokkos::DynRankView; + using policy_type = Kokkos::TeamPolicy; + using team_type = policy_type::member_type; + + int N0 = 10, N1 = 4, N2 = 3; + size_t shmem_size = drv_type::shmem_size(N0, N1, N2); + ASSERT_GE(shmem_size, N0 * N1 * N2 * sizeof(int)); + + Kokkos::View> + errors("errors"); + auto policy = policy_type(1, Kokkos::AUTO) + .set_scratch_size(0, Kokkos::PerTeam(shmem_size)); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(const team_type& team) { + drv_type scr(team.team_scratch(0), N0, N1, N2); + // Control that the code ran at all + if (scr.rank() != 3) errors() |= 1u; + if (scr.extent_int(0) != N0) errors() |= 2u; + if (scr.extent_int(1) != N1) errors() |= 4u; + if (scr.extent_int(2) != N2) errors() |= 8u; + Kokkos::parallel_for( + Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { scr(i, j, k) = i * 100 + j * 10 + 
k; }); + team.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { + if (scr(i, j, k) != i * 100 + j * 10 + k) + errors() |= 16u; + }); + errors() |= 256u; + }); + unsigned h_errors = 0; + Kokkos::deep_copy(h_errors, errors); + + ASSERT_EQ((h_errors & 1u), 0u) << "Rank mismatch"; + ASSERT_EQ((h_errors & 2u), 0u) << "extent 0 mismatch"; + ASSERT_EQ((h_errors & 4u), 0u) << "extent 1 mismatch"; + ASSERT_EQ((h_errors & 8u), 0u) << "extent 2 mismatch"; + ASSERT_EQ((h_errors & 16u), 0u) << "data access incorrect"; + ASSERT_EQ(h_errors, 256u); +} + +TEST(TEST_CATEGORY, dyn_rank_view_team_scratch) { + test_dyn_rank_view_team_scratch(); +} + +} // namespace diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp index 4ecb6cf25c..930c76c32c 100644 --- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -792,9 +792,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -817,9 +816,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -846,9 +844,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -879,8 +876,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 
1 : 0; int is_same_memspace = - std::is_same::value + std::is_same_v ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -915,8 +911,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value + std::is_same_v ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -943,8 +938,6 @@ class TestDynViewAPI { dView0 d("d"); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - // Rank 0 Kokkos::resize(d); @@ -1121,8 +1114,6 @@ class TestDynViewAPI { Kokkos::deep_copy(error_flag_host, error_flag); ASSERT_EQ(error_flag_host(), 0); #endif // MDRangePolict Rank < 7 - -#endif // defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) } static void run_test_scalar() { diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index c8f8fed3b8..94ccea86eb 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -71,7 +71,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -85,7 +84,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 
4x larger than previous size // the first 1/4 should remain the same @@ -93,7 +91,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -108,7 +105,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -123,7 +119,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -137,7 +132,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 4x larger than previous size // the first 1/4 should remain the same @@ -145,7 +139,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -160,7 +153,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -175,7 +167,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -189,14 +180,12 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // remove the final 3/4 entries i.e. 
first 1/4 remain unsigned da_resize = arg_total_size / 8; da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -210,7 +199,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Reproducer to demonstrate compile-time error of deep_copy @@ -229,7 +217,6 @@ struct TestDynamicView { device_dynamic_view.resize_serial(da_size); // Use parallel_for to populate device_dynamic_view and verify values -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(i); }); @@ -243,7 +230,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // Use an on-device View as intermediate to deep_copy the // device_dynamic_view to host, zero out the device_dynamic_view, @@ -251,13 +237,11 @@ struct TestDynamicView { Kokkos::deep_copy(device_view, device_dynamic_view); Kokkos::deep_copy(host_view, device_view); Kokkos::deep_copy(device_view, host_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(0); }); -#endif Kokkos::deep_copy(device_dynamic_view, device_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + value_type new_result_sum = 0.0; Kokkos::parallel_reduce( Kokkos::RangePolicy(0, da_size), @@ -267,21 +251,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif - - // Try to deep_copy device_dynamic_view directly to/from host. - // host-to-device currently fails to compile because DP and SP are - // swapped in the deep_copy implementation. 
- // Once that's fixed, both deep_copy's will fail at runtime because the - // destination execution space cannot access the source memory space. - // Check if the memory spaces are different before testing the deep_copy. - if (!Kokkos::SpaceAccessibility::accessible) { - ASSERT_THROW(Kokkos::deep_copy(host_view, device_dynamic_view), - std::runtime_error); - ASSERT_THROW(Kokkos::deep_copy(device_dynamic_view, host_view), - std::runtime_error); - } } } }; diff --git a/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp b/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp index 0003a29468..4ebab889c7 100644 --- a/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp +++ b/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp @@ -149,7 +149,6 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase { } }; -#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) template struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase { @@ -178,7 +177,6 @@ struct ErrorReporterDriverUseLambda driver_base::check_expectations(reporter_capacity, test_size); } }; -#endif #ifdef KOKKOS_ENABLE_OPENMP struct ErrorReporterDriverNativeOpenMP @@ -205,8 +203,7 @@ struct ErrorReporterDriverNativeOpenMP // FIXME_MSVC MSVC just gets confused when using the base class in the // KOKKOS_CLASS_LAMBDA -#if !defined(KOKKOS_COMPILER_MSVC) && \ - (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)) +#ifndef KOKKOS_COMPILER_MSVC TEST(TEST_CATEGORY, ErrorReporterViaLambda) { TestErrorReporter>(); } diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp index c133922e3d..706b40fff3 100644 --- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -56,7 +56,18 @@ void test_offsetview_construction() { offset_view_type ov("firstOV", range0, range1); ASSERT_EQ("firstOV", ov.label()); - ASSERT_EQ(2, ov.Rank); + +#ifdef 
KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + ASSERT_EQ(2u, ov.Rank); +#endif +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + + ASSERT_EQ(2u, ov.rank()); ASSERT_EQ(ov.begin(0), -1); ASSERT_EQ(ov.end(0), 4); @@ -67,7 +78,6 @@ void test_offsetview_construction() { ASSERT_EQ(ov.extent(0), 5u); ASSERT_EQ(ov.extent(1), 5u); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) { Kokkos::Experimental::OffsetView offsetV1("OneDOffsetView", range0); @@ -149,7 +159,6 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; -#endif { offset_view_type ovCopy(ov); @@ -184,7 +193,6 @@ void test_offsetview_construction() { range3_type rangePolicy3DZero(point3_type{{0, 0, 0}}, point3_type{{extent0, extent1, extent2}}); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -207,7 +215,6 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; -#endif } view_type viewFromOV = ov.view(); @@ -232,7 +239,6 @@ void test_offsetview_construction() { view_type aView("aView", ov.extent(0), ov.extent(1)); Kokkos::deep_copy(aView, ov); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -242,7 +248,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; -#endif } { // test view to offsetview deep copy @@ -251,7 +256,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, 99); Kokkos::deep_copy(ov, aView); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -261,7 +265,6 @@ void 
test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; -#endif } } @@ -329,46 +332,131 @@ void test_offsetview_unmanaged_construction() { ASSERT_EQ(bb, ib); ASSERT_EQ(bb, ii); } +} + +template +void test_offsetview_unmanaged_construction_death() { + // Preallocated memory (Only need a valid address for this test) + Scalar s; + + // Regular expression syntax on Windows is a pain. `.` does not match `\n`. + // Feel free to make it work if you have time to spare. +#ifdef _WIN32 +#define SKIP_REGEX_ON_WINDOWS(REGEX) "" +#else +#define SKIP_REGEX_ON_WINDOWS(REGEX) REGEX +#endif { using offset_view_type = Kokkos::Experimental::OffsetView; // Range calculations must be positive - ASSERT_NO_THROW(offset_view_type(&s, {0}, {1})); - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0})); - ASSERT_THROW(offset_view_type(&s, {0}, {-1}), std::runtime_error); + (void)offset_view_type(&s, {0}, {1}); + (void)offset_view_type(&s, {0}, {0}); + ASSERT_DEATH( + offset_view_type(&s, {0}, {-1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(-1\\) - begins\\[0\\] \\(0\\)\\) must be " + "non-negative")); } { using offset_view_type = Kokkos::Experimental::OffsetView; // Range calculations must not overflow - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0x7fffffffffffffffl})); - ASSERT_THROW(offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW( + (void)offset_view_type(&s, {0}, {0x7fffffffffffffffl}); + ASSERT_DEATH( + offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-1\\)\\) " + "overflows")); + ASSERT_DEATH( offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), - 
std::runtime_error); + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); + ASSERT_DEATH( + offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(0\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); } { using offset_view_type = Kokkos::Experimental::OffsetView; - // Should throw when the rank of begins and/or ends doesn't match that of - // OffsetView - ASSERT_THROW(offset_view_type(&s, {0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1}), std::runtime_error); - ASSERT_NO_THROW(offset_view_type(&s, {0, 0}, {1, 1})); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), - std::runtime_error); + // Should throw when the rank of begins and/or ends doesn't match that + // of OffsetView + ASSERT_DEATH( + offset_view_type(&s, {0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for 
unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + (void)offset_view_type(&s, {0, 0}, {1, 1}); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); } +#undef SKIP_REGEX_ON_WINDOWS } template @@ -377,8 +465,8 @@ void test_offsetview_subview() { Kokkos::Experimental::OffsetView sliceMe("offsetToSlice", {-10, 20}); { - auto offsetSubviewa = Kokkos::Experimental::subview(sliceMe, 0); - ASSERT_EQ(offsetSubviewa.Rank, 0) << "subview of offset is broken."; + auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0); + ASSERT_EQ(offsetSubview.rank(), 0u) << "subview of offset is broken."; } } { // test subview 2 @@ -387,13 +475,13 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), -2); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is 
broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -406,30 +494,29 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::make_pair(-30, -21)); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; ASSERT_EQ(offsetSubview.begin(0), -20); ASSERT_EQ(offsetSubview.end(0), 31); ASSERT_EQ(offsetSubview.begin(1), 0); ASSERT_EQ(offsetSubview.end(1), 9); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -455,25 +542,24 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); -#endif } // slice 2 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 
1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -486,73 +572,72 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } // slice 2 auto offsetSubview2a = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview2a.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2a.rank(), 2u) << "subview of offset is broken."; { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of 
offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } // slice 3 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION T std_accumulate(InputIt first, InputIt last, T init, BinaryOperation op) { @@ -586,6 +671,7 @@ void 
test_offsetview_offsets_rank1() { KOKKOS_LAMBDA(const int ii, int& lerrors) { offset_view_type ov(v, {ii}); lerrors += (ov(3) != element({3 - ii})); + lerrors += (ov[3] != element({3 - ii})); }, errors); @@ -655,7 +741,6 @@ void test_offsetview_offsets_rank3() { ASSERT_EQ(0, errors); } -#endif TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); @@ -665,11 +750,15 @@ TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction(); } +TEST(TEST_CATEGORY_DEATH, offsetview_unmanaged_construction) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + test_offsetview_unmanaged_construction_death(); +} + TEST(TEST_CATEGORY, offsetview_subview) { test_offsetview_subview(); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) TEST(TEST_CATEGORY, offsetview_offsets_rank1) { test_offsetview_offsets_rank1(); } @@ -681,7 +770,6 @@ TEST(TEST_CATEGORY, offsetview_offsets_rank2) { TEST(TEST_CATEGORY, offsetview_offsets_rank3) { test_offsetview_offsets_rank3(); } -#endif } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp index 733f43122c..72c1afbe96 100644 --- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -33,11 +33,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -134,11 +134,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -235,11 +235,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - 
Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -335,11 +335,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -714,7 +714,7 @@ void test_scatter_view(int64_t n) { test_sv_config.run_test(n); } #ifdef KOKKOS_ENABLE_SERIAL - if (!std::is_same::value) { + if (!std::is_same_v) { #endif test_scatter_view_config::value)); - ASSERT_TRUE((std::is_same::value)); - ASSERT_TRUE((std::is_same::value)); - ASSERT_TRUE((std::is_same::value)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); } } /* namespace TestStaticCrsGraph */ diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp index 4a7e826ecb..fc7435a75e 100644 --- a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -460,7 +460,7 @@ struct UnorderedMapInsert { //! Insert multiple values. template - void insert(Args &&... 
args) const { + void insert(Args &&...args) const { static_assert(sizeof...(Args) > 1, "Prefer the single value version"); constexpr size_t size = sizeof...(Args); Kokkos::Array values{ @@ -534,8 +534,6 @@ TEST(TEST_CATEGORY, UnorderedMap_shallow_copyable_on_device) { ASSERT_EQ(1u, test_map_copy.m_map.size()); } -#if !defined(KOKKOS_ENABLE_CUDA) || \ - (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_LAMBDA)) void test_unordered_map_device_capture() { TestMapCopy::map_type map; @@ -549,7 +547,6 @@ void test_unordered_map_device_capture() { TEST(TEST_CATEGORY, UnorderedMap_lambda_capturable) { test_unordered_map_device_capture(); } -#endif /** * @test This test ensures that an @ref UnorderedMap can be built diff --git a/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp index 0246f11ddf..2edddcce34 100644 --- a/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp +++ b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp @@ -48,7 +48,7 @@ struct TestViewCtorProp_EmbeddedDim { void operator()(const int i) const { v(i) = i; } }; - static void test_vcpt(const int N0, const int N1) { + static void test_vcpt(const size_t N0, const size_t N1) { // Create two views to test { using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType; @@ -78,16 +78,16 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); #if 0 // debug output - for ( int i = 0; i < N0*N1; ++i ) { - printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + for ( size_t i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%zu) = %lf\n ", i, hcv1(i) ); } printf( " Common value type view: %s \n", typeid( CVT() ).name() ); printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); - if ( std::is_same< CommonViewValueType, double 
>::value == true ) { + if ( std::is_same_v< CommonViewValueType, double > == true ) { printf("Proper common value_type\n"); } else { @@ -115,7 +115,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } } @@ -148,7 +148,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } { @@ -169,7 +169,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } } diff --git a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp index e8558628dc..2932898554 100644 --- a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp +++ b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp @@ -44,6 +44,12 @@ Kokkos::CudaSpace>) \ GTEST_SKIP() << "skipping since unified memory requires additional " \ "fences"; +#elif defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \ + if constexpr (std::is_same_v) \ + GTEST_SKIP() << "skipping since unified memory requires additional " \ + "fences"; #else #define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE #endif @@ -51,8 +57,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); - Kokkos::DualView bla("bla", 5, 6, 7, - 8); + Kokkos::DualView bla("bla", 5, 6, 7, 8); auto success = validate_absence( [&]() { @@ -82,8 +87,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); - 
Kokkos::DualView bla("bla", 8, 7, 6, - 5); + Kokkos::DualView bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -112,8 +116,7 @@ TEST(TEST_CATEGORY, resize_exec_space_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); - Kokkos::DualView bla("bla", 8, 7, 6, - 5); + Kokkos::DualView bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -245,7 +248,7 @@ TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif @@ -280,7 +283,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_scatterview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 4, 5, 6, 7); auto success = validate_absence( @@ -312,7 +315,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( @@ -343,7 +346,7 @@ TEST(TEST_CATEGORY, resize_exec_space_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( 
@@ -384,13 +387,12 @@ TEST(TEST_CATEGORY, realloc_exec_space_scatterview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif #if defined(KOKKOS_ENABLE_HPX) && \ !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the HPX backend always fences with async " "dispatch disabled"; #endif diff --git a/lib/kokkos/core/CMakeLists.txt b/lib/kokkos/core/CMakeLists.txt index 0917928001..21f05f6272 100644 --- a/lib/kokkos/core/CMakeLists.txt +++ b/lib/kokkos/core/CMakeLists.txt @@ -1,22 +1,14 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() -FUNCTION(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) - IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() - ENDIF() +function(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) + if(NOT Kokkos_ENABLE_BENCHMARKS) + return() + endif() - IF(KOKKOS_HAS_TRILINOS) - message( - STATUS - "Benchmarks are not supported when building as part of Trilinos" - ) - RETURN() - ENDIF() + add_subdirectory(${DIR_NAME}) +endfunction() - ADD_SUBDIRECTORY(${DIR_NAME}) -ENDFUNCTION() - -KOKKOS_ADD_TEST_DIRECTORIES(unit_test) -KOKKOS_ADD_BENCHMARK_DIRECTORY(perf_test) +kokkos_add_test_directories(unit_test) +kokkos_add_benchmark_directory(perf_test) diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index e0dba03e1e..0cb2c804d3 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -1,50 +1,36 @@ # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. 
# FIXME_OPENACC - temporarily disabled due to unimplemented features -IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - RETURN() -ENDIF() -IF (KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - RETURN() -ENDIF() +if((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + return() +endif() +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + return() +endif() # all PerformanceTest_* executables are part of regular tests # TODO: finish converting these into benchmarks (in progress) -IF(KOKKOS_ENABLE_TESTS) - IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) - KOKKOS_ADD_EXECUTABLE ( - PerformanceTest_SharedSpace - SOURCES test_sharedSpace.cpp - ) - ENDIF() +if(KOKKOS_ENABLE_TESTS) + if(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) + kokkos_add_executable(PerformanceTest_SharedSpace SOURCES test_sharedSpace.cpp) + endif() - KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - IF(NOT Kokkos_ENABLE_OPENMPTARGET) - # FIXME OPENMPTARGET needs tasking - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_TaskDag - SOURCES test_taskdag.cpp - CATEGORIES PERFORMANCE - ) - ENDIF() -ENDIF() + kokkos_add_executable_and_test(PerformanceTest_TaskDag SOURCES test_taskdag.cpp CATEGORIES PERFORMANCE) +endif() -IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() -ENDIF() - -IF (KOKKOS_HAS_TRILINOS) - message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") -ENDIF() +if(NOT Kokkos_ENABLE_BENCHMARKS) + return() +endif() # Find or download google/benchmark library find_package(benchmark QUIET 1.5.6) -IF(benchmark_FOUND) - MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") -ELSE() +if(benchmark_FOUND) + message(STATUS "Using google benchmark found in 
${benchmark_DIR}") +else() message(STATUS "No installed google benchmark found, fetching from GitHub") include(FetchContent) - SET(BENCHMARK_ENABLE_TESTING OFF) + set(BENCHMARK_ENABLE_TESTING OFF) list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ") FetchContent_Declare( @@ -57,143 +43,93 @@ ELSE() list(POP_BACK CMAKE_MESSAGE_INDENT) # Suppress clang-tidy diagnostics on code that we do not have control over - IF(CMAKE_CXX_CLANG_TIDY) - SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "") - ENDIF() + if(CMAKE_CXX_CLANG_TIDY) + set_target_properties(benchmark PROPERTIES CXX_CLANG_TIDY "") + endif() target_compile_options(benchmark PRIVATE -w) target_compile_options(benchmark_main PRIVATE -w) -ENDIF() +endif() +function(KOKKOS_ADD_BENCHMARK NAME) + cmake_parse_arguments(BENCHMARK "" "" "SOURCES" ${ARGN}) + if(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) + message(WARNING "Unexpected arguments when adding a benchmark: " ${BENCHMARK_UNPARSED_ARGUMENTS}) + endif() -FUNCTION(KOKKOS_ADD_BENCHMARK NAME) - CMAKE_PARSE_ARGUMENTS( - BENCHMARK - "" - "" - "SOURCES" - ${ARGN} - ) - IF(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) - MESSAGE( - WARNING - "Unexpected arguments when adding a benchmark: " - ${BENCHMARK_UNPARSED_ARGUMENTS} - ) - ENDIF() + set(BENCHMARK_NAME Kokkos_${NAME}) + list(APPEND BENCHMARK_SOURCES BenchmarkMain.cpp Benchmark_Context.cpp) - SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) - LIST(APPEND BENCHMARK_SOURCES - BenchmarkMain.cpp - Benchmark_Context.cpp - ) + add_executable(${BENCHMARK_NAME} ${BENCHMARK_SOURCES}) + target_link_libraries(${BENCHMARK_NAME} PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version) + target_include_directories(${BENCHMARK_NAME} SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include) - ADD_EXECUTABLE( - ${BENCHMARK_NAME} - ${BENCHMARK_SOURCES} - ) - TARGET_LINK_LIBRARIES( - ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version - ) - TARGET_INCLUDE_DIRECTORIES( - ${BENCHMARK_NAME} - SYSTEM PRIVATE 
${benchmark_SOURCE_DIR}/include - ) + foreach(SOURCE_FILE ${BENCHMARK_SOURCES}) + set_source_files_properties(${SOURCE_FILE} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + endforeach() - FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) - SET_SOURCE_FILES_PROPERTIES( - ${SOURCE_FILE} - PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE} - ) - ENDFOREACH() + string(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) + set(BENCHMARK_ARGS --benchmark_counters_tabular=true --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json) - STRING(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) - SET( - BENCHMARK_ARGS - --benchmark_counters_tabular=true - --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json - ) + add_test(NAME ${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS}) +endfunction() - ADD_TEST( - NAME ${BENCHMARK_NAME} - COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS} - ) -ENDFUNCTION() - -SET( - BENCHMARK_SOURCES - PerfTestGramSchmidt.cpp - PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - PerfTestHexGrad.cpp - PerfTest_MallocFree.cpp - PerfTest_ViewAllocate.cpp - PerfTest_ViewCopy_a123.cpp - PerfTest_ViewCopy_b123.cpp - PerfTest_ViewCopy_c123.cpp - PerfTest_ViewCopy_d123.cpp - PerfTest_ViewCopy_a45.cpp - PerfTest_ViewCopy_b45.cpp - PerfTest_ViewCopy_c45.cpp - PerfTest_ViewCopy_d45.cpp - PerfTest_ViewCopy_a6.cpp - PerfTest_ViewCopy_b6.cpp - PerfTest_ViewCopy_c6.cpp - PerfTest_ViewCopy_d6.cpp - PerfTest_ViewCopy_a7.cpp - PerfTest_ViewCopy_b7.cpp - PerfTest_ViewCopy_c7.cpp - PerfTest_ViewCopy_d7.cpp - PerfTest_ViewCopy_a8.cpp - PerfTest_ViewCopy_b8.cpp - PerfTest_ViewCopy_c8.cpp - PerfTest_ViewCopy_d8.cpp - PerfTest_ViewCopy_Raw.cpp - PerfTest_ViewFill_123.cpp - PerfTest_ViewFill_45.cpp - PerfTest_ViewFill_6.cpp - PerfTest_ViewFill_7.cpp - PerfTest_ViewFill_8.cpp - PerfTest_ViewFill_Raw.cpp - PerfTest_ViewResize_123.cpp - PerfTest_ViewResize_45.cpp - PerfTest_ViewResize_6.cpp - PerfTest_ViewResize_7.cpp - PerfTest_ViewResize_8.cpp - 
PerfTest_ViewResize_Raw.cpp -) - -IF(Kokkos_ENABLE_OPENMPTARGET) -# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction - LIST(REMOVE_ITEM BENCHMARK_SOURCES +set(BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp - ) -ENDIF() - -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Benchmark - SOURCES ${BENCHMARK_SOURCES} + PerfTestHexGrad.cpp + PerfTest_MallocFree.cpp + PerfTest_ViewAllocate.cpp + PerfTest_ViewCopy_a123.cpp + PerfTest_ViewCopy_b123.cpp + PerfTest_ViewCopy_c123.cpp + PerfTest_ViewCopy_d123.cpp + PerfTest_ViewCopy_a45.cpp + PerfTest_ViewCopy_b45.cpp + PerfTest_ViewCopy_c45.cpp + PerfTest_ViewCopy_d45.cpp + PerfTest_ViewCopy_a6.cpp + PerfTest_ViewCopy_b6.cpp + PerfTest_ViewCopy_c6.cpp + PerfTest_ViewCopy_d6.cpp + PerfTest_ViewCopy_a7.cpp + PerfTest_ViewCopy_b7.cpp + PerfTest_ViewCopy_c7.cpp + PerfTest_ViewCopy_d7.cpp + PerfTest_ViewCopy_a8.cpp + PerfTest_ViewCopy_b8.cpp + PerfTest_ViewCopy_c8.cpp + PerfTest_ViewCopy_d8.cpp + PerfTest_ViewCopy_Raw.cpp + PerfTest_ViewFill_123.cpp + PerfTest_ViewFill_45.cpp + PerfTest_ViewFill_6.cpp + PerfTest_ViewFill_7.cpp + PerfTest_ViewFill_8.cpp + PerfTest_ViewFill_Raw.cpp + PerfTest_ViewResize_123.cpp + PerfTest_ViewResize_45.cpp + PerfTest_ViewResize_6.cpp + PerfTest_ViewResize_7.cpp + PerfTest_ViewResize_8.cpp + PerfTest_ViewResize_Raw.cpp ) -IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) - KOKKOS_ADD_BENCHMARK( - Benchmark_Atomic_MinMax - SOURCES test_atomic_minmax_simple.cpp +if(Kokkos_ENABLE_OPENMPTARGET) + # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction + list(REMOVE_ITEM BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp ) -ENDIF() +endif() + +kokkos_add_benchmark(PerformanceTest_Benchmark SOURCES ${BENCHMARK_SOURCES}) + +kokkos_add_benchmark(Benchmark_Atomic_MinMax SOURCES test_atomic_minmax_simple.cpp) # FIXME_NVHPC -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL 
NVHPC) - KOKKOS_ADD_BENCHMARK( - PerformanceTest_Mempool - SOURCES test_mempool.cpp - ) -ENDIF() +if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + kokkos_add_benchmark(PerformanceTest_Mempool SOURCES test_mempool.cpp) +endif() -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Atomic - SOURCES test_atomic.cpp -) +kokkos_add_benchmark(PerformanceTest_Atomic SOURCES test_atomic.cpp) diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp index 98cb246c71..1ebe750f21 100644 --- a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp +++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp @@ -34,10 +34,10 @@ struct HexGrad { enum { NSpace = 3, NNode = 8 }; using elem_coord_type = - Kokkos::View; + Kokkos::View; using elem_grad_type = - Kokkos::View; + Kokkos::View; elem_coord_type coords; elem_grad_type grad_op; diff --git a/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp b/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp index 2110f38a91..03340a5d6d 100644 --- a/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp @@ -21,7 +21,6 @@ #include #include -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA namespace Test { template std::pair custom_reduction_test(int N, int R) { @@ -130,4 +129,3 @@ BENCHMARK(CustomReduction) ->UseManualTime(); } // namespace Test -#endif diff --git a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index d2a3d0b823..aa23ddbb60 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -56,8 +56,7 @@ bool is_overlapping(const Kokkos::HIP&) { #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> -bool is_overlapping( - const Kokkos::Experimental::SYCL&) { +bool is_overlapping(const Kokkos::SYCL&) { return true; } #endif diff --git 
a/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp index 67a8d7e555..e4db40e128 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) @@ -38,6 +37,5 @@ BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) ->UseManualTime(); -#endif } // namespace Test diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp index c11074d915..57bba83a9c 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) @@ -28,6 +27,5 @@ BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) ->UseManualTime(); -#endif } // namespace Test diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp index 2d1bcbb3ca..ab469cb647 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewResize_NoInit_Raw) ->ArgName("N") ->Arg(N) @@ -30,6 +29,5 @@ BENCHMARK(ViewResize_NoInit_Raw) ->Arg(N) ->UseManualTime() ->Iterations(R); -#endif } // namespace Test diff --git a/lib/kokkos/core/perf_test/test_mempool.cpp b/lib/kokkos/core/perf_test/test_mempool.cpp index 9905740afb..bdfe59b0b3 100644 --- a/lib/kokkos/core/perf_test/test_mempool.cpp +++ b/lib/kokkos/core/perf_test/test_mempool.cpp @@ -198,7 +198,7 @@ static void Mempool_Fill(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", 
state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, @@ -225,7 +225,7 @@ static void Mempool_Alloc_Dealloc(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, diff --git a/lib/kokkos/core/perf_test/test_sharedSpace.cpp b/lib/kokkos/core/perf_test/test_sharedSpace.cpp index 4f140c9409..3c06770e28 100644 --- a/lib/kokkos/core/perf_test/test_sharedSpace.cpp +++ b/lib/kokkos/core/perf_test/test_sharedSpace.cpp @@ -103,7 +103,7 @@ size_t getDeviceMemorySize() { #elif defined KOKKOS_ENABLE_HIP return Kokkos::HIP{}.hip_device_prop().totalGlobalMem; #elif defined KOKKOS_ENABLE_SYCL - auto device = Kokkos::Experimental::SYCL{}.sycl_queue().get_device(); + auto device = Kokkos::SYCL{}.sycl_queue().get_device(); return device.get_info(); #else #error \ diff --git a/lib/kokkos/core/perf_test/test_taskdag.cpp b/lib/kokkos/core/perf_test/test_taskdag.cpp index fccaab64dd..347d9748b5 100644 --- a/lib/kokkos/core/perf_test/test_taskdag.cpp +++ b/lib/kokkos/core/perf_test/test_taskdag.cpp @@ -32,6 +32,11 @@ int main() { return 0; } #include +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + using ExecSpace = Kokkos::DefaultExecutionSpace; inline long eval_fib(long n) { @@ -223,4 +228,8 @@ int main(int argc, char* argv[]) { return 0; } +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS 
+KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + #endif diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index b84677e61b..72663739a1 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ b/lib/kokkos/core/src/CMakeLists.txt @@ -1,118 +1,125 @@ -KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} - ${KOKKOS_TOP_BUILD_DIR} -) -IF (NOT desul_FOUND) - IF(KOKKOS_ENABLE_CUDA) - SET(DESUL_ATOMICS_ENABLE_CUDA ON) - ENDIF() - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP) - SET(DESUL_ATOMICS_ENABLE_HIP ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_SYCL) - SET(DESUL_ATOMICS_ENABLE_SYCL ON) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) - ENDIF() - ENDIF() - IF(KOKKOS_ENABLE_OPENMPTARGET) - SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP - ENDIF() - IF(KOKKOS_ENABLE_OPENACC) - SET(DESUL_ATOMICS_ENABLE_OPENACC ON) - ENDIF() - CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ${KOKKOS_TOP_BUILD_DIR}) +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + set(DESUL_ATOMICS_ENABLE_CUDA ON) + endif() + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + set(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_HIP) + set(DESUL_ATOMICS_ENABLE_HIP ON) + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + set(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_SYCL) + set(DESUL_ATOMICS_ENABLE_SYCL ON) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND 
NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + endif() + endif() + if(KOKKOS_ENABLE_OPENMPTARGET) + set(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP + endif() + if(KOKKOS_ENABLE_OPENACC) + # FIXME_OPENACC FIXME_CLACC - Below condition will be removed if Clacc can compile atomics. + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + set(DESUL_ATOMICS_ENABLE_OPENACC ON) + endif() + endif() + configure_file( + ${KOKKOS_SOURCE_DIR}/tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp ) - KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include - ) -ENDIF() + kokkos_include_directories(${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/" +install( + DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.hpp" PATTERN "*.h" ) -SET(KOKKOS_CORE_SRCS) -APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CORE_HEADERS) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) +set(KOKKOS_CORE_SRCS) +append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CORE_HEADERS) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) -IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/Kokkos_Cuda_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) +endif() -IF 
(KOKKOS_ENABLE_OPENMP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_OPENMP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/Kokkos_OpenMP_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) +endif() -IF (KOKKOS_ENABLE_OPENMPTARGET) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_OPENMPTARGET) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) +endif() -IF (KOKKOS_ENABLE_OPENACC) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_OPENACC) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) +endif() -IF (KOKKOS_ENABLE_THREADS) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_THREADS) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) +endif() -IF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) + append_glob(KOKKOS_CORE_HEADERS 
${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) +endif() -IF (KOKKOS_ENABLE_HPX) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_HPX) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/Kokkos_HPX_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) +endif() -IF (KOKKOS_ENABLE_SERIAL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_SERIAL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/Kokkos_Serial_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) +endif() -IF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) +endif() -IF (NOT desul_FOUND) - IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_CUDA.cpp) - ELSEIF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_HIP.cpp) - ELSEIF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_SYCL.cpp) - ENDIF() - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS 
${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/*/*/*.inc*) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_CUDA.cpp) + elseif(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_HIP.cpp) + elseif(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_SYCL.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/*/*/*.inc*) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul" - "${CMAKE_CURRENT_BINARY_DIR}/desul" + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul" "${CMAKE_CURRENT_BINARY_DIR}/desul" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.inc" @@ -120,33 +127,26 @@ IF (NOT desul_FOUND) PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal desul_atomics copy") -ELSE() - MESSAGE(STATUS "Using external desul_atomics install found at:") - MESSAGE(STATUS " " ${desul_DIR}) -ENDIF() + message(STATUS "Using internal desul_atomics copy") +else() + message(STATUS "Using external desul_atomics install found at:") + message(STATUS " " ${desul_DIR}) +endif() - -KOKKOS_ADD_LIBRARY( - kokkoscore - SOURCES ${KOKKOS_CORE_SRCS} - HEADERS ${KOKKOS_CORE_HEADERS} +kokkos_add_library( + kokkoscore SOURCES 
${KOKKOS_CORE_SRCS} HEADERS ${KOKKOS_CORE_HEADERS} ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags ) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscore ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -IF (NOT desul_FOUND) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include - ) -ENDIF() +if(NOT desul_FOUND) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -IF (Kokkos_ENABLE_IMPL_MDSPAN) - MESSAGE(STATUS "Experimental mdspan support is enabled") +if(Kokkos_ENABLE_IMPL_MDSPAN) + message(STATUS "Experimental mdspan support is enabled") # Some compilers now include mdspan... we just flag on their version # for now until we can get some compiler detection support @@ -154,62 +154,56 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) check_include_file_cxx(experimental/mdspan KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN) check_include_file_cxx(mdspan KOKKOS_COMPILER_SUPPORTS_MDSPAN) - if (Kokkos_ENABLE_MDSPAN_EXTERNAL) - MESSAGE(STATUS "Using external mdspan") + if(Kokkos_ENABLE_MDSPAN_EXTERNAL) + message(STATUS "Using external mdspan") target_link_libraries(kokkoscore PUBLIC std::mdspan) elseif(KOKKOS_COMPILER_SUPPORTS_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied mdspan") elseif(KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied experimental/mdspan") else() - KOKKOS_LIB_INCLUDE_DIRECTORIES( - kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include - ) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/__p0009_bits/*.hpp) - 
APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/mdspan) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/__p0009_bits/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/mdspan) - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/" + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "mdspan" PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal mdspan directory ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include") + message(STATUS "Using internal mdspan directory ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include") endif() -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) -KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) +kokkos_link_tpl(kokkoscore PUBLIC HWLOC) +kokkos_link_tpl(kokkoscore PUBLIC CUDA) +kokkos_link_tpl(kokkoscore PUBLIC HPX) +kokkos_link_tpl(kokkoscore PUBLIC LIBDL) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread -IF (NOT WIN32) - KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) -ENDIF() +if(NOT WIN32) + kokkos_link_tpl(kokkoscore PUBLIC THREADS) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_link_tpl(kokkoscore PUBLIC ROCM) +endif() # FIXME: We need a proper solution to figure out whether to enable # libatomic # Most compilers only require libatomic for 128-bit CAS # I (CT) had removed 128bit CAS from desul to not need libatomic. 
-IF (KOKKOS_ENABLE_OPENMPTARGET) +if(KOKKOS_ENABLE_OPENMPTARGET) target_link_libraries(kokkoscore PUBLIC atomic) -ENDIF() +endif() -IF (desul_FOUND) +if(desul_FOUND) target_link_libraries(kokkoscore PUBLIC desul_atomics) -ENDIF() +endif() -# FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency so we -# just append the flags in cmake/kokkos_tpls.cmake instead of linking with the -# OpenMP target. -IF(Kokkos_ENABLE_OPENMP AND NOT KOKKOS_HAS_TRILINOS) +if(Kokkos_ENABLE_OPENMP) target_link_libraries(kokkoscore PUBLIC OpenMP::OpenMP_CXX) -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH) +kokkos_link_tpl(kokkoscore PUBLIC LIBQUADMATH) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp index fd86976d3b..07c35e6611 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp @@ -35,7 +35,6 @@ static_assert(false, #include // CUDA_SAFE_CALL #include -#include #include #include #include diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 6ae24022c8..8bcd6525c9 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -201,7 +201,14 @@ void *impl_allocate_common(const int device_id, } } #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - if (arg_alloc_size >= memory_threshold_g) { + // FIXME_KEPLER Everything after Kepler should support cudaMallocAsync + int device_supports_cuda_malloc_async; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaDeviceGetAttribute(&device_supports_cuda_malloc_async, + cudaDevAttrMemoryPoolsSupported, device_id)); + + if (arg_alloc_size >= memory_threshold_g && + device_supports_cuda_malloc_async == 1) { error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); if (error_code == cudaSuccess) { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp 
b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index e1d062d72d..1ccf38a4a1 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -73,9 +73,9 @@ class CudaSpace { CudaSpace(int device_id, cudaStream_t stream); public: - CudaSpace(CudaSpace&& rhs) = default; - CudaSpace(const CudaSpace& rhs) = default; - CudaSpace& operator=(CudaSpace&& rhs) = default; + CudaSpace(CudaSpace&& rhs) = default; + CudaSpace(const CudaSpace& rhs) = default; + CudaSpace& operator=(CudaSpace&& rhs) = default; CudaSpace& operator=(const CudaSpace& rhs) = default; ~CudaSpace() = default; @@ -174,9 +174,9 @@ class CudaUVMSpace { CudaUVMSpace(int device_id, cudaStream_t stream); public: - CudaUVMSpace(CudaUVMSpace&& rhs) = default; - CudaUVMSpace(const CudaUVMSpace& rhs) = default; - CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(const CudaUVMSpace& rhs) = default; + CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; CudaUVMSpace& operator=(const CudaUVMSpace& rhs) = default; ~CudaUVMSpace() = default; @@ -266,9 +266,9 @@ class CudaHostPinnedSpace { CudaHostPinnedSpace(int device_id, cudaStream_t stream); public: - CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; - CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; - CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; + CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace& operator=(const CudaHostPinnedSpace& rhs) = default; ~CudaHostPinnedSpace() = default; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index 5a821ab64a..058b1f538d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -51,7 +51,8 @@ class GraphNodeKernelImpl m_graph_node_ptr = nullptr; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... - mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; + mutable std::shared_ptr m_driver_storage = nullptr; + std::string label; public: using Policy = PolicyType; @@ -61,25 +62,20 @@ class GraphNodeKernelImpl - GraphNodeKernelImpl(std::string, Kokkos::Cuda const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, Cuda const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) // This is super ugly, but it works for now and is the most minimal change // to the codebase for now... - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} // FIXME @graph Forward through the instance once that works in the backends template GraphNodeKernelImpl(Kokkos::Cuda const& ex, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Kokkos::CudaSpace().deallocate(m_driver_storage, sizeof(base_t)); - } - } + : GraphNodeKernelImpl("[unlabeled]", ex, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_cuda_graph_ptr(cudaGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -90,13 +86,21 @@ class GraphNodeKernelImpl allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr allocate_driver_memory_buffer( + const CudaSpace& mem) const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - m_driver_storage = static_cast(Kokkos::CudaSpace().allocate( - "GraphNodeKernel global memory functor storage", sizeof(base_t))); + std::string alloc_label = + label + " - 
GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr( + static_cast(mem.allocate(alloc_label.c_str(), sizeof(base_t))), + [alloc_label, mem](base_t* ptr) { + mem.deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr) - return m_driver_storage; + return m_driver_storage.get(); } + + auto get_driver_storage() const { return m_driver_storage; } }; struct CudaGraphNodeAggregateKernel { @@ -128,7 +132,8 @@ struct get_graph_node_kernel_type // {{{1 template -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const CudaSpace& mem, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type::type; auto const& kernel_as_graph_kernel = @@ -136,7 +141,7 @@ auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to // just always do it) - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(mem); } template diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index 625d8c317a..8e800e756d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -51,7 +51,14 @@ struct GraphImpl { using node_details_t = GraphNodeBackendSpecificDetails; - void _instantiate_graph() { + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. 
+ std::vector> m_driver_storage; + + public: + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; cudaGraphNode_t error_node = nullptr; char error_log[error_log_size]; @@ -60,10 +67,10 @@ struct GraphImpl { ->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph, &error_node, error_log, error_log_size))); + KOKKOS_ENSURES(m_graph_exec); // TODO @graphs print out errors } - public: using root_node_impl_t = GraphNodeImpl; @@ -74,11 +81,11 @@ struct GraphImpl { // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl() { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to @@ -129,6 +136,8 @@ struct GraphImpl { kernel.set_cuda_graph_node_ptr(&cuda_node); kernel.execute(); KOKKOS_ENSURES(bool(cuda_node)); + if (std::shared_ptr tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } template @@ -158,13 +167,13 @@ struct GraphImpl { &cuda_node, 1))); } - void submit() { + void submit(const execution_space& exec) { if (!bool(m_graph_exec)) { - _instantiate_graph(); + instantiate(); } KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() - ->cuda_graph_launch_wrapper(m_graph_exec))); + (exec.impl_internal_space_instance()->cuda_graph_launch_wrapper( + m_graph_exec))); } execution_space const& get_execution_space() const noexcept { @@ -197,6 +206,9 @@ struct GraphImpl { m_execution_space, _graph_node_kernel_ctor_tag{}, aggregate_kernel_impl_t{}); } + + cudaGraph_t cuda_graph() { return m_graph; } + cudaGraphExec_t 
cuda_graph_exec() { return m_graph_exec; } }; } // end namespace Impl diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 158c8acdda..ec5768a7f0 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include -//#include -//#include -//#include -//#include +// #include +// #include +// #include +// #include #include #include #include @@ -687,16 +687,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << " KOKKOS_ENABLE_CUDA: yes\n"; os << "Cuda Options:\n"; - os << " KOKKOS_ENABLE_CUDA_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA - os << "yes\n"; -#else - os << "no\n"; -#endif -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - os << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; - os << "yes\n"; -#endif os << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE os << "yes\n"; @@ -708,12 +698,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "yes\n"; #else os << "no\n"; -#endif - os << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - os << "yes\n"; -#else - os << "no\n"; #endif os << " KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC: "; #ifdef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index b0dadb45f7..2d00e735cb 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -209,8 +209,8 @@ inline void configure_shmem_preference(const int cuda_device, // Use multiples of 8kB const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; size_t carveout = shmem_per_block == 0 - ? 0 - : 100 * + ? 
0 + : 100 * (((num_blocks_desired * shmem_per_block + min_shmem_size_per_sm - 1) / min_shmem_size_per_sm) * @@ -491,7 +491,10 @@ struct CudaParallelLaunchKernelInvoker< cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel( + CudaSpace::impl_create(cuda_instance->m_cudaDev, + cuda_instance->m_stream), + driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl @@ -714,7 +717,7 @@ struct CudaParallelLaunch; template CudaParallelLaunch(Args&&... args) { - base_t::launch_kernel((Args &&) args...); + base_t::launch_kernel((Args&&)args...); } }; @@ -728,7 +731,7 @@ struct CudaParallelLaunch; template CudaParallelLaunch(Args&&... args) { - base_t::create_parallel_launch_graph_node((Args &&) args...); + base_t::create_parallel_launch_graph_node((Args&&)args...); } }; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 6303898400..c50ff43034 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -95,11 +95,39 @@ class ParallelFor, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxthreads = m_rp.space().cuda_device_prop().maxThreadsDim; + [[maybe_unused]] const auto maxThreadsPerBlock = + m_rp.space().cuda_device_prop().maxThreadsPerBlock; + // make sure the Z dimension (it is less than x,y limits) isn't exceeded + const auto clampZ = [&](const int input) { + return (input > maxthreads[2] ? 
maxthreads[2] : input); + }; + // make sure the block dimensions don't exceed the max number of threads + // allowed + const auto check_block_sizes = [&]([[maybe_unused]] const dim3& block) { + KOKKOS_ASSERT(block.x > 0 && + block.x <= static_cast(maxthreads[0])); + KOKKOS_ASSERT(block.y > 0 && + block.y <= static_cast(maxthreads[1])); + KOKKOS_ASSERT(block.z > 0 && + block.z <= static_cast(maxthreads[2])); + KOKKOS_ASSERT(block.x * block.y * block.z <= + static_cast(maxThreadsPerBlock)); + }; + // make sure the grid dimensions don't exceed the max number of blocks + // allowed + const auto check_grid_sizes = [&]([[maybe_unused]] const dim3& grid) { + KOKKOS_ASSERT(grid.x > 0 && + grid.x <= static_cast(maxblocks[0])); + KOKKOS_ASSERT(grid.y > 0 && + grid.y <= static_cast(maxblocks[1])); + KOKKOS_ASSERT(grid.z > 0 && + grid.z <= static_cast(maxblocks[2])); + }; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); + check_block_sizes(block); const dim3 grid( std::min( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -108,13 +136,12 @@ class ParallelFor, Kokkos::Cuda> { (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, maxblocks[1]), 1); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], clampZ(m_rp.m_tile[2])); + check_block_sizes(block); const dim3 grid( std::min( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -125,15 +152,16 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, maxblocks[2])); + // ensure we don't exceed the capability of the device + check_grid_sizes(grid); CudaParallelLaunch( 
*this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 4) { // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2], - m_rp.m_tile[3]); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + clampZ(m_rp.m_tile[3])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -143,14 +171,15 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 5) { // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]); - KOKKOS_ASSERT(block.z > 0); + m_rp.m_tile[2] * m_rp.m_tile[3], clampZ(m_rp.m_tile[4])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -159,6 +188,7 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 6) { @@ -166,7 +196,8 @@ class ParallelFor, Kokkos::Cuda> { // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2] * m_rp.m_tile[3], - m_rp.m_tile[4] * m_rp.m_tile[5]); + clampZ(m_rp.m_tile[4] * m_rp.m_tile[5])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -174,6 +205,7 @@ class ParallelFor, Kokkos::Cuda> { maxblocks[1]), std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, 
m_rp.space().impl_internal_space_instance()); } else { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 334834938a..8251fcb248 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -48,7 +48,7 @@ class ParallelFor, Kokkos::Cuda> { const FunctorType m_functor; const Policy m_policy; - ParallelFor() = delete; + ParallelFor() = delete; ParallelFor& operator=(const ParallelFor&) = delete; template diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 71e7751821..a2955e3ab6 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -539,9 +539,14 @@ class ParallelFor, m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? 
m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -631,7 +636,7 @@ class ParallelReduce word_count(m_functor_reducer.get_reducer().value_size() / sizeof(word_size_type)); - reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = + m_functor_reducer.get_reducer().init(reinterpret_cast( + kokkos_impl_cuda_shared_memory() + + threadIdx.y * word_count.value)); // Iterate this block through the league const int int_league_size = (int)m_league_size; @@ -895,11 +901,16 @@ class ParallelReduce= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); + + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 86d6d91bbe..5090e84c38 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -31,6 +31,9 @@ //---------------------------------------------------------------------------- +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #if defined(__CUDA_ARCH__) #define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \ 
{ \ @@ -584,9 +587,9 @@ class TaskExec { private: enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize }; - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; + TaskExec(TaskExec&&) = delete; + TaskExec(TaskExec const&) = delete; + TaskExec& operator=(TaskExec&&) = delete; TaskExec& operator=(TaskExec const&) = delete; friend class Kokkos::Impl::TaskQueue< @@ -1224,5 +1227,7 @@ KOKKOS_INLINE_FUNCTION void single( #undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() + #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index c2b5f1fa78..aec692c2c3 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -184,24 +184,37 @@ class CudaTeamMember { * ( 1 == blockDim.z ) */ template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t> team_reduce(ReducerType const& reducer) const noexcept { team_reduce(reducer, reducer.reference()); } template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t> team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE(( + typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value;)) + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + (void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( - (typename Impl::FunctorAnalysis< - 
Impl::FunctorPatternInterface::REDUCE, TeamPolicy, - ReducerType, typename ReducerType::value_type>::Reducer - wrapped_reducer(reducer); - cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y); - reducer.reference() = value;)) + (cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);)) } //-------------------------------------------------------------------------- @@ -260,23 +273,42 @@ class CudaTeamMember { //---------------------------------------- template - KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t> vector_reduce(ReducerType const& reducer) { vector_reduce(reducer, reducer.reference()); } template - KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE( + (typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_vector_reduce(wrapped_reducer, value); + reducer.reference() = value;)) + } + + template + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer_v> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { + (void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( (if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; unsigned mask = blockDim.x == 32 @@ -287,7 +319,7 @@ class CudaTeamMember { for (int i = blockDim.x; (i >>= 1);) { Impl::in_place_shfl_down(tmp2, tmp, i, blockDim.x, mask); if ((int)threadIdx.x < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ 
-297,7 +329,7 @@ class CudaTeamMember { // and thus different threads could have different results. Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x, mask); - value = tmp2; reducer.reference() = tmp2;)) + value = tmp2;)) } //---------------------------------------- @@ -487,14 +519,21 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { KOKKOS_IF_ON_DEVICE( - (typename ReducerType::value_type value; + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - reducer.init(value); + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value);)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); reducer.reference() = value;)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -518,16 +557,25 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE( - (ValueType val; Kokkos::Sum reducer(val); - reducer.init(reducer.reference()); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); value_type value{}; + 
wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; - i < loop_boundaries.end; i += blockDim.y) { closure(i, val); } + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference();)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } template @@ -548,16 +596,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - KOKKOS_IF_ON_DEVICE((typename ReducerType::value_type value; - reducer.init(value); + KOKKOS_IF_ON_DEVICE( + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, value); } + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); + + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value;)) - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value);)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -573,18 +632,27 @@ parallel_reduce(const 
Impl::TeamVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE((ValueType val; Kokkos::Sum reducer(val); - reducer.init(reducer.reference()); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, val); } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference();)) + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) } //---------------------------------------------------------------------------- @@ -632,13 +700,22 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< Closure const& closure, ReducerType const& reducer) { KOKKOS_IF_ON_DEVICE(( - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.x) { 
closure(i, reducer.reference()); } + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } - Impl::CudaTeamMember::vector_reduce(reducer); + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value; )) // Avoid bogus warning about reducer value being uninitialized with combined @@ -667,15 +744,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE( - (result = ValueType(); - for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; i += blockDim.x) { closure(i, result); } + KOKKOS_IF_ON_DEVICE(( - Impl::CudaTeamMember::vector_reduce(Kokkos::Sum(result)); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - )) + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); + + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp index a3f4f2f4cc..9e0c5819f7 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -125,8 +125,8 @@ struct in_place_shfl_op { struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { 
(void)mask; (void)val; (void)lane; @@ -136,28 +136,28 @@ struct in_place_shfl_fn : in_place_shfl_op { }; template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { return __shfl_up_sync(mask, val, lane, width); } }; template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ -168,7 +168,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... 
args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index 517c592af7..0ac2d4052d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -23,15 +23,12 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, - const View& dst) { +template <> +struct ZeroMemset { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, void* dst, size_t cnt) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() - ->cuda_memset_async_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)))); + ->cuda_memset_async_wrapper(dst, 0, cnt))); } }; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp index aced2083ff..8de3a8758f 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp @@ -27,6 +27,8 @@ #include +#include + namespace Kokkos { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -49,34 +51,44 @@ void HIP::impl_initialize(InitializationSettings const& settings) { Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( hipGetDeviceProperties(&Impl::HIPInternal::m_deviceProp, hip_device_id)); - const auto& hipProp = Impl::HIPInternal::m_deviceProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(hip_device_id)); - // number of multiprocessors - Impl::HIPInternal::m_multiProcCount = hipProp.multiProcessorCount; - - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. 
- Impl::HIPInternal::m_maxWarpCount = - hipProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; - if (Impl::HIPTraits::WarpSize < Impl::HIPInternal::m_maxWarpCount) { - Impl::HIPInternal::m_maxWarpCount = Impl::HIPTraits::WarpSize; + // Check that we are running on the expected architecture. We print a warning + // instead of erroring out because AMD does not guarantee that gcnArchName + // will always contain the gfx flag. + if (Kokkos::show_warnings()) { + if (std::string_view arch_name = + Impl::HIPInternal::m_deviceProp.gcnArchName; + arch_name.find(KOKKOS_ARCH_AMD_GPU) != 0) { + std::cerr + << "Kokkos::HIP::initialize WARNING: running kernels compiled for " + << KOKKOS_ARCH_AMD_GPU << " on " << arch_name << " device.\n"; + } } - //---------------------------------- - // Maximum number of blocks - Impl::HIPInternal::m_maxBlock[0] = hipProp.maxGridSize[0]; - Impl::HIPInternal::m_maxBlock[1] = hipProp.maxGridSize[1]; - Impl::HIPInternal::m_maxBlock[2] = hipProp.maxGridSize[2]; + // Print a warning if the user did not select the right GFX942 architecture +#ifdef KOKKOS_ARCH_AMD_GFX942 + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 1)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300X " + "(discrete GPU) on a MI300A (APU).\n"; + } +#endif +#ifdef KOKKOS_ARCH_AMD_GFX942_APU + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 0)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300A " + "(APU) on a MI300X (discrete GPU).\n"; + } +#endif - // theoretically, we can get 40 WF's / CU, but only can sustain 32 see - // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 - Impl::HIPInternal::m_maxWavesPerCU = 32; - Impl::HIPInternal::m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; - Impl::HIPInternal::m_maxShmemPerBlock = hipProp.sharedMemPerBlock; + // theoretically on GFX 9XX GPUs, 
we can get 40 WF's / CU, but only can + // sustain 32 see + // https://github.com/ROCm/clr/blob/4d0b815d06751735e6a50fa46e913fdf85f751f0/hipamd/src/hip_platform.cpp#L362-L366 + const int maxWavesPerCU = + Impl::HIPInternal::m_deviceProp.major <= 9 ? 32 : 64; Impl::HIPInternal::m_maxThreadsPerSM = - Impl::HIPInternal::m_maxWavesPerCU * Impl::HIPTraits::WarpSize; + maxWavesPerCU * Impl::HIPTraits::WarpSize; // Init the array for used for arbitrarily sized atomics desul::Impl::init_lock_arrays(); // FIXME @@ -146,10 +158,6 @@ void HIP::print_configuration(std::ostream& os, bool /*verbose*/) const { #else os << "no\n"; #endif -#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY - os << " KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY: "; - os << "yes\n"; -#endif os << "\nRuntime Configuration:\n"; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 1f084c41e5..90e5cf7355 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -113,8 +113,9 @@ unsigned hip_internal_get_block_size(const HIPInternal *hip_instance, const unsigned min_waves_per_eu = LaunchBounds::minBperSM ? 
LaunchBounds::minBperSM : 1; const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize; - const unsigned shmem_per_sm = hip_instance->m_shmemPerSM; - unsigned block_size = tperb_reg; + const unsigned shmem_per_sm = + hip_instance->m_deviceProp.maxSharedMemoryPerMultiProcessor; + unsigned block_size = tperb_reg; do { unsigned total_shmem = f(block_size); // find how many threads we can fit with this blocksize based on LDS usage diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 5f0df72df1..584cc63d95 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -44,22 +44,17 @@ class GraphNodeKernelImpl // TODO use the name and executionspace template - GraphNodeKernelImpl(std::string, Kokkos::HIP const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, HIP const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) 
{} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} template GraphNodeKernelImpl(Kokkos::HIP const& exec_space, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t)); - } - } + : GraphNodeKernelImpl("[unlabeled]", exec_space, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_hip_graph_ptr(hipGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -73,18 +68,29 @@ class GraphNodeKernelImpl hipGraph_t const* get_hip_graph_ptr() const { return m_graph_ptr; } - Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr allocate_driver_memory_buffer( + const HIP& exec) const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - m_driver_storage = static_cast(Kokkos::HIPSpace().allocate( - "GraphNodeKernel global memory functor storage", sizeof(base_t))); + std::string alloc_label = + label + " - GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr( + static_cast( + HIPSpace().allocate(exec, alloc_label.c_str(), sizeof(base_t))), + // FIXME_HIP Custom deletor should use same 'exec' as for allocation. 
+ [alloc_label](base_t* ptr) { + HIPSpace().deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr); - return m_driver_storage; + return m_driver_storage.get(); } + auto get_driver_storage() const { return m_driver_storage; } + private: Kokkos::ObservingRawPtr m_graph_ptr = nullptr; Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; - Kokkos::OwningRawPtr m_driver_storage = nullptr; + mutable std::shared_ptr m_driver_storage = nullptr; + std::string label; }; struct HIPGraphNodeAggregateKernel { @@ -114,13 +120,14 @@ struct get_graph_node_kernel_type Kokkos::ParallelReduceTag>> {}; template -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const HIP& exec, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type::type; auto const& kernel_as_graph_kernel = static_cast(kernel); - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(exec); } template diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index a0989fe671..4f97214ca6 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -42,11 +42,11 @@ class GraphImpl { // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. 
- GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); @@ -60,7 +60,7 @@ class GraphImpl { template void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::HIP& exec); Kokkos::HIP const& get_execution_space() const noexcept; @@ -69,18 +69,28 @@ class GraphImpl { template auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; hipGraphNode_t error_node = nullptr; char error_log[error_log_size]; KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphInstantiate( &m_graph_exec, m_graph, &error_node, error_log, error_log_size)); + KOKKOS_ENSURES(m_graph_exec); } + hipGraph_t hip_graph() { return m_graph; } + hipGraphExec_t hip_graph_exec() { return m_graph_exec; } + + private: Kokkos::HIP m_execution_space; hipGraph_t m_graph = nullptr; hipGraphExec_t m_graph_exec = nullptr; + + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. 
+ std::vector> m_driver_storage; }; inline GraphImpl::~GraphImpl() { @@ -123,6 +133,8 @@ inline void GraphImpl::add_node( kernel.set_hip_graph_node_ptr(&node); kernel.execute(); KOKKOS_ENSURES(node); + if (std::shared_ptr tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } // Requires PredecessorRef is a specialization of GraphNodeRef that has @@ -145,16 +157,15 @@ inline void GraphImpl::add_predecessor( hipGraphAddDependencies(m_graph, &pred_node, &node, 1)); } -inline void GraphImpl::submit() { +inline void GraphImpl::submit(const Kokkos::HIP& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - KOKKOS_IMPL_HIP_SAFE_CALL( - hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream())); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphLaunch(m_graph_exec, exec.hip_stream())); } -inline Kokkos::HIP const& GraphImpl::get_execution_space() const - noexcept { +inline Kokkos::HIP const& GraphImpl::get_execution_space() + const noexcept { return m_execution_space; } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index e0b25c6939..54e8c315e3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -77,7 +77,8 @@ std::size_t scratch_count(const std::size_t size) { //---------------------------------------------------------------------------- int HIPInternal::concurrency() { - static int const concurrency = m_maxThreadsPerSM * m_multiProcCount; + static int const concurrency = + m_maxThreadsPerSM * m_deviceProp.multiProcessorCount; return concurrency; } @@ -97,6 +98,13 @@ void HIPInternal::print_configuration(std::ostream &s) const { << "undefined\n"; #endif + s << "macro KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC: "; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + s << "yes\n"; +#else + s << "no\n"; +#endif + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; 
KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); @@ -177,8 +185,16 @@ void HIPInternal::initialize(hipStream_t stream) { // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + unsigned int maxWarpCount = + m_deviceProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; + if (Impl::HIPTraits::WarpSize < maxWarpCount) { + maxWarpCount = Impl::HIPTraits::WarpSize; + } + const unsigned reduce_block_count = - m_maxWarpCount * Impl::HIPTraits::WarpSize; + maxWarpCount * Impl::HIPTraits::WarpSize; (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); @@ -353,14 +369,8 @@ void HIPInternal::finalize() { m_num_scratch_locks = 0; } -int HIPInternal::m_hipDev = -1; -unsigned HIPInternal::m_multiProcCount = 0; -unsigned HIPInternal::m_maxWarpCount = 0; -std::array HIPInternal::m_maxBlock = {0, 0, 0}; -unsigned HIPInternal::m_maxWavesPerCU = 0; -int HIPInternal::m_shmemPerSM = 0; -int HIPInternal::m_maxShmemPerBlock = 0; -int HIPInternal::m_maxThreadsPerSM = 0; +int HIPInternal::m_hipDev = -1; +int HIPInternal::m_maxThreadsPerSM = 0; hipDeviceProp_t HIPInternal::m_deviceProp; @@ -372,15 +382,7 @@ std::mutex HIPInternal::constantMemMutex; //---------------------------------------------------------------------------- Kokkos::HIP::size_type hip_internal_multiprocessor_count() { - return HIPInternal::singleton().m_multiProcCount; -} - -Kokkos::HIP::size_type hip_internal_maximum_warp_count() { - return HIPInternal::singleton().m_maxWarpCount; -} - -std::array hip_internal_maximum_grid_count() { - return HIPInternal::singleton().m_maxBlock; + return HIPInternal::singleton().m_deviceProp.multiProcessorCount; } Kokkos::HIP::size_type *hip_internal_scratch_space(const HIP &instance, diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp 
b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 437a84253f..d8043dc23d 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -31,7 +31,7 @@ namespace Impl { struct HIPTraits { #if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \ defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \ - defined(KOKKOS_ARCH_AMD_GFX942) + defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU) static constexpr int WarpSize = 64; static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ @@ -52,8 +52,6 @@ struct HIPTraits { //---------------------------------------------------------------------------- -HIP::size_type hip_internal_maximum_warp_count(); -std::array hip_internal_maximum_grid_count(); HIP::size_type hip_internal_multiprocessor_count(); HIP::size_type *hip_internal_scratch_space(const HIP &instance, @@ -72,12 +70,6 @@ class HIPInternal { using size_type = ::Kokkos::HIP::size_type; static int m_hipDev; - static unsigned m_multiProcCount; - static unsigned m_maxWarpCount; - static std::array m_maxBlock; - static unsigned m_maxWavesPerCU; - static int m_shmemPerSM; - static int m_maxShmemPerBlock; static int m_maxThreadsPerSM; static hipDeviceProp_t m_deviceProp; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 7cd0afcf47..e243eb07e7 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -25,11 +25,7 @@ #include #include -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) -#define KOKKOS_IMPL_HIP_GRAPH_ENABLED -#endif - -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH #include #include #endif @@ -173,15 +169,15 @@ struct DeduceHIPLaunchMechanism { static constexpr HIPLaunchMechanism 
launch_mechanism = ((property & force_global_launch) == force_global_launch) ? HIPLaunchMechanism::GlobalMemory - : ((property & light_weight) == light_weight) - ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit - ? HIPLaunchMechanism::LocalMemory - : HIPLaunchMechanism::GlobalMemory) - : (((property & heavy_weight) == heavy_weight) - ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage - ? HIPLaunchMechanism::ConstantMemory - : HIPLaunchMechanism::GlobalMemory) - : (default_launch_mechanism)); + : ((property & light_weight) == light_weight) + ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit + ? HIPLaunchMechanism::LocalMemory + : HIPLaunchMechanism::GlobalMemory) + : (((property & heavy_weight) == heavy_weight) + ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage + ? HIPLaunchMechanism::ConstantMemory + : HIPLaunchMechanism::GlobalMemory) + : (default_launch_mechanism)); }; template m_stream, ManageStream::no), driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl // which is guaranteed to be alive until the graph instance itself is // destroyed, where there should be a fence ensuring that the allocation // associated with this kernel on the device side isn't deleted. 
- hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault, - hip_instance->m_stream); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), + hipMemcpyDefault, hip_instance->m_stream)); void const *args[] = {&driver_ptr}; @@ -551,11 +549,11 @@ struct HIPParallelLaunch< LaunchMechanism>; HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, + const dim3 &block, const unsigned int shmem, const HIPInternal *hip_instance, const bool /*prefer_shmem*/) { if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (hip_instance->m_maxShmemPerBlock < shmem) { + if (hip_instance->m_deviceProp.sharedMemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( "HIPParallelLaunch FAILED: shared memory request is too large"); } @@ -585,7 +583,7 @@ void hip_parallel_launch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, const HIPInternal *hip_instance, const bool prefer_shmem) { -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH if constexpr (DoGraph) { // Graph launch using base_t = HIPParallelLaunchKernelInvoker, HIP> { const Policy m_policy; public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; inline __device__ void operator()() const { @@ -57,7 +57,7 @@ class ParallelFor, HIP> { inline void execute() const { using ClosureType = ParallelFor; if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); + auto const maxblocks = m_policy.space().hip_device_prop().maxGridSize; if (Policy::rank == 2) { dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); dim3 const grid( diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp index 9355c1c75f..3985dc60f0 100644 
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -53,8 +53,8 @@ class ParallelFor, Kokkos::HIP> { public: using functor_type = FunctorType; - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; inline __device__ void operator()() const { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp index bf0c219338..83e890bce9 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -71,8 +71,8 @@ class ParallelFor, HIP> { } public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; __device__ inline void operator()() const { @@ -120,9 +120,14 @@ class ParallelFor, HIP> { m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? 
m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -149,8 +154,9 @@ class ParallelFor, HIP> { static_cast(m_league_size)))); } - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + unsigned int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp index 0c24e5cc62..fb4ff937cd 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -46,6 +46,22 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
+ using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::HIP::size_type), + std::conditional_t, + Kokkos::HIP::size_type>; using reducer_type = ReducerType; using size_type = HIP::size_type; @@ -72,7 +88,7 @@ class ParallelReduce const - word_count(reducer.value_size() / sizeof(size_type)); + integral_nonzero_constant const + word_count(reducer.value_size() / sizeof(word_size_type)); - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = reducer.init(reinterpret_cast( + kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value)); // Iterate this block through the league iterate_through_league(threadid, value); // Reduce with final value at blockDim.y - 1 location. bool do_final_reduce = (m_league_size == 0); if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, - m_scratch_flags); + do_final_reduce = hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); if (do_final_reduce) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; + word_size_type* const shared = + kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? 
reinterpret_cast(m_result_ptr) + : m_scratch_space; if (threadIdx.y == 0) { reducer.final(reinterpret_cast(shared)); @@ -227,7 +244,8 @@ class ParallelReduce(m_scratch_space), result, m_scratch_flags, blockDim.y)) { unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { @@ -249,8 +267,9 @@ class ParallelReduce(hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count)); m_scratch_flags = hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); @@ -306,11 +325,15 @@ class ParallelReduce= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction @@ -356,7 +379,8 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " "L0 scratch memory")); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index 83f829fdda..0b67921809 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -23,7 +23,7 @@ #include #include -#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::HIPSpace); #else diff --git 
a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index 1ca7bd5cd0..a464609108 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -20,7 +20,7 @@ #include #include -#if defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPSpace); #else KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp index 4035bb0121..feee44ccaf 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -100,7 +100,7 @@ template __device__ inline bool hip_inter_block_shuffle_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, FunctorType const& reducer, - HIP::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, HIP::size_type* const m_scratch_flags, int const max_active_thread = blockDim.y) { @@ -115,9 +115,8 @@ __device__ inline bool hip_inter_block_shuffle_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = - reinterpret_cast(m_scratch_space) + blockIdx.x; - *global = value; + pointer_type global = m_scratch_space + blockIdx.x; + *global = value; __threadfence(); } @@ -140,8 +139,7 @@ __device__ inline bool hip_inter_block_shuffle_reduction( last_block = true; value = neutral; - pointer_type const global = - reinterpret_cast(m_scratch_space); + pointer_type const global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = blockDim.x * blockDim.y < warp_size 
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 67635fc1c4..47f07b31ab 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -51,28 +51,54 @@ static std::atomic is_first_hip_managed_allocation(true); namespace Kokkos { -HIPSpace::HIPSpace() : m_device(HIP().hip_device()) {} +HIPSpace::HIPSpace() + : m_device(HIP().hip_device()), m_stream(HIP().hip_stream()) {} HIPHostPinnedSpace::HIPHostPinnedSpace() {} HIPManagedSpace::HIPManagedSpace() : m_device(HIP().hip_device()) {} +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY +void* HIPSpace::allocate(const HIP& exec_space, + const size_t arg_alloc_size) const { + return allocate(exec_space, "[unlabeled]", arg_alloc_size); +} + +void* HIPSpace::allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(exec_space.hip_stream(), arg_label, arg_alloc_size, + arg_logical_size, true); +} +#endif + void* HIPSpace::allocate(const size_t arg_alloc_size) const { return allocate("[unlabeled]", arg_alloc_size); } -void* HIPSpace::allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +void* HIPSpace::allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(m_stream, arg_label, arg_alloc_size, arg_logical_size, + false); } + void* HIPSpace::impl_allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { + [[maybe_unused]] const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, const size_t arg_logical_size, + [[maybe_unused]] const bool stream_sync_only) const { void* ptr = nullptr; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + auto 
const error_code = hipMallocAsync(&ptr, arg_alloc_size, stream); + if (stream_sync_only) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(stream)); + } else { + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); + } +#else auto const error_code = hipMalloc(&ptr, arg_alloc_size); +#endif + if (error_code != hipSuccess) { // This is the only way to clear the last error, which we should do here // since we're turning it into an exception here @@ -80,6 +106,8 @@ void* HIPSpace::impl_allocate( Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { + const Kokkos::Tools::SpaceHandle arg_handle = + Kokkos::Tools::make_space_handle(name()); const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); @@ -219,7 +247,12 @@ void HIPSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + KOKKOS_IMPL_HIP_SAFE_CALL(hipFreeAsync(arg_alloc_ptr, m_stream)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); +#else KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); +#endif } void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr, diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp index e1b4768b87..2380772cac 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp @@ -58,14 +58,14 @@ class HIPSpace { /*--------------------------------*/ HIPSpace(); - HIPSpace(HIPSpace&& rhs) = default; - HIPSpace(const HIPSpace& rhs) = default; - HIPSpace& operator=(HIPSpace&& rhs) = default; + HIPSpace(HIPSpace&& rhs) = default; + HIPSpace(const HIPSpace& rhs) = default; + HIPSpace& operator=(HIPSpace&& rhs) = default; HIPSpace& operator=(const HIPSpace& rhs) = default; ~HIPSpace() = default; /**\brief Allocate 
untracked memory in the hip space */ -#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifdef KOKKOS_IMPL_HIP_UNIFIED_MEMORY template void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { return allocate(arg_alloc_size); @@ -77,15 +77,10 @@ class HIPSpace { return allocate(arg_label, arg_alloc_size, arg_logical_size); } #else - // FIXME_HIP Use execution space instance - void* allocate(const HIP&, const size_t arg_alloc_size) const { - return allocate(arg_alloc_size); - } - // FIXME_HIP Use execution space instance - void* allocate(const HIP&, const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return allocate(arg_label, arg_alloc_size, arg_logical_size); - } + void* allocate(const HIP& exec_space, const size_t arg_alloc_size) const; + void* allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; #endif void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, @@ -98,10 +93,10 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; + void* impl_allocate(const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size, + bool stream_sync_only) const; void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -114,6 +109,7 @@ class HIPSpace { private: int m_device; ///< Which HIP device + hipStream_t m_stream; }; template <> @@ -140,9 +136,9 @@ class HIPHostPinnedSpace { /*--------------------------------*/ HIPHostPinnedSpace(); - HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; - HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = 
default; - HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; + HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; HIPHostPinnedSpace& operator=(const HIPHostPinnedSpace& rhs) = default; ~HIPHostPinnedSpace() = default; @@ -213,9 +209,9 @@ class HIPManagedSpace { /*--------------------------------*/ HIPManagedSpace(); - HIPManagedSpace(HIPManagedSpace&& rhs) = default; - HIPManagedSpace(const HIPManagedSpace& rhs) = default; - HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(const HIPManagedSpace& rhs) = default; + HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; HIPManagedSpace& operator=(const HIPManagedSpace& rhs) = default; ~HIPManagedSpace() = default; @@ -280,7 +276,7 @@ static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); template <> struct MemorySpaceAccess { enum : bool { assignable = false }; -#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +#if !defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) enum : bool{accessible = false}; #else enum : bool { accessible = true }; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index fb466d8a72..1724b4361d 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -183,7 +183,7 @@ class HIPTeamMember { typename Kokkos::Impl::FunctorAnalysis< FunctorPatternInterface::REDUCE, TeamPolicy, ReducerType, typename ReducerType::value_type>::Reducer wrapped_reducer(reducer); - hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value; #else (void)reducer; @@ -191,6 +191,19 @@ class HIPTeamMember { #endif } + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + 
WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { +#ifdef __HIP_DEVICE_COMPILE__ + hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); +#else + (void)wrapped_reducer; + (void)value; +#endif + } + //-------------------------------------------------------------------------- /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team non-deterministic ordering accumulation. @@ -261,17 +274,37 @@ class HIPTeamMember { KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { +#ifdef __HIP_DEVICE_COMPILE__ + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; +#else + (void)reducer; + (void)value; +#endif + } + + template + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { #ifdef __HIP_DEVICE_COMPILE__ if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = blockDim.x; (i >>= 1);) { in_place_shfl_down(tmp2, tmp, i, blockDim.x); if (static_cast(threadIdx.x) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -281,10 +314,9 @@ class HIPTeamMember { // and thus different threads could have different results. 
in_place_shfl(tmp2, tmp, 0, blockDim.x); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; #else - (void)reducer; + (void)wrapped_reducer; (void)value; #endif } @@ -479,15 +511,26 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -508,24 +551,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType val; - Kokkos::Sum reducer(val); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; - i += 
blockDim.y) { - closure(i, val); - } + for (iType i = loop_boundaries.start + threadIdx.y; + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } /** \brief Inter-thread parallel exclusive prefix sum. @@ -620,16 +663,26 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; i < loop_boundaries.end; i += blockDim.y * blockDim.x) { closure(i, value); } - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -642,25 +695,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType 
val; - Kokkos::Sum reducer(val); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; i += blockDim.y * blockDim.x) { - closure(i, val); - } + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- @@ -706,14 +761,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = 
loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; i += blockDim.x) { - closure(i, reducer.reference()); + closure(i, value); } - Impl::HIPTeamMember::vector_reduce(reducer); + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -737,20 +804,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - result = ValueType(); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; - i += blockDim.x) { - closure(i, result); - } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - Impl::HIPTeamMember::vector_reduce(Kokkos::Sum(result)); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp index 67e1181125..f21c65f16d 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ 
-222,7 +222,8 @@ class TeamPolicyInternal m_tune_team_size(bool(team_size_request <= 0)), m_tune_vector_length(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) + const int max_grid_size_x = m_space.hip_device_prop().maxGridSize[0]; + if (league_size_ >= max_grid_size_x) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on HIP execution " "space."); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp index 30774c898b..f5b1d321e8 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp @@ -40,8 +40,8 @@ struct in_place_shfl_op { template // requires _assignable_from_bits __device__ inline std::enable_if_t operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { using shfl_type = int; union conv_type { Scalar orig; @@ -65,16 +65,16 @@ struct in_place_shfl_op { template // requires _assignable_from_bits __device__ inline std::enable_if_t operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast(out) = self().do_shfl_op( reinterpret_cast(in), lane_or_delta, width); } template __device__ inline std::enable_if_t - operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast(out) = self().do_shfl_op( *reinterpret_cast(&in), lane_or_delta, width); } @@ -82,8 +82,8 @@ struct in_place_shfl_op { // sizeof(Scalar) > sizeof(double) case template __device__ inline std::enable_if_t<(sizeof(Scalar) > sizeof(double))> - 
operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, const Scalar& val, int lane_or_delta, + int width) const noexcept { using shuffle_as_t = int; constexpr int N = sizeof(Scalar) / sizeof(shuffle_as_t); @@ -108,7 +108,7 @@ struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op { @@ -123,7 +123,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op { @@ -138,7 +138,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp new file mode 100644 index 0000000000..34d5ecf1a6 --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp @@ -0,0 +1,36 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#endif + +#include +#include + +namespace Kokkos { +namespace Impl { + +// alternative to hipMemsetAsync, which sets the first `cnt` bytes of `dst` to 0 +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt) { + Kokkos::parallel_for( + "Kokkos::ZeroMemset via parallel_for", + Kokkos::RangePolicy(exec_space, 0, cnt), + KOKKOS_LAMBDA(size_t i) { static_cast(dst)[i] = 0; }); +} + +} // namespace Impl +} // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 4bca29868f..18708cf8c5 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -23,12 +23,21 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const HIP& exec_space, const View& dst) { - KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( - dst.data(), 0, dst.size() * sizeof(typename View::value_type), - exec_space.hip_stream())); +// hipMemsetAsync sets the first `cnt` bytes of `dst` to the provided value +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt); + +template <> +struct ZeroMemset { + ZeroMemset(const HIP& exec_space, void* dst, size_t cnt) { + // in ROCm <= 6.2.0, hipMemsetAsync on a host-allocated pointer + // returns an invalid value error, but accessing the data via a + // GPU kernel works. 
+#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) + zero_with_hip_kernel(exec_space, dst, cnt); +#else + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemsetAsync(dst, 0, cnt, exec_space.hip_stream())); +#endif } }; diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp index 245dc128ca..7d49933790 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp @@ -32,12 +32,10 @@ static_assert(false, #include #include #include -#include #include #include #include #include -#include #include #include @@ -75,12 +73,12 @@ class hpx_thread_buffer { } public: - hpx_thread_buffer() = default; - ~hpx_thread_buffer() = default; - hpx_thread_buffer(const hpx_thread_buffer &) = delete; - hpx_thread_buffer(hpx_thread_buffer &&) = delete; + hpx_thread_buffer() = default; + ~hpx_thread_buffer() = default; + hpx_thread_buffer(const hpx_thread_buffer &) = delete; + hpx_thread_buffer(hpx_thread_buffer &&) = delete; hpx_thread_buffer &operator=(const hpx_thread_buffer &) = delete; - hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; + hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; void resize(const std::size_t num_threads, const std::size_t size_per_thread, const std::size_t extra_space = 0) noexcept; @@ -140,10 +138,10 @@ class HPX { hpx::execution::experimental::unique_any_sender<> &&sender) : m_instance_id(instance_id), m_sender{std::move(sender)} {} - instance_data(const instance_data &) = delete; - instance_data(instance_data &&) = delete; + instance_data(const instance_data &) = delete; + instance_data(instance_data &&) = delete; instance_data &operator=(const instance_data &) = delete; - instance_data &operator=(instance_data) = delete; + instance_data &operator=(instance_data) = delete; uint32_t m_instance_id{HPX::impl_default_instance_id()}; hpx::execution::experimental::unique_any_sender<> m_sender{ @@ -196,7 +194,7 @@ class HPX { HPX(HPX &&other) = default; HPX(const HPX &other) = default; - HPX 
&operator=(HPX &&) = default; + HPX &operator=(HPX &&) = default; HPX &operator=(const HPX &) = default; void print_configuration(std::ostream &os, bool /*verbose*/ = false) const; @@ -214,9 +212,9 @@ class HPX { struct impl_in_parallel_scope { impl_in_parallel_scope() noexcept; ~impl_in_parallel_scope() noexcept; - impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; - impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; - impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; + impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; impl_in_parallel_scope &operator=(impl_in_parallel_scope const &) = delete; }; @@ -249,13 +247,15 @@ class HPX { impl_instance_fence(name); } - static bool is_asynchronous(HPX const & = HPX()) noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static bool is_asynchronous(HPX const & = HPX()) noexcept { #if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) return true; #else return false; #endif } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); @@ -281,8 +281,8 @@ class HPX { return impl_get_instance_data().m_buffer; } - hpx::execution::experimental::unique_any_sender<> &impl_get_sender() const - noexcept { + hpx::execution::experimental::unique_any_sender<> &impl_get_sender() + const noexcept { return impl_get_instance_data().m_sender; } @@ -447,6 +447,20 @@ class HPX { } }; +template +std::vector partition_space(HPX const &, Args... 
args) { + std::vector instances(sizeof...(args)); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + +template +std::vector partition_space(HPX const &, std::vector const &weights) { + std::vector instances(weights.size()); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + extern template void HPX::impl_bulk_plain_erased( bool, bool, std::function &&, int const, hpx::threads::thread_stacksize stacksize) const; @@ -1772,11 +1786,24 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } /** \brief Intra-thread vector parallel_for. 
Executes lambda(iType i) for each @@ -1810,14 +1837,26 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + wrapped_reducer.final(&value); + result = value; } template &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type 
wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template @@ -1995,7 +2060,9 @@ KOKKOS_INLINE_FUNCTION void single( } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #endif /* #if defined( KOKKOS_ENABLE_HPX ) */ #endif /* #ifndef KOKKOS_HPX_HPP */ diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp index 28c75b2515..d775b7fac3 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp @@ -25,6 +25,8 @@ #include +#include + #include #include @@ -33,6 +35,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -256,6 +263,10 @@ extern template class TaskQueue< } // namespace Impl } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 297b1fadee..92dc506c5e 100644 --- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -30,6 +30,7 @@ static_assert(false, #include #include #include +#include namespace Kokkos { @@ -60,13 +61,13 @@ namespace Impl { // NOTE the 
comparison below is encapsulated to silent warnings about pointless // comparison of unsigned integer with zero template -constexpr std::enable_if_t::value, bool> +constexpr std::enable_if_t, bool> is_less_than_value_initialized_variable(T) { return false; } template -constexpr std::enable_if_t::value, bool> +constexpr std::enable_if_t, bool> is_less_than_value_initialized_variable(T arg) { return arg < T{}; } @@ -75,7 +76,7 @@ is_less_than_value_initialized_variable(T arg) { template constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = - (std::is_signed::value != std::is_signed::value); + (std::is_signed_v != std::is_signed_v); auto const ret = static_cast(arg); if (static_cast(ret) != arg || (is_different_signedness && @@ -183,7 +184,7 @@ struct MDRangePolicy template friend struct MDRangePolicy; - static_assert(!std::is_void::value, + static_assert(!std::is_void_v, "Kokkos Error: MD iteration pattern not defined"); using iteration_pattern = typename traits::iteration_pattern; @@ -238,9 +239,9 @@ struct MDRangePolicy template ::value && - std::is_integral::value && - std::is_integral::value>> + typename = std::enable_if_t && + std::is_integral_v && + std::is_integral_v>> MDRangePolicy(const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) : MDRangePolicy( @@ -257,9 +258,9 @@ struct MDRangePolicy template ::value && - std::is_integral::value && - std::is_integral::value>> + typename = std::enable_if_t && + std::is_integral_v && + std::is_integral_v>> MDRangePolicy(const typename traits::execution_space& work_space, const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) @@ -291,14 +292,14 @@ struct MDRangePolicy } template ::value>> + typename = std::enable_if_t>> MDRangePolicy(Kokkos::Array const& lower, Kokkos::Array const& upper, Kokkos::Array const& tile = Kokkos::Array{}) : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} template ::value>> + 
typename = std::enable_if_t>> MDRangePolicy(const typename traits::execution_space& work_space, Kokkos::Array const& lower, Kokkos::Array const& upper, @@ -330,7 +331,44 @@ struct MDRangePolicy } bool impl_tune_tile_size() const { return m_tune_tile_size; } + tile_type tile_size_recommended() const { + tile_type rec_tile_sizes = {}; + + for (std::size_t i = 0; i < rec_tile_sizes.size(); ++i) { + rec_tile_sizes[i] = tile_size_recommended(i); + } + return rec_tile_sizes; + } + + int max_total_tile_size() const { + return Impl::get_tile_size_properties(m_space).max_total_tile_size; + } + private: + int tile_size_recommended(const int tile_rank) const { + auto properties = Impl::get_tile_size_properties(m_space); + int last_rank = (inner_direction == Iterate::Right) ? rank - 1 : 0; + int rank_acc = + (inner_direction == Iterate::Right) ? tile_rank + 1 : tile_rank - 1; + int rec_tile_size = (std::pow(properties.default_tile_size, rank_acc) < + properties.max_total_tile_size) + ? properties.default_tile_size + : 1; + + if (tile_rank == last_rank) { + rec_tile_size = tile_size_last_rank( + properties, m_upper[last_rank] - m_lower[last_rank]); + } + return rec_tile_size; + } + + int tile_size_last_rank(const Impl::TileSizeProperties properties, + const index_type length) const { + return properties.default_largest_tile_size == 0 + ? std::max(length, 1) + : properties.default_largest_tile_size; + } + void init_helper(Impl::TileSizeProperties properties) { m_prod_tile_dims = 1; int increment = 1; @@ -341,6 +379,7 @@ struct MDRangePolicy rank_start = rank - 1; rank_end = -1; } + for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; @@ -368,9 +407,7 @@ struct MDRangePolicy m_tile[i] = 1; } } else { - m_tile[i] = properties.default_largest_tile_size == 0 - ? 
std::max(length, 1) - : properties.default_largest_tile_size; + m_tile[i] = tile_size_last_rank(properties, length); } } m_tile_end[i] = @@ -389,58 +426,55 @@ struct MDRangePolicy }; template -MDRangePolicy(const LT (&)[N], const UT (&)[N])->MDRangePolicy>; +MDRangePolicy(const LT (&)[N], const UT (&)[N]) -> MDRangePolicy>; template MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], - const TT (&)[TN]) - ->MDRangePolicy>; + const TT (&)[TN]) -> MDRangePolicy>; template >> MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) - ->MDRangePolicy>; + -> MDRangePolicy>; template -MDRangePolicy(Array const&, Array const&)->MDRangePolicy>; +MDRangePolicy(Array const&, Array const&) -> MDRangePolicy>; template MDRangePolicy(Array const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, Array const&, - Array const&) - ->MDRangePolicy>; + Array const&) -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, Array const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, Array const&, Array const&, - Array const&) - ->MDRangePolicy>; + Array const&) -> MDRangePolicy>; } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp index 9f5deed5d6..62f527aa02 100644 --- a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp 
@@ -41,10 +41,10 @@ class AnonymousSpace { using device_type = Kokkos::Device; /**\brief Default memory space instance */ - AnonymousSpace() = default; - AnonymousSpace(AnonymousSpace &&rhs) = default; - AnonymousSpace(const AnonymousSpace &rhs) = default; - AnonymousSpace &operator=(AnonymousSpace &&) = default; + AnonymousSpace() = default; + AnonymousSpace(AnonymousSpace &&rhs) = default; + AnonymousSpace(const AnonymousSpace &rhs) = default; + AnonymousSpace &operator=(AnonymousSpace &&) = default; AnonymousSpace &operator=(const AnonymousSpace &) = default; ~AnonymousSpace() = default; diff --git a/lib/kokkos/core/src/Kokkos_Array.hpp b/lib/kokkos/core/src/Kokkos_Array.hpp index 4d905fbc55..493536b53b 100644 --- a/lib/kokkos/core/src/Kokkos_Array.hpp +++ b/lib/kokkos/core/src/Kokkos_Array.hpp @@ -35,7 +35,7 @@ namespace Kokkos { #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK namespace Impl { -template ::value> +template > struct ArrayBoundsCheck; template @@ -195,8 +195,10 @@ struct Array { return *reinterpret_cast(-1); } - KOKKOS_INLINE_FUNCTION pointer data() { return nullptr; } - KOKKOS_INLINE_FUNCTION const_pointer data() const { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr pointer data() { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { + return nullptr; + } friend KOKKOS_FUNCTION constexpr bool operator==(Array const&, Array const&) noexcept { @@ -365,7 +367,7 @@ struct KOKKOS_DEPRECATED #endif template -Array(T, Us...)->Array; +Array(T, Us...) 
-> Array; namespace Impl { @@ -377,7 +379,7 @@ KOKKOS_FUNCTION constexpr Array, N> to_array_impl( template KOKKOS_FUNCTION constexpr Array, N> to_array_impl( - T(&&a)[N], std::index_sequence) { + T (&&a)[N], std::index_sequence) { return {{std::move(a[I])...}}; } @@ -389,7 +391,7 @@ KOKKOS_FUNCTION constexpr auto to_array(T (&a)[N]) { } template -KOKKOS_FUNCTION constexpr auto to_array(T(&&a)[N]) { +KOKKOS_FUNCTION constexpr auto to_array(T (&&a)[N]) { return Impl::to_array_impl(std::move(a), std::make_index_sequence{}); } @@ -435,6 +437,32 @@ KOKKOS_FUNCTION constexpr T const&& get(Array const&& a) noexcept { } // namespace Kokkos // +// +namespace Kokkos { + +template +KOKKOS_FUNCTION constexpr T const* begin(Array const& a) noexcept { + return a.data(); +} + +template +KOKKOS_FUNCTION constexpr T* begin(Array& a) noexcept { + return a.data(); +} + +template +KOKKOS_FUNCTION constexpr T const* end(Array const& a) noexcept { + return a.data() + a.size(); +} + +template +KOKKOS_FUNCTION constexpr T* end(Array& a) noexcept { + return a.data() + a.size(); +} + +} // namespace Kokkos +// + #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp index 6fc903f274..ba61136092 100644 --- a/lib/kokkos/core/src/Kokkos_Atomic.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp @@ -47,7 +47,6 @@ #include #include -#include #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp deleted file mode 100644 index bf57dcae65..0000000000 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ /dev/null @@ -1,196 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#include -#include - -#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() -#else -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() -#endif - -// clang-format off -namespace Kokkos { - -template KOKKOS_INLINE_FUNCTION -T atomic_load(volatile T* const dest) { return desul::atomic_load(const_cast(dest), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_store(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_store(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// atomic_fetch_op -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_add (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_sub (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_max 
(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_min (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mul (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_div (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mod (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_and (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_or (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_xor (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_nand(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - 
-template KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_lshift(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_rshift(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(volatile T* const dest) { return desul::atomic_fetch_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(volatile T* const dest) { return desul::atomic_fetch_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op_fetch -template KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul_fetch 
(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mod_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_and_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_or_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_xor_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_nand_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_lshift_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_rshift_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), 
KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(volatile T* const dest) { return desul::atomic_inc_fetch(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(volatile T* const dest) { return desul::atomic_dec_fetch(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op -template KOKKOS_INLINE_FUNCTION -void atomic_add(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_sub(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_mul(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_div(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_min(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_max(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template KOKKOS_INLINE_FUNCTION -void atomic_and(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) 
desul::atomic_fetch_and (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template KOKKOS_INLINE_FUNCTION -void atomic_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_or (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_inc(volatile T* const dest) { return desul::atomic_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_dec(volatile T* const dest) { return desul::atomic_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_increment(volatile T* const dest) { return desul::atomic_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_decrement(volatile T* const dest) { return desul::atomic_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// Exchange - -template KOKKOS_INLINE_FUNCTION -T atomic_exchange(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_exchange(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(volatile T* const dest, T& expected, const T desired) { - return desul::atomic_compare_exchange_strong(const_cast(dest),expected, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -template KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(volatile T* const dest, const T compare, const T desired) { - return desul::atomic_compare_exchange(const_cast(dest),compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -} -#undef KOKKOS_DESUL_MEM_SCOPE - -// clang-format on -#endif diff --git 
a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index 26db69ac1f..40f51c5a33 100644 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -24,14 +24,16 @@ static_assert(false, #include #include +#include // identity_type #include -// clang-format off namespace Kokkos { -// FIXME: These functions don't have any use/test in unit tests ... -// ========================================================== -inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline const char* atomic_query_version() { + return "KOKKOS_DESUL_ATOMICS"; +} +#endif #if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \ !defined(__CUDA_ARCH__) @@ -53,197 +55,120 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() #endif -template KOKKOS_INLINE_FUNCTION -T atomic_load(T* const dest) { return desul::atomic_load(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_store(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_store(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_assign(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { atomic_store(dest,val); } - -KOKKOS_INLINE_FUNCTION -void memory_fence() { - desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); -} - -KOKKOS_INLINE_FUNCTION -void load_fence() { return desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } - -KOKKOS_INLINE_FUNCTION -void store_fence() { return desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } - -// atomic_fetch_op -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_add 
(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mod (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (T* const dest, 
desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_xor (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_nand(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_lshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_rshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(T* const dest) { return desul::atomic_fetch_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(T* const dest) { return desul::atomic_fetch_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op_fetch -template KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min_fetch (dest, val, 
desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mod_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_and_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_or_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_xor_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_nand_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_lshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_rshift_fetch(dest, val, 
desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(T* const dest) { return desul::atomic_inc_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(T* const dest) { return desul::atomic_dec_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op -template KOKKOS_INLINE_FUNCTION -void atomic_add(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_sub(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_mul(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_div(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_min(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_max(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template KOKKOS_INLINE_FUNCTION -void atomic_and(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or 
-template KOKKOS_INLINE_FUNCTION -void atomic_or(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_inc(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_dec(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_increment(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_decrement(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// Exchange - -template KOKKOS_INLINE_FUNCTION -T atomic_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_exchange(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(T* const dest, desul::Impl::dont_deduce_this_parameter_t expected, desul::Impl::dont_deduce_this_parameter_t desired) { - T expected_ref = expected; - return desul::atomic_compare_exchange_strong(dest, expected_ref, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -template KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t compare, desul::Impl::dont_deduce_this_parameter_t desired) { - return desul::atomic_compare_exchange(dest, compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - namespace Impl { - template KOKKOS_INLINE_FUNCTION - bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess succ, MemOrderFailure fail) { - return 
desul::atomic_compare_exchange_strong(dest, expected, desired, succ, fail, KOKKOS_DESUL_MEM_SCOPE); - } - template - KOKKOS_INLINE_FUNCTION - T atomic_load(const T* const src, MemoryOrder order) { - return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); - } - template - KOKKOS_INLINE_FUNCTION - void atomic_store(T* const src, const T val, MemoryOrder order) { - return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); - } +template +using not_deduced_atomic_t = + std::add_const_t>>; + +template +using enable_if_atomic_t = + std::enable_if_t && !std::is_const_v, + std::remove_volatile_t>; } // namespace Impl +// clang-format off + +// fences +KOKKOS_INLINE_FUNCTION void memory_fence() { desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void load_fence() { desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void store_fence() { desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } + +// load/store +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_load (T const* ptr) { return desul::atomic_load (const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_store(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_store(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_store() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_assign(T* ptr, Impl::not_deduced_atomic_t val) { atomic_store(ptr, val); } +#endif + +// atomic_fetch_op +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_add(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t 
atomic_fetch_sub(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_sub(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_max(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_max(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_min(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_mul(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_div(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_mod(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_and(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_or (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_xor(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t 
atomic_fetch_nand(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_nand(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_lshift(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_rshift(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_inc(T* ptr) { return desul::atomic_fetch_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_dec(T* ptr) { return desul::atomic_fetch_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } + +// atomic_op_fetch +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_add_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_add_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_sub_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_sub_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_max_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_min_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul_fetch(T* ptr, 
Impl::not_deduced_atomic_t val) { return desul::atomic_mul_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_div_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_mod_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_and_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or_fetch (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_or_fetch (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_xor_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_lshift_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_rshift_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc_fetch(T* 
ptr) { return desul::atomic_inc_fetch(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec_fetch(T* ptr) { return desul::atomic_dec_fetch(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } + +// atomic_op +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_add(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_sub(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_sub(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_max(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION 
Impl::enable_if_atomic_t atomic_or (T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc(T* ptr) { desul::atomic_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec(T* ptr) { desul::atomic_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_inc() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_increment(T* ptr) { atomic_inc(ptr); } +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_dec() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_decrement(T* ptr) { atomic_dec(ptr); } +#endif + +// exchange +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_exchange (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_exchange (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template 
KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return desul::atomic_compare_exchange(const_cast*>(ptr), expected, desired, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_compare_exchange() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange_strong(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return expected == atomic_compare_exchange(ptr, expected, desired); } +#endif + +// clang-format on } // namespace Kokkos +namespace Kokkos::Impl { + +template +KOKKOS_FUNCTION bool atomic_compare_exchange_strong(T* const dest, T& expected, + const T desired, + MemOrderSuccess succ, + MemOrderFailure fail) { + return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, + fail, KOKKOS_DESUL_MEM_SCOPE); +} + +template +KOKKOS_FUNCTION T atomic_load(const T* const src, MemoryOrder order) { + return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); +} + +template +KOKKOS_FUNCTION void atomic_store(T* const src, const T val, + MemoryOrder order) { + return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); +} + +} // namespace Kokkos::Impl + #undef KOKKOS_DESUL_MEM_SCOPE -// clang-format on #endif diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 7dd2a9ddbb..8233c30b24 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -70,9 +70,8 @@ class complex& operator=(const complex&) noexcept = default; /// \brief Conversion constructor from compatible RType - template < - class RType, - std::enable_if_t::value, int> = 0> + template , int> = 0> KOKKOS_INLINE_FUNCTION complex(const complex& other) noexcept // Intentionally do the conversions implicitly here so that users don't // get any warnings about 
narrowing, etc., that they would expect to get @@ -265,9 +264,8 @@ class #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 //! Copy constructor from volatile. - template < - class RType, - std::enable_if_t::value, int> = 0> + template , int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex(const volatile complex& src) noexcept // Intentionally do the conversions implicitly here so that users don't @@ -296,7 +294,7 @@ class // vl = r; // vl = cr; template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator=( const Complex& src) volatile noexcept { re_ = src.re_; @@ -319,7 +317,7 @@ class // vl = vr; // vl = cvr; template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION volatile complex& operator=( const volatile Complex& src) volatile noexcept { re_ = src.re_; @@ -341,7 +339,7 @@ class // l = cvr; // template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex& operator=( const volatile Complex& src) noexcept { re_ = src.re_; @@ -539,7 +537,7 @@ inline bool operator==(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(complex const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t; @@ -551,7 +549,7 @@ KOKKOS_INLINE_FUNCTION bool operator==(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(RealType1 const& x, complex const& y) noexcept { using common_type = std::common_type_t; @@ -590,7 +588,7 @@ inline bool operator!=(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in 
oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(complex const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t; @@ -602,7 +600,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(RealType1 const& x, complex const& y) noexcept { using common_type = std::common_type_t; @@ -778,16 +776,14 @@ KOKKOS_INLINE_FUNCTION complex pow(const complex& x, return x == T() ? T() : exp(y * log(x)); } -template ::value>> +template >> KOKKOS_INLINE_FUNCTION complex> pow( const T& x, const complex& y) { using type = Impl::promote_2_t; return pow(type(x), complex(y)); } -template ::value>> +template >> KOKKOS_INLINE_FUNCTION complex> pow(const complex& x, const U& y) { using type = Impl::promote_2_t; diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp index df78a644a0..0bfb9eb5fa 100644 --- a/lib/kokkos/core/src/Kokkos_Concepts.hpp +++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp @@ -41,8 +41,7 @@ struct Dynamic {}; // Schedule Wrapper Type template struct Schedule { - static_assert(std::is_same::value || - std::is_same::value, + static_assert(std::is_same_v || std::is_same_v, "Kokkos: Invalid Schedule<> type."); using schedule_type = Schedule; using type = T; @@ -51,7 +50,7 @@ struct Schedule { // Specify Iteration Index Type template struct IndexType { - static_assert(std::is_integral::value, "Kokkos: Invalid IndexType<>."); + static_assert(std::is_integral_v, "Kokkos: Invalid IndexType<>."); using index_type = IndexType; using type = T; }; @@ -139,8 +138,8 @@ namespace Kokkos { \ public: \ static constexpr bool value = \ - std::is_base_of, T>::value || \ - std::is_base_of, T>::value; \ + 
std::is_base_of_v, T> || \ + std::is_base_of_v, T>; \ constexpr operator bool() const noexcept { return value; } \ }; \ template \ @@ -292,44 +291,6 @@ struct is_space { using execution_space = typename is_exe::space; using memory_space = typename is_mem::space; - - // For backward compatibility, deprecated in favor of - // Kokkos::Impl::HostMirror::host_mirror_space - - private: - // The actual definitions for host_memory_space and host_execution_spaces are - // in do_not_use_host_memory_space and do_not_use_host_execution_space to be - // able to use them within this class without deprecation warnings. - using do_not_use_host_memory_space = std::conditional_t< - std::is_same::value -#if defined(KOKKOS_ENABLE_CUDA) - || std::is_same::value || - std::is_same::value -#elif defined(KOKKOS_ENABLE_HIP) - || std::is_same::value || - std::is_same::value -#elif defined(KOKKOS_ENABLE_SYCL) - || std::is_same::value || - std::is_same::value -#endif - , - memory_space, Kokkos::HostSpace>; - - using do_not_use_host_execution_space = std::conditional_t< -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_HIP) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_SYCL) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) - std::is_same::value || -#endif - false, - Kokkos::DefaultHostExecutionSpace, execution_space>; }; } // namespace Kokkos @@ -357,7 +318,7 @@ struct MemorySpaceAccess { * 2. All execution spaces that can access DstMemorySpace can also access * SrcMemorySpace. */ - enum { assignable = std::is_same::value }; + enum { assignable = std::is_same_v }; /**\brief For all DstExecSpace::memory_space == DstMemorySpace * DstExecSpace can access SrcMemorySpace. @@ -442,7 +403,7 @@ struct SpaceAccessibility { // If same memory space or not accessible use the AccessSpace // else construct a device with execution space and memory space. 
using space = std::conditional_t< - std::is_same::value || + std::is_same_v || !exe_access::accessible, AccessSpace, Kokkos::Device>; diff --git a/lib/kokkos/core/src/Kokkos_CopyViews.hpp b/lib/kokkos/core/src/Kokkos_CopyViews.hpp index e856b19247..7da59aa4e4 100644 --- a/lib/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/lib/kokkos/core/src/Kokkos_CopyViews.hpp @@ -561,21 +561,20 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -649,21 +648,20 @@ void view_copy(const DstType& dst, const SrcType& src) { int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1350,22 +1348,20 @@ inline void contiguous_fill( } // Default implementation for execution spaces that don't provide a definition -template +template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst) { - using ValueType = typename ViewType::value_type; - 
alignas(alignof(ValueType)) unsigned char - zero_initialized_storage[sizeof(ValueType)] = {}; - contiguous_fill(exec_space, dst, - *reinterpret_cast(zero_initialized_storage)); + ZeroMemset(const ExecutionSpace& exec_space, void* dst, size_t cnt) { + contiguous_fill( + exec_space, + Kokkos::View( + static_cast(dst), cnt), + std::byte{}); } }; template inline std::enable_if_t< - std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value> + std::is_trivial_v::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { @@ -1375,20 +1371,20 @@ contiguous_fill_or_memset( && !std::is_same_v #endif ) - // FIXME intel/19 icpc fails to deduce template parameters here, + // FIXME intel/19 icpc fails to deduce template parameter here, // resulting in compilation errors; explicitly passing the template - // parameters to ZeroMemset helps workaround the issue - // See https://github.com/kokkos/kokkos/issues/6775 - ZeroMemset>(exec_space, dst); + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. 
+ ZeroMemset( + exec_space, dst.data(), + dst.size() * sizeof(typename ViewTraits::value_type)); else contiguous_fill(exec_space, dst, value); } template inline std::enable_if_t< - !(std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value)> + !std::is_trivial_v::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { @@ -1397,9 +1393,7 @@ contiguous_fill_or_memset( template inline std::enable_if_t< - std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value> + std::is_trivial_v::value_type>> contiguous_fill_or_memset( const View& dst, typename ViewTraits::const_value_type& value) { @@ -1411,11 +1405,12 @@ contiguous_fill_or_memset( // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - // FIXME intel/19 icpc fails to deduce template parameters here, + // FIXME intel/19 icpc fails to deduce template parameter here, // resulting in compilation errors; explicitly passing the template - // parameters to ZeroMemset helps workaround the issue - // See https://github.com/kokkos/kokkos/issues/6775 - ZeroMemset(exec, dst); + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. 
+ ZeroMemset( + exec, dst.data(), dst.size() * sizeof(typename ViewType::value_type)); else #endif contiguous_fill(exec, dst, value); @@ -1423,9 +1418,7 @@ contiguous_fill_or_memset( template inline std::enable_if_t< - !(std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value)> + !std::is_trivial_v::value_type>> contiguous_fill_or_memset( const View& dst, typename ViewTraits::const_value_type& value) { @@ -1441,8 +1434,8 @@ template inline void deep_copy( const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { using ViewType = View; using exec_space_type = typename ViewType::execution_space; @@ -1464,8 +1457,8 @@ inline void deep_copy( } Kokkos::fence("Kokkos::deep_copy: scalar copy, pre copy fence"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); // If contiguous we can simply do a 1D flat loop or use memset @@ -1482,21 +1475,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[ViewType::rank > 0 ? 
ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1539,8 +1531,8 @@ template inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const View& src, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; @@ -1576,8 +1568,8 @@ template inline void deep_copy( const View& dst, const View& src, std::enable_if_t< - (std::is_void::specialize>::value && - std::is_void::specialize>::value && + (std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) == unsigned(0) && unsigned(ViewTraits::rank) == unsigned(0)))>* = nullptr) { using dst_type = View; @@ -1587,8 +1579,8 @@ inline void deep_copy( using dst_memory_space = typename dst_type::memory_space; using src_memory_space = typename src_type::memory_space; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires matching non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -1628,8 +1620,8 @@ template inline void deep_copy( const View& dst, const View& src, std::enable_if_t< - (std::is_void::specialize>::value && - std::is_void::specialize>::value && + (std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) != 0 || unsigned(ViewTraits::rank) != 0))>* = nullptr) { using dst_type = View; @@ -1641,8 +1633,8 @@ inline void deep_copy( using dst_value_type = typename dst_type::value_type; using src_value_type = typename src_type::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), 
@@ -1772,10 +1764,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same::value && - (std::is_same::value || + if (std::is_same_v && + (std::is_same_v || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2191,8 +2183,8 @@ template void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const TeamType& team, const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { Kokkos::parallel_for(Kokkos::TeamVectorRange(team, dst.span()), [&](const int& i) { dst.data()[i] = value; }); } @@ -2201,8 +2193,8 @@ template void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { for (size_t i = 0; i < dst.span(); ++i) { dst.data()[i] = value; } @@ -2568,13 +2560,13 @@ inline void deep_copy( typename ViewTraits::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && Kokkos::SpaceAccessibility:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2594,21 +2586,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if 
(std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -2649,13 +2640,13 @@ inline void deep_copy( typename ViewTraits::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && !Kokkos::SpaceAccessibility:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2696,8 +2687,8 @@ inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const View& src, std::enable_if_t::value && - std::is_same::specialize, - void>::value>* = nullptr) { + std::is_same_v::specialize, + void>>* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; static_assert(src_traits::rank == 0, @@ -2734,8 +2725,8 @@ inline void deep_copy( const View& src, std::enable_if_t< (Kokkos::is_execution_space::value && - std::is_void::specialize>::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) == unsigned(0) && unsigned(ViewTraits::rank) == unsigned(0)))>* = nullptr) { using src_traits = ViewTraits; @@ -2743,8 +2734,8 @@ inline void deep_copy( using src_memory_space = typename src_traits::memory_space; using dst_memory_space = typename dst_traits::memory_space; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires matching 
non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2784,15 +2775,15 @@ inline void deep_copy( const View& src, std::enable_if_t< (Kokkos::is_execution_space::value && - std::is_void::specialize>::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) != 0 || unsigned(ViewTraits::rank) != 0))>* = nullptr) { using dst_type = View; using src_type = View; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), @@ -2922,10 +2913,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same::value && - (std::is_same::value || + if (std::is_same_v && + (std::is_same_v || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2994,11 +2985,11 @@ bool size_mismatch(const ViewType& view, unsigned int max_extent, /** \brief Resize a view with copying old data to new data at the corresponding * indices. 
*/ template -inline typename std::enable_if< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value>::type +inline std::enable_if_t< + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, @@ -3048,10 +3039,10 @@ impl_resize(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3066,10 +3057,10 @@ resize(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> resize(Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3085,10 +3076,10 @@ template inline std::enable_if_t< (Impl::is_view_ctor_property::value || Kokkos::is_execution_space::value) && - (std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>)> resize(const I& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ 
-3103,12 +3094,12 @@ resize(const I& arg_prop, Kokkos::View& v, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3149,12 +3140,12 @@ impl_resize(const Impl::ViewCtorProp& arg_prop, // the same as the existing one. template inline std::enable_if_t< - !(std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value)> + !(std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>)> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3218,10 +3209,10 @@ inline void resize(Kokkos::View& v, /** \brief Resize a view with discarding old data. 
*/ template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, const size_t n6, const size_t n7, @@ -3264,10 +3255,10 @@ impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> realloc(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3283,10 +3274,10 @@ realloc(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> realloc(Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3302,10 +3293,10 @@ realloc(Kokkos::View& v, template inline std::enable_if_t< Impl::is_view_ctor_property::value && - (std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>)> realloc(const I& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3320,12 +3311,12 @@ realloc(const I& arg_prop, Kokkos::View& v, template inline std::enable_if_t< - std::is_same::array_layout, - 
Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3365,12 +3356,12 @@ impl_realloc(Kokkos::View& v, // the same as the existing one. template inline std::enable_if_t< - !(std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value)> + !(std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>)> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3435,7 +3426,7 @@ struct MirrorViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -3450,26 +3441,6 @@ struct MirrorViewType { std::conditional_t; }; -template -struct MirrorType { - // The incoming view_type - using src_view_type = typename Kokkos::View; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. 
- using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::View; -}; - // collection of static asserts for create_mirror and create_mirror_view template void check_view_ctor_args_create_mirror() { @@ -3503,7 +3474,7 @@ inline auto create_mirror(const Kokkos::View& src, if constexpr (Impl::ViewCtorProp::has_memory_space) { using memory_space = typename decltype(prop_copy)::memory_space; using dst_type = - typename Impl::MirrorType::view_type; + typename Impl::MirrorViewType::dest_view_type; return dst_type(prop_copy, src.layout()); } else { using dst_type = typename View::HostMirror; @@ -3636,12 +3607,12 @@ inline auto create_mirror_view( const Kokkos::View& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::View< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::View< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename Kokkos::View::data_type, + typename Kokkos::View::HostMirror::data_type>) { check_view_ctor_args_create_mirror(); return typename Kokkos::View::HostMirror(src); } else { @@ -3785,8 +3756,7 @@ create_mirror_view_and_copy( const Space&, const Kokkos::View& src, std::string const& name = "", std::enable_if_t< - std::is_void::specialize>::value>* = - nullptr) { + std::is_void_v::specialize>>* = nullptr) { return create_mirror_view_and_copy( Kokkos::view_alloc(typename Space::memory_space{}, name), src); } diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index 1f146563be..9588d289a9 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -63,7 +63,9 @@ #include #include #include 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #include #include @@ -248,9 +250,9 @@ class KOKKOS_ATTRIBUTE_NODISCARD ScopeGuard { } ScopeGuard& operator=(const ScopeGuard&) = delete; - ScopeGuard& operator=(ScopeGuard&&) = delete; - ScopeGuard(const ScopeGuard&) = delete; - ScopeGuard(ScopeGuard&&) = delete; + ScopeGuard& operator=(ScopeGuard&&) = delete; + ScopeGuard(const ScopeGuard&) = delete; + ScopeGuard(ScopeGuard&&) = delete; }; } // namespace Kokkos @@ -281,7 +283,7 @@ std::vector partition_space(ExecSpace const& space, "Kokkos Error: partition_space expects an Execution Space as " "first argument"); static_assert( - std::is_arithmetic::value, + std::is_arithmetic_v, "Kokkos Error: partitioning arguments must be integers or floats"); std::vector instances(weights.size()); diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index 7edb35f00e..5dbe571429 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -106,8 +106,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = HIP; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) -using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = - Experimental::SYCL; +using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = SYCL; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Experimental::OpenACC; @@ -122,7 +121,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial; #else #error \ - "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::Experimental::SYCL, 
Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." + "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." #endif #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) @@ -162,7 +161,7 @@ using SharedSpace = CudaUVMSpace; using SharedSpace = HIPManagedSpace; #define KOKKOS_HAS_SHARED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) -using SharedSpace = Experimental::SYCLSharedUSMSpace; +using SharedSpace = SYCLSharedUSMSpace; #define KOKKOS_HAS_SHARED_SPACE // if only host compile point to HostSpace #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) @@ -184,7 +183,7 @@ using SharedHostPinnedSpace = CudaHostPinnedSpace; using SharedHostPinnedSpace = HIPHostPinnedSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) - using SharedHostPinnedSpace = Experimental::SYCLHostUSMSpace; + using SharedHostPinnedSpace = SYCLHostUSMSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) using SharedHostPinnedSpace = HostSpace; diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index 92931b5849..69223b6412 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -84,12 +84,12 @@ class Crs { /* * Default Constructors, operators and destructor */ - KOKKOS_DEFAULTED_FUNCTION Crs() = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; KOKKOS_DEFAULTED_FUNCTION Crs& 
operator=(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; - KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; /** \brief Assign to a view of the rhs array. * If the old view is the last view @@ -148,7 +148,7 @@ class GetCrsTransposeCounts { public: KOKKOS_INLINE_FUNCTION - void operator()(index_type i) const { atomic_increment(&out[in.entries(i)]); } + void operator()(index_type i) const { atomic_inc(&out[in.entries(i)]); } GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out) : in(arg_in), out(arg_out) { using policy_type = RangePolicy; @@ -345,7 +345,7 @@ struct CountAndFill : public CountAndFillBase { closure.execute(); } auto nentries = Kokkos::get_crs_row_map_from_counts(this->m_crs.row_map, - this->m_counts); + this->m_counts); this->m_counts = counts_type(); this->m_crs.entries = entries_type("entries", nentries); { diff --git a/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp b/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp index ae28805a42..8af10b2a40 100644 --- a/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp +++ b/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp @@ -54,8 +54,8 @@ struct detector>, Op, Args...> { } // namespace Impl struct nonesuch : private Impl::nonesuch_base { - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; void operator=(nonesuch const&) = delete; }; @@ -81,7 +81,7 @@ inline constexpr bool is_detected_v = is_detected::value; template class Op, class... Args> inline constexpr bool is_detected_exact_v = - is_detected_exact::value; + is_detected_exact::value; // NOLINT template class Op, class... 
Args> inline constexpr bool is_detected_convertible_v = diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index b8d7f77deb..dd7ce5ce21 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -27,7 +27,10 @@ static_assert(false, #include #include #include +#include +#ifndef KOKKOS_ENABLE_IMPL_TYPEINFO #include +#endif #include //---------------------------------------------------------------------------- @@ -197,8 +200,7 @@ class RangePolicy : public Impl::PolicyTraits { /** \brief finalize chunk_size if it was set to AUTO*/ inline void set_auto_chunk_size() { #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same_v) { + if (std::is_same_v) { // chunk_size <=1 lets the compiler choose the workgroup size when // launching kernels m_granularity = 1; @@ -248,46 +250,49 @@ class RangePolicy : public Impl::PolicyTraits { // To be replaced with std::in_range (c++20) template - static void check_conversion_safety(const IndexType bound) { + static void check_conversion_safety([[maybe_unused]] const IndexType bound) { + // Checking that the round-trip conversion preserves input index value + if constexpr (std::is_convertible_v) { #if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) - std::string msg = - "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " - "is performed on a bound (" + - std::to_string(bound) + - "), which may " - "not preserve its original value.\n"; - bool warn = false; + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; - if constexpr (std::is_signed_v != - std::is_signed_v) { - // check signed to unsigned - if constexpr (std::is_signed_v) - warn |= (bound < static_cast( - std::numeric_limits::min())); + if constexpr 
(std::is_arithmetic_v && + (std::is_signed_v != + std::is_signed_v)) { + // check signed to unsigned + if constexpr (std::is_signed_v) + warn |= (bound < static_cast( + std::numeric_limits::min())); - // check unsigned to signed - if constexpr (std::is_signed_v) - warn |= (bound > static_cast( - std::numeric_limits::max())); - } + // check unsigned to signed + if constexpr (std::is_signed_v) + warn |= (bound > static_cast( + std::numeric_limits::max())); + } - // check narrowing - warn |= (static_cast(static_cast(bound)) != bound); + // check narrowing + warn |= + (static_cast(static_cast(bound)) != bound); - if (warn) { + if (warn) { #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 - Kokkos::abort(msg.c_str()); + Kokkos::abort(msg.c_str()); #endif #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS - Kokkos::Impl::log_warning(msg); + Kokkos::Impl::log_warning(msg); +#endif + } #endif } -#else - (void)bound; -#endif } public: @@ -333,20 +338,20 @@ class RangePolicy : public Impl::PolicyTraits { }; }; -RangePolicy()->RangePolicy<>; +RangePolicy() -> RangePolicy<>; -RangePolicy(int64_t, int64_t)->RangePolicy<>; -RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>; +RangePolicy(int64_t, int64_t) -> RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&) -> RangePolicy<>; -RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t) -> RangePolicy<>; RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) - ->RangePolicy<>; + -> RangePolicy<>; template >> -RangePolicy(ES const&, int64_t, int64_t)->RangePolicy; +RangePolicy(ES const&, int64_t, int64_t) -> RangePolicy; template >> -RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy; +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&) -> RangePolicy; } // namespace Kokkos @@ -515,24 +520,24 @@ struct PerThreadValue { template struct ExtractVectorLength { static inline iType value( - 
std::enable_if_t::value, iType> val, Args...) { + std::enable_if_t, iType> val, Args...) { return val; } - static inline std::enable_if_t::value, int> value( - std::enable_if_t::value, iType>, Args...) { + static inline std::enable_if_t, int> value( + std::enable_if_t, iType>, Args...) { return 1; } }; template -inline std::enable_if_t::value, iType> -extract_vector_length(iType val, Args...) { +inline std::enable_if_t, iType> extract_vector_length( + iType val, Args...) { return val; } template -inline std::enable_if_t::value, int> -extract_vector_length(iType, Args...) { +inline std::enable_if_t, int> extract_vector_length( + iType, Args...) { return 1; } @@ -577,7 +582,7 @@ struct ScratchRequest { } }; -// Throws a runtime exception if level is not `0` or `1` +// Causes abnormal program termination if level is not `0` or `1` void team_policy_check_valid_storage_level_argument(int level); /** \brief Execution policy for parallel work over a league of teams of @@ -721,55 +726,54 @@ class TeamPolicy // Execution space not provided deduces to TeamPolicy<> -TeamPolicy()->TeamPolicy<>; +TeamPolicy() -> TeamPolicy<>; -TeamPolicy(int, int)->TeamPolicy<>; -TeamPolicy(int, int, int)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&, int)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&)->TeamPolicy<>; -TeamPolicy(int, int, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, int) -> TeamPolicy<>; +TeamPolicy(int, int, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, int, Kokkos::AUTO_t const&) -> TeamPolicy<>; // DefaultExecutionSpace deduces to TeamPolicy<> -TeamPolicy(DefaultExecutionSpace const&, int, int)->TeamPolicy<>; -TeamPolicy(DefaultExecutionSpace const&, int, int, int)->TeamPolicy<>; 
+TeamPolicy(DefaultExecutionSpace const&, int, int) -> TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, int) -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&) - ->TeamPolicy<>; + -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int) - ->TeamPolicy<>; + -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, - Kokkos::AUTO_t const&) - ->TeamPolicy<>; + Kokkos::AUTO_t const&) -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&) - ->TeamPolicy<>; + -> TeamPolicy<>; // ES != DefaultExecutionSpace deduces to TeamPolicy template >> -TeamPolicy(ES const&, int, int)->TeamPolicy; +TeamPolicy(ES const&, int, int) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, int, int)->TeamPolicy; +TeamPolicy(ES const&, int, int, int) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, Kokkos::AUTO_t const&)->TeamPolicy; +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int)->TeamPolicy; +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int) -> TeamPolicy; template >> TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) - ->TeamPolicy; + -> TeamPolicy; template >> -TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&)->TeamPolicy; +TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&) -> TeamPolicy; namespace Impl { @@ -1041,7 +1045,7 @@ struct TeamThreadMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE TeamThreadMDRange(TeamHandle const&, Args&&...) - ->TeamThreadMDRange, TeamHandle>; + -> TeamThreadMDRange, TeamHandle>; template struct ThreadVectorMDRange; @@ -1078,7 +1082,7 @@ struct ThreadVectorMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE ThreadVectorMDRange(TeamHandle const&, Args&&...) 
- ->ThreadVectorMDRange, TeamHandle>; + -> ThreadVectorMDRange, TeamHandle>; template struct TeamVectorMDRange; @@ -1115,7 +1119,7 @@ struct TeamVectorMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE TeamVectorMDRange(TeamHandle const&, Args&&...) - ->TeamVectorMDRange, TeamHandle>; + -> TeamVectorMDRange, TeamHandle>; template @@ -1162,7 +1166,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( Kokkos::HIP> #elif defined(KOKKOS_ENABLE_SYCL) || std::is_same_v + Kokkos::SYCL> #endif ) policy.team.vector_reduce( @@ -1198,7 +1202,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( Kokkos::HIP> #elif defined(KOKKOS_ENABLE_SYCL) || std::is_same_v + Kokkos::SYCL> #endif ) policy.team.vector_reduce( @@ -1217,15 +1221,21 @@ KOKKOS_INLINE_FUNCTION void parallel_for( namespace Impl { template ::value> + bool HasTag = !std::is_void_v> struct ParallelConstructName; template struct ParallelConstructName { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = + std::string(TypeInfo>::name()) + + "/" + std::string(TypeInfo::name()); +#else default_name = std::string(typeid(FunctorType).name()) + "/" + typeid(TagType).name(); +#endif } } std::string const& get() { @@ -1239,7 +1249,11 @@ template struct ParallelConstructName { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { - default_name = std::string(typeid(FunctorType).name()); +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = TypeInfo>::name(); +#else + default_name = typeid(FunctorType).name(); +#endif } } std::string const& get() { diff --git a/lib/kokkos/core/src/Kokkos_Extents.hpp b/lib/kokkos/core/src/Kokkos_Extents.hpp index 9bc2eda604..7d1f8c755d 100644 --- a/lib/kokkos/core/src/Kokkos_Extents.hpp +++ b/lib/kokkos/core/src/Kokkos_Extents.hpp @@ -134,7 +134,7 @@ struct ApplyExtent { template struct ApplyExtent { - using type = ValueType * [Ext]; + using type = ValueType* [Ext]; }; 
template diff --git a/lib/kokkos/core/src/Kokkos_Future.hpp b/lib/kokkos/core/src/Kokkos_Future.hpp index 0b3a153de8..c26d08be1c 100644 --- a/lib/kokkos/core/src/Kokkos_Future.hpp +++ b/lib/kokkos/core/src/Kokkos_Future.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_FUTURE_HPP #define KOKKOS_FUTURE_HPP @@ -41,13 +47,19 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // For now, hack this in as a partial specialization // TODO @tasking @cleanup Make this the "normal" class template and make the old // code the specialization template -class BasicFuture> { +class KOKKOS_DEPRECATED + BasicFuture> { public: using value_type = ValueType; using execution_space = ExecutionSpace; @@ -244,7 +256,7 @@ class BasicFuture> { //////////////////////////////////////////////////////////////////////////////// template -class BasicFuture { +class KOKKOS_DEPRECATED BasicFuture { private: template friend class BasicTaskScheduler; @@ -413,13 +425,13 @@ class BasicFuture { // Is a Future with the given execution space template -struct is_future : public std::false_type {}; +struct KOKKOS_DEPRECATED is_future : public std::false_type {}; template -struct is_future, ExecSpace> +struct KOKKOS_DEPRECATED is_future, ExecSpace> : std::bool_constant< - std::is_same::value || - std::is_void::value> {}; + std::is_same_v || + std::is_void_v> {}; //////////////////////////////////////////////////////////////////////////////// // 
END OLD CODE @@ -432,8 +444,8 @@ class ResolveFutureArgOrder { private: enum { Arg1_is_space = Kokkos::is_space::value }; enum { Arg2_is_space = Kokkos::is_space::value }; - enum { Arg1_is_value = !Arg1_is_space && !std::is_void::value }; - enum { Arg2_is_value = !Arg2_is_space && !std::is_void::value }; + enum { Arg1_is_value = !Arg1_is_space && !std::is_void_v }; + enum { Arg2_is_value = !Arg2_is_space && !std::is_void_v }; static_assert(!(Arg1_is_space && Arg2_is_space), "Future cannot be given two spaces"); @@ -463,10 +475,15 @@ class ResolveFutureArgOrder { * */ template -using Future = typename Impl::ResolveFutureArgOrder::type; +using Future KOKKOS_DEPRECATED = + typename Impl::ResolveFutureArgOrder::type; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Graph.hpp b/lib/kokkos/core/src/Kokkos_Graph.hpp index 9cc6650e26..05d774ac61 100644 --- a/lib/kokkos/core/src/Kokkos_Graph.hpp +++ b/lib/kokkos/core/src/Kokkos_Graph.hpp @@ -86,10 +86,21 @@ struct [[nodiscard]] Graph { return m_impl_ptr->get_execution_space(); } - void submit() const { + void instantiate() { KOKKOS_EXPECTS(bool(m_impl_ptr)) - (*m_impl_ptr).submit(); + (*m_impl_ptr).instantiate(); } + + void submit(const execution_space& exec) const { + KOKKOS_EXPECTS(bool(m_impl_ptr)) + (*m_impl_ptr).submit(exec); + } + + void submit() const { submit(get_execution_space()); } + + decltype(auto) native_graph(); + + decltype(auto) native_graph_exec(); }; // end Graph }}}1 @@ -135,22 +146,68 @@ Graph create_graph(ExecutionSpace ex, Closure&& arg_closure) { // function template injection works. 
auto rv = Kokkos::Impl::GraphAccess::construct_graph(std::move(ex)); // Invoke the user's graph construction closure - ((Closure &&) arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); + ((Closure&&)arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); // and given them back the graph // KOKKOS_ENSURES(rv.m_impl_ptr.use_count() == 1) return rv; } +template +std::enable_if_t, + Graph> +create_graph(ExecutionSpace exec = ExecutionSpace{}) { + return Kokkos::Impl::GraphAccess::construct_graph(std::move(exec)); +} + template < class ExecutionSpace = DefaultExecutionSpace, class Closure = Kokkos::Impl::DoNotExplicitlySpecifyThisTemplateParameter> -Graph create_graph(Closure&& arg_closure) { - return create_graph(ExecutionSpace{}, (Closure &&) arg_closure); +std::enable_if_t< + !Kokkos::is_execution_space_v>, + Graph> +create_graph(Closure&& arg_closure) { + return create_graph(ExecutionSpace{}, (Closure&&)arg_closure); } // end create_graph }}}1 //============================================================================== +template +decltype(auto) Graph::native_graph() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + return m_impl_ptr->cuda_graph(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->hip_graph(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->sycl_graph(); + } +#endif +} + +template +decltype(auto) Graph::native_graph_exec() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + return m_impl_ptr->cuda_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->hip_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr 
(std::is_same_v) { + return m_impl_ptr->sycl_graph_exec(); + } +#endif +} + } // end namespace Experimental } // namespace Kokkos @@ -163,7 +220,7 @@ Graph create_graph(Closure&& arg_closure) { #include #if defined(KOKKOS_ENABLE_HIP) // The implementation of hipGraph in ROCm 5.2 is bugged, so we cannot use it. -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) +#if defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) #include #endif #endif diff --git a/lib/kokkos/core/src/Kokkos_GraphNode.hpp b/lib/kokkos/core/src/Kokkos_GraphNode.hpp index 2a4e2cf641..a0a60c07d0 100644 --- a/lib/kokkos/core/src/Kokkos_GraphNode.hpp +++ b/lib/kokkos/core/src/Kokkos_GraphNode.hpp @@ -48,7 +48,7 @@ class GraphNodeRef { // intended to be SFINAE-safe, so do validation before you instantiate. static_assert( - std::is_same::value || + std::is_same_v || Kokkos::Impl::is_specialization_of::value, "Invalid predecessor template parameter given to GraphNodeRef"); @@ -56,7 +56,7 @@ class GraphNodeRef { Kokkos::is_execution_space::value, "Invalid execution space template parameter given to GraphNodeRef"); - static_assert(std::is_same::value || + static_assert(std::is_same_v || Kokkos::Impl::is_graph_kernel::value, "Invalid kernel template parameter given to GraphNodeRef"); @@ -151,7 +151,7 @@ class GraphNodeRef { typename return_t::node_impl_t>( m_node_impl->execution_space_instance(), Kokkos::Impl::_graph_node_kernel_ctor_tag{}, - (NextKernelDeduced &&) arg_kernel, + (NextKernelDeduced&&)arg_kernel, // *this is the predecessor Kokkos::Impl::_graph_node_predecessor_ctor_tag{}, *this)); @@ -184,10 +184,10 @@ class GraphNodeRef { // {{{3 // Copyable and movable (basically just shared_ptr semantics - GraphNodeRef() noexcept = default; - GraphNodeRef(GraphNodeRef const&) = default; - GraphNodeRef(GraphNodeRef&&) noexcept = default; - GraphNodeRef& operator=(GraphNodeRef const&) = default; + GraphNodeRef() noexcept = default; + GraphNodeRef(GraphNodeRef const&) = default; + GraphNodeRef(GraphNodeRef&&) 
noexcept = default; + GraphNodeRef& operator=(GraphNodeRef const&) = default; GraphNodeRef& operator=(GraphNodeRef&&) noexcept = default; ~GraphNodeRef() = default; @@ -197,19 +197,19 @@ class GraphNodeRef { //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // {{{3 - template < - class OtherKernel, class OtherPredecessor, - std::enable_if_t< - // Not a copy/move constructor - !std::is_same>::value && - // must be an allowed type erasure of the kernel - Kokkos::Impl::is_compatible_type_erasure::value && - // must be an allowed type erasure of the predecessor - Kokkos::Impl::is_compatible_type_erasure< - OtherPredecessor, graph_predecessor>::value, - int> = 0> + template > && + // must be an allowed type erasure of the kernel + Kokkos::Impl::is_compatible_type_erasure< + OtherKernel, graph_kernel>::value && + // must be an allowed type erasure of the predecessor + Kokkos::Impl::is_compatible_type_erasure< + OtherPredecessor, graph_predecessor>::value, + int> = 0> /* implicit */ GraphNodeRef( GraphNodeRef const& other) @@ -257,7 +257,7 @@ class GraphNodeRef { //|| policy_t::execution_space_is_defaulted, "Execution Space mismatch between execution policy and graph"); - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using next_policy_t = decltype(policy); @@ -266,8 +266,8 @@ class GraphNodeRef { std::decay_t, Kokkos::ParallelForTag>; return this->_then_kernel(next_kernel_t{std::move(arg_name), policy.space(), - (Functor &&) functor, - (Policy &&) policy}); + (Functor&&)functor, + (Policy&&)policy}); } template < @@ -280,8 +280,7 @@ class GraphNodeRef { int> = 0> auto then_parallel_for(Policy&& policy, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor - return this->then_parallel_for("", (Policy &&) policy, - (Functor &&) functor); + return this->then_parallel_for("", (Policy&&)policy, 
(Functor&&)functor); } template @@ -290,13 +289,13 @@ class GraphNodeRef { // needs to static assert constraint: DataParallelFunctor return this->then_parallel_for(std::move(name), Kokkos::RangePolicy(0, n), - (Functor &&) functor); + (Functor&&)functor); } template auto then_parallel_for(std::size_t n, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor - return this->then_parallel_for("", n, (Functor &&) functor); + return this->then_parallel_for("", n, (Functor&&)functor); } // end then_parallel_for }}}2 @@ -359,6 +358,23 @@ class GraphNodeRef { Kokkos::is_reducer::value, "Output argument to parallel reduce in a graph must be a " "View or a Reducer"); + + if constexpr (Kokkos::is_reducer_v) { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename return_type_remove_cvref:: + result_view_type::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } else { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, + typename return_type_remove_cvref::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } + using return_type = // Yes, you do really have to do this... 
std::conditional_t::value, @@ -373,7 +389,7 @@ class GraphNodeRef { // End of Kokkos reducer disaster //---------------------------------------- - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using passed_reducer_type = typename return_value_adapter::reducer_type; @@ -399,7 +415,7 @@ class GraphNodeRef { return this->_then_kernel(next_kernel_t{ std::move(arg_name), graph_impl_ptr->get_execution_space(), - functor_reducer, (Policy &&) policy, + functor_reducer, (Policy&&)policy, return_value_adapter::return_value(return_value, functor)}); } @@ -413,9 +429,9 @@ class GraphNodeRef { int> = 0> auto then_parallel_reduce(Policy&& arg_policy, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", (Policy &&) arg_policy, - (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", (Policy&&)arg_policy, + (Functor&&)functor, + (ReturnType&&)return_value); } template @@ -425,15 +441,15 @@ class GraphNodeRef { ReturnType&& return_value) const { return this->then_parallel_reduce( std::move(label), Kokkos::RangePolicy{0, idx_end}, - (Functor &&) functor, (ReturnType &&) return_value); + (Functor&&)functor, (ReturnType&&)return_value); } template auto then_parallel_reduce(typename execution_space::size_type idx_end, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", idx_end, (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", idx_end, (Functor&&)functor, + (ReturnType&&)return_value); } // end then_parallel_reduce }}}2 diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp index 8b5f29f95b..706586826f 100644 --- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -63,10 +63,10 @@ class HostSpace { //! 
This memory space preferred device_type using device_type = Kokkos::Device; - HostSpace() = default; - HostSpace(HostSpace&& rhs) = default; - HostSpace(const HostSpace& rhs) = default; - HostSpace& operator=(HostSpace&&) = default; + HostSpace() = default; + HostSpace(HostSpace&& rhs) = default; + HostSpace(const HostSpace& rhs) = default; + HostSpace& operator=(HostSpace&&) = default; HostSpace& operator=(const HostSpace&) = default; ~HostSpace() = default; @@ -183,18 +183,6 @@ namespace Kokkos { namespace Impl { -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy_async(exec, dst, src, n); - } -}; - template struct DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { @@ -202,10 +190,15 @@ struct DeepCopy { } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); + if constexpr (!Kokkos::SpaceAccessibility::accessible) { + exec.fence( + "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); + hostspace_parallel_deepcopy_async(dst, src, n); + } else { + hostspace_parallel_deepcopy_async(exec, dst, src, n); + } } }; diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp index 37b80e54a8..a760e7054a 100644 --- a/lib/kokkos/core/src/Kokkos_Layout.hpp +++ b/lib/kokkos/core/src/Kokkos_Layout.hpp @@ -52,13 +52,17 @@ struct LayoutLeft { using array_layout = LayoutLeft; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - 
LayoutLeft(LayoutLeft const&) = default; - LayoutLeft(LayoutLeft&&) = default; + LayoutLeft(LayoutLeft const&) = default; + LayoutLeft(LayoutLeft&&) = default; LayoutLeft& operator=(LayoutLeft const&) = default; - LayoutLeft& operator=(LayoutLeft&&) = default; + LayoutLeft& operator=(LayoutLeft&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutLeft(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -69,7 +73,8 @@ struct LayoutLeft { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride(KOKKOS_IMPL_CTOR_DEFAULT_ARG) {} friend bool operator==(const LayoutLeft& left, const LayoutLeft& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -101,13 +106,17 @@ struct LayoutRight { using array_layout = LayoutRight; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - LayoutRight(LayoutRight const&) = default; - LayoutRight(LayoutRight&&) = default; + LayoutRight(LayoutRight const&) = default; + LayoutRight(LayoutRight&&) = default; LayoutRight& operator=(LayoutRight const&) = default; - LayoutRight& operator=(LayoutRight&&) = default; + LayoutRight& operator=(LayoutRight&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutRight(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -118,7 +127,8 @@ struct LayoutRight { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{KOKKOS_IMPL_CTOR_DEFAULT_ARG} {} friend bool operator==(const 
LayoutRight& left, const LayoutRight& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -144,10 +154,10 @@ struct LayoutStride { enum : bool { is_extent_constructible = false }; - LayoutStride(LayoutStride const&) = default; - LayoutStride(LayoutStride&&) = default; + LayoutStride(LayoutStride const&) = default; + LayoutStride(LayoutStride&&) = default; LayoutStride& operator=(LayoutStride const&) = default; - LayoutStride& operator=(LayoutStride&&) = default; + LayoutStride& operator=(LayoutStride&&) = default; /** \brief Compute strides from ordered dimensions. * @@ -191,8 +201,8 @@ struct LayoutStride { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S5 = 0, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S6 = 0, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S7 = 0) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3, - S4, S5, S6, S7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{S0, S1, S2, S3, S4, S5, S6, S7} {} friend bool operator==(const LayoutStride& left, const LayoutStride& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index 0a0acd303f..97b78a3c64 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -27,7 +27,7 @@ * KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget * execution space * KOKKOS_ENABLE_HIP Kokkos::HIP execution space - * KOKKOS_ENABLE_SYCL Kokkos::Experimental::SYCL execution space + * KOKKOS_ENABLE_SYCL Kokkos::SYCL execution space * KOKKOS_ENABLE_HWLOC HWLOC library is available. * KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive! * KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space. @@ -132,7 +132,7 @@ #define KOKKOS_CLASS_LAMBDA [ =, *this ] #endif -//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. 
+// #if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. // Intel compiler for host code. @@ -252,10 +252,10 @@ // CLANG compiler macros #if defined(KOKKOS_COMPILER_CLANG) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -273,10 +273,10 @@ // GNU Compiler macros #if defined(KOKKOS_COMPILER_GNU) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -298,7 +298,7 @@ #if defined(KOKKOS_COMPILER_NVHPC) #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #endif @@ -357,6 +357,21 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif +// FIXME_OPENACC FIXME_OPENMPTARGET +// Move to setup files once there is more content +// clang-format off +#if defined(KOKKOS_ENABLE_OPENACC) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenACC backend" +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenMPTarget backend" +#endif +// clang-format on + +#if !defined(KOKKOS_IMPL_RELOCATABLE_FUNCTION) +#define 
KOKKOS_IMPL_RELOCATABLE_FUNCTION +#endif + //---------------------------------------------------------------------------- // Define final version of functions. This is so that clang tidy can find these // macros more easily @@ -369,10 +384,14 @@ #define KOKKOS_FORCEINLINE_FUNCTION \ KOKKOS_IMPL_FORCEINLINE_FUNCTION \ __attribute__((annotate("KOKKOS_FORCEINLINE_FUNCTION"))) +#define KOKKOS_RELOCATABLE_FUNCTION \ + KOKKOS_IMPL_RELOCATABLE_FUNCTION \ + __attribute__((annotate("KOKKOS_RELOCATABLE_FUNCTION"))) #else #define KOKKOS_FUNCTION KOKKOS_IMPL_FUNCTION #define KOKKOS_INLINE_FUNCTION KOKKOS_IMPL_INLINE_FUNCTION #define KOKKOS_FORCEINLINE_FUNCTION KOKKOS_IMPL_FORCEINLINE_FUNCTION +#define KOKKOS_RELOCATABLE_FUNCTION KOKKOS_IMPL_RELOCATABLE_FUNCTION #endif //---------------------------------------------------------------------------- @@ -537,14 +556,17 @@ static constexpr bool kokkos_omp_on_host() { return false; } // If compiling with CUDA, we must use relocatable device code to enable the // task policy. 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #if defined(KOKKOS_ENABLE_CUDA) #if defined(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) #define KOKKOS_ENABLE_TASKDAG #endif // FIXME_SYCL Tasks not implemented -#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) +#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOS_ENABLE_TASKDAG #endif +#endif #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC @@ -582,9 +604,11 @@ static constexpr bool kokkos_omp_on_host() { return false; } // clang-format off #if defined(__NVCOMPILER) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ - _Pragma("diag_suppress 1216") + _Pragma("diag_suppress 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ - _Pragma("diag_default 1216") + _Pragma("diag_default 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") #elif defined(__EDG__) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ _Pragma("warning push") \ @@ -607,6 +631,18 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif + +#if defined(__NVCOMPILER) +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() \ + _Pragma("diag_suppress code_is_unreachable") \ + _Pragma("diag_suppress initialization_not_reachable") +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() \ + _Pragma("diag_default code_is_unreachable") \ + _Pragma("diag_default initialization_not_reachable") +#else +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() +#endif // clang-format on #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp 
index ce8c9e152f..f7e9e2a78c 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -196,9 +196,10 @@ class MemoryPool { stats.consumed_superblocks++; stats.consumed_blocks += block_used; - stats.consumed_bytes += block_used * block_size; + stats.consumed_bytes += static_cast(block_used) * block_size; stats.reserved_blocks += block_count - block_used; - stats.reserved_bytes += (block_count - block_used) * block_size; + stats.reserved_bytes += + static_cast(block_count - block_used) * block_size; } } @@ -234,9 +235,9 @@ class MemoryPool { //-------------------------------------------------------------------------- - KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default; KOKKOS_INLINE_FUNCTION MemoryPool() diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp index 118bf52c05..1304d3ba92 100644 --- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp +++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -114,7 +114,7 @@ template <> struct signaling_NaN_helper { static constexpr long dou #endif template struct digits_helper {}; template <> struct digits_helper { static constexpr int value = 1; }; -template <> struct digits_helper { static constexpr int value = CHAR_BIT - std::is_signed::value; }; +template <> struct digits_helper { static constexpr int value = CHAR_BIT - std::is_signed_v; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT - 1; }; template <> struct digits_helper { static constexpr int value = 
CHAR_BIT; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT*sizeof(short)-1; }; diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp index e569fefc14..c44d1f2310 100644 --- a/lib/kokkos/core/src/Kokkos_Pair.hpp +++ b/lib/kokkos/core/src/Kokkos_Pair.hpp @@ -449,7 +449,8 @@ struct KOKKOS_DEPRECATED pair { // Specialization of relational operators for Kokkos::pair. // -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #endif template @@ -487,7 +488,8 @@ KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif #endif diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp index 122239df79..24349e95ae 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp @@ -72,19 +72,19 @@ struct FunctorPolicyExecutionSpace { static_assert( !is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A policy with an execution space and a functor with an execution space " "are given but the execution space types do not match!"); static_assert(!is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A policy with an execution space and a functor with a device " "type are given but the execution space types do not match!"); static_assert(!is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A functor with both an execution space and device type is " "given but their execution 
space types do not match!"); @@ -134,8 +134,10 @@ inline void parallel_for(const std::string& str, const ExecPolicy& policy, const FunctorType& functor) { uint64_t kpID = 0; - ExecPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID); + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_for(policy, functor, str, kpID); + const auto& inner_policy = response.policy; auto closure = Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< @@ -348,9 +350,11 @@ template ::value>> inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, const FunctorType& functor) { - uint64_t kpID = 0; - ExecutionPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID); + uint64_t kpID = 0; + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_scan(policy, functor, str, kpID); + const auto& inner_policy = response.policy; auto closure = Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index 53913266f1..3b89d184f2 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -73,7 +73,7 @@ struct Sum { template KOKKOS_DEDUCTION_GUIDE Sum(View const&) - ->Sum::memory_space>; + -> Sum::memory_space>; template struct Prod { @@ -118,7 +118,7 @@ struct Prod { template KOKKOS_DEDUCTION_GUIDE Prod(View const&) - ->Prod::memory_space>; + -> Prod::memory_space>; template struct Min { @@ -165,7 +165,7 @@ struct Min { template KOKKOS_DEDUCTION_GUIDE Min(View const&) - ->Min::memory_space>; + -> Min::memory_space>; template struct Max { @@ -213,7 +213,7 @@ struct Max { template KOKKOS_DEDUCTION_GUIDE Max(View const&) - ->Max::memory_space>; + -> 
Max::memory_space>; template struct LAnd { @@ -259,7 +259,7 @@ struct LAnd { template KOKKOS_DEDUCTION_GUIDE LAnd(View const&) - ->LAnd::memory_space>; + -> LAnd::memory_space>; template struct LOr { @@ -306,7 +306,7 @@ struct LOr { template KOKKOS_DEDUCTION_GUIDE LOr(View const&) - ->LOr::memory_space>; + -> LOr::memory_space>; template struct BAnd { @@ -353,7 +353,7 @@ struct BAnd { template KOKKOS_DEDUCTION_GUIDE BAnd(View const&) - ->BAnd::memory_space>; + -> BAnd::memory_space>; template struct BOr { @@ -400,7 +400,7 @@ struct BOr { template KOKKOS_DEDUCTION_GUIDE BOr(View const&) - ->BOr::memory_space>; + -> BOr::memory_space>; template struct ValLocScalar { @@ -438,7 +438,12 @@ struct MinLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -458,11 +463,10 @@ struct MinLoc { }; template -KOKKOS_DEDUCTION_GUIDE MinLoc( - View, Properties...> const&) - ->MinLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MinLoc(View, Properties...> const&) -> MinLoc< + Scalar, Index, + typename View, Properties...>::memory_space>; template struct MaxLoc { @@ -494,7 +498,12 @@ struct MaxLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -514,11 +523,10 @@ struct MaxLoc { }; template -KOKKOS_DEDUCTION_GUIDE MaxLoc( - View, Properties...> const&) - ->MaxLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MaxLoc(View, Properties...> const&) -> MaxLoc< + Scalar, Index, + typename View, Properties...>::memory_space>; template struct MinMaxScalar { @@ 
-580,8 +588,8 @@ struct MinMax { template KOKKOS_DEDUCTION_GUIDE MinMax(View, Properties...> const&) - ->MinMax, Properties...>::memory_space>; + -> MinMax, Properties...>::memory_space>; template struct MinMaxLocScalar { @@ -622,10 +630,16 @@ struct MinMaxLoc { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity::min()) { + dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity::min()) { + dest.max_loc = src.max_loc; } } @@ -650,9 +664,9 @@ struct MinMaxLoc { template KOKKOS_DEDUCTION_GUIDE MinMaxLoc( View, Properties...> const&) - ->MinMaxLoc, - Properties...>::memory_space>; + -> MinMaxLoc, + Properties...>::memory_space>; // -------------------------------------------------- // reducers added to support std algorithms @@ -718,9 +732,9 @@ struct MaxFirstLoc { template KOKKOS_DEDUCTION_GUIDE MaxFirstLoc( View, Properties...> const&) - ->MaxFirstLoc, - Properties...>::memory_space>; + -> MaxFirstLoc, + Properties...>::memory_space>; // // MaxFirstLocCustomComparator @@ -788,9 +802,9 @@ template KOKKOS_DEDUCTION_GUIDE MaxFirstLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MaxFirstLocCustomComparator, - Properties...>::memory_space>; + -> MaxFirstLocCustomComparator, + Properties...>::memory_space>; // // MinFirstLoc @@ -852,9 +866,9 @@ struct MinFirstLoc { template KOKKOS_DEDUCTION_GUIDE MinFirstLoc( View, Properties...> const&) - ->MinFirstLoc, - Properties...>::memory_space>; + -> MinFirstLoc, + Properties...>::memory_space>; // // MinFirstLocCustomComparator @@ -922,9 +936,9 @@ template KOKKOS_DEDUCTION_GUIDE MinFirstLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MinFirstLocCustomComparator, - Properties...>::memory_space>; + -> MinFirstLocCustomComparator, 
+ Properties...>::memory_space>; // // MinMaxFirstLastLoc @@ -997,9 +1011,9 @@ struct MinMaxFirstLastLoc { template KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLoc( View, Properties...> const&) - ->MinMaxFirstLastLoc, - Properties...>::memory_space>; + -> MinMaxFirstLastLoc, + Properties...>::memory_space>; // // MinMaxFirstLastLocCustomComparator @@ -1077,7 +1091,7 @@ template KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MinMaxFirstLastLocCustomComparator< + -> MinMaxFirstLastLocCustomComparator< Scalar, Index, ComparatorType, typename View, Properties...>::memory_space>; @@ -1139,10 +1153,9 @@ struct FirstLoc { }; template -KOKKOS_DEDUCTION_GUIDE FirstLoc( - View, Properties...> const&) - ->FirstLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +FirstLoc(View, Properties...> const&) -> FirstLoc< + Index, typename View, Properties...>::memory_space>; // // LastLoc @@ -1202,8 +1215,8 @@ struct LastLoc { template KOKKOS_DEDUCTION_GUIDE LastLoc(View, Properties...> const&) - ->LastLoc, Properties...>::memory_space>; + -> LastLoc, + Properties...>::memory_space>; template struct StdIsPartScalar { @@ -1270,8 +1283,8 @@ struct StdIsPartitioned { template KOKKOS_DEDUCTION_GUIDE StdIsPartitioned( View, Properties...> const&) - ->StdIsPartitioned, - Properties...>::memory_space>; + -> StdIsPartitioned, + Properties...>::memory_space>; template struct StdPartPointScalar { @@ -1333,8 +1346,8 @@ struct StdPartitionPoint { template KOKKOS_DEDUCTION_GUIDE StdPartitionPoint( View, Properties...> const&) - ->StdPartitionPoint, - Properties...>::memory_space>; + -> StdPartitionPoint, + Properties...>::memory_space>; } // namespace Kokkos namespace Kokkos { @@ -1404,9 +1417,9 @@ struct ParallelReduceReturnValue< template struct ParallelReduceReturnValue< std::enable_if_t::value && - (!std::is_array::value && - !std::is_pointer::value) && - !Kokkos::is_reducer::value>, + (!std::is_array_v && + !std::is_pointer_v< + 
ReturnType>)&&!Kokkos::is_reducer::value>, ReturnType, FunctorType> { using return_type = Kokkos::View; @@ -1422,8 +1435,8 @@ struct ParallelReduceReturnValue< template struct ParallelReduceReturnValue< - std::enable_if_t<(std::is_array::value || - std::is_pointer::value)>, + std::enable_if_t<(std::is_array_v || + std::is_pointer_v)>, ReturnType, FunctorType> { using return_type = Kokkos::View, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; @@ -1434,7 +1447,7 @@ struct ParallelReduceReturnValue< static return_type return_value(ReturnType& return_val, const FunctorType& functor) { - if (std::is_array::value) + if (std::is_array_v) return return_type(return_val); else return return_type(return_val, functor.value_count); @@ -1467,8 +1480,7 @@ struct ParallelReducePolicyType< template struct ParallelReducePolicyType< - std::enable_if_t::value>, PolicyType, - FunctorType> { + std::enable_if_t>, PolicyType, FunctorType> { using execution_space = typename Impl::FunctorPolicyExecutionSpace::execution_space; @@ -1501,27 +1513,28 @@ struct ParallelReduceAdaptor { using PassedReducerType = typename return_value_adapter::reducer_type; uint64_t kpID = 0; - PolicyType inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_reduce( - inner_policy, functor, label, kpID); - using ReducerSelector = - Kokkos::Impl::if_c::value, + Kokkos::Impl::if_c, FunctorType, PassedReducerType>; using Analysis = FunctorAnalysis; - using CombinedFunctorReducerType = CombinedFunctorReducer; + + CombinedFunctorReducerType functor_reducer( + functor, typename Analysis::Reducer( + ReducerSelector::select(functor, return_value))); + const auto& response = Kokkos::Tools::Impl::begin_parallel_reduce< + typename return_value_adapter::reducer_type>(policy, functor_reducer, + label, kpID); + const auto& inner_policy = response.policy; + auto closure = construct_with_shared_allocation_tracking_disabled< Impl::ParallelReduce::execution_space>>( - CombinedFunctorReducerType( - functor, typename 
Analysis::Reducer( - ReducerSelector::select(functor, return_value))), - inner_policy, + functor_reducer, inner_policy, return_value_adapter::return_value(return_value, functor)); closure.execute(); @@ -1536,7 +1549,7 @@ struct ParallelReduceAdaptor { template static inline std::enable_if_t::value)> + std::is_pointer_v)> execute(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { execute_impl(label, policy, functor, return_value); @@ -1568,7 +1581,7 @@ struct ReducerHasTestReferenceFunction { static std::false_type test_func(...); enum { - value = std::is_same(nullptr))>::value + value = std::is_same_v(nullptr))> }; }; @@ -1611,7 +1624,7 @@ struct ParallelReduceFence { template static void fence(const ExecutionSpace& ex, const std::string& name, ArgsDeduced&&... args) { - if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced &&) args...)) { + if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced&&)args...)) { ex.fence(name); } } @@ -1663,11 +1676,11 @@ template inline std::enable_if_t::value && !(Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1684,11 +1697,11 @@ template inline std::enable_if_t::value && !(Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1704,11 +1717,11 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template inline 
std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1728,11 +1741,11 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1754,7 +1767,7 @@ template inline std::enable_if_t::value && (Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1771,7 +1784,7 @@ template inline std::enable_if_t::value && (Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1787,7 +1800,7 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value> + std::is_pointer_v> parallel_reduce(const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = @@ -1806,7 +1819,7 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || 
- std::is_pointer::value> + std::is_pointer_v> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = diff --git a/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp index e7a9ba0c7e..1759c2b4a1 100644 --- a/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -32,7 +32,7 @@ class [[nodiscard]] ProfilingSection { uint32_t sectionID; public: - ProfilingSection(ProfilingSection const&) = delete; + ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 diff --git a/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp b/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp index f45dfa324e..a4168b9401 100644 --- a/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp +++ b/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp @@ -30,7 +30,7 @@ namespace Kokkos::Profiling { class [[nodiscard]] ScopedRegion { public: - ScopedRegion(ScopedRegion const &) = delete; + ScopedRegion(ScopedRegion const &) = delete; ScopedRegion &operator=(ScopedRegion const &) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp index a925e32a33..f00e25fdb6 100644 --- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp @@ -110,7 +110,7 @@ class ScratchMemorySpace { // Note: for team scratch m_offset is 0, since every // thread will get back the same shared pointer void* tmp = m_iter + m_offset * size; - uintptr_t increment = size * m_multiplier; + uintptr_t increment = static_cast(size) * m_multiplier; // Cast to uintptr_t to avoid problems with pointer 
arithmetic using SYCL const auto end_iter = diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp index 869a5f8ec2..3edecb4502 100644 --- a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp +++ b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_TASKSCHEDULER_HPP #define KOKKOS_TASKSCHEDULER_HPP @@ -44,6 +50,11 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -54,7 +65,7 @@ class TaskExec; } // end namespace Impl template -class BasicTaskScheduler : public Impl::TaskSchedulerBase { +class KOKKOS_DEPRECATED BasicTaskScheduler : public Impl::TaskSchedulerBase { public: using scheduler_type = BasicTaskScheduler; using execution_space = ExecSpace; @@ -494,8 +505,8 @@ namespace Kokkos { // Construct a TaskTeam execution policy template -Impl::TaskPolicyWithPredecessor> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskTeam, Kokkos::BasicFuture> KOKKOS_INLINE_FUNCTION TaskTeam(Kokkos::BasicFuture arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -503,7 +514,8 @@ Impl::TaskPolicyWithPredecessor -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler KOKKOS_INLINE_FUNCTION TaskTeam( Scheduler arg_scheduler, std::enable_if_t::value, TaskPriority> @@ -512,18 +524,18 @@ Impl::TaskPolicyWithScheduler } template -Impl::TaskPolicyWithScheduler 
+KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskTeam, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION TaskTeam(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t::value && Kokkos::is_future::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -531,8 +543,8 @@ Impl::TaskPolicyWithScheduler -Impl::TaskPolicyWithPredecessor> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskSingle, Kokkos::BasicFuture> KOKKOS_INLINE_FUNCTION TaskSingle(Kokkos::BasicFuture arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -540,7 +552,8 @@ Impl::TaskPolicyWithPredecessor -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler KOKKOS_INLINE_FUNCTION TaskSingle( Scheduler arg_scheduler, std::enable_if_t::value, TaskPriority> @@ -549,18 +562,18 @@ Impl::TaskPolicyWithScheduler } template -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskSingle, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION TaskSingle(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t::value && Kokkos::is_future::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -575,7 +588,8 @@ Impl::TaskPolicyWithScheduler -typename 
Scheduler::template future_type_for_functor> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t> host_spawn(Impl::TaskPolicyWithScheduler arg_policy, FunctorType&& arg_functor) { @@ -606,7 +620,8 @@ host_spawn(Impl::TaskPolicyWithScheduler */ template -typename Scheduler::template future_type_for_functor> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t> KOKKOS_INLINE_FUNCTION task_spawn(Impl::TaskPolicyWithScheduler arg_policy, @@ -633,7 +648,7 @@ typename Scheduler::template future_type_for_functor> * 2) High, Normal, or Low priority */ template -void KOKKOS_INLINE_FUNCTION +KOKKOS_DEPRECATED void KOKKOS_INLINE_FUNCTION respawn(FunctorType* arg_self, T const& arg, TaskPriority const& arg_priority = TaskPriority::Regular) { static_assert(Kokkos::is_future::value || Kokkos::is_scheduler::value, @@ -656,7 +671,8 @@ respawn(FunctorType* arg_self, T const& arg, // Wait for all runnable tasks to complete template -inline void wait(BasicTaskScheduler const& scheduler) { +KOKKOS_DEPRECATED inline void wait( + BasicTaskScheduler const& scheduler) { using scheduler_type = BasicTaskScheduler; scheduler_type::specialization::execute(scheduler); // scheduler.m_queue->execute(); @@ -664,6 +680,10 @@ inline void wait(BasicTaskScheduler const& scheduler) { } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp index 203fb16eaf..83e1c06db9 100644 --- a/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp @@ -31,31 +31,40 @@ static_assert(false, #include 
//---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // Forward declarations used in Impl::TaskQueue template -class BasicFuture; +class KOKKOS_DEPRECATED BasicFuture; template -class SimpleTaskScheduler; +class KOKKOS_DEPRECATED SimpleTaskScheduler; template -class BasicTaskScheduler; +class KOKKOS_DEPRECATED BasicTaskScheduler; template -struct is_scheduler : public std::false_type {}; +struct KOKKOS_DEPRECATED is_scheduler : public std::false_type {}; template -struct is_scheduler> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler> + : public std::true_type {}; template -struct is_scheduler> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler> + : public std::true_type {}; -enum class TaskPriority : int { High = 0, Regular = 1, Low = 2 }; +enum class KOKKOS_DEPRECATED TaskPriority : int { + High = 0, + Regular = 1, + Low = 2 +}; } // namespace Kokkos @@ -141,28 +150,28 @@ using default_tasking_memory_space_for_execution_space_t = namespace Kokkos { template -using DeprecatedTaskScheduler = BasicTaskScheduler< +using DeprecatedTaskScheduler KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t>>; template -using DeprecatedTaskSchedulerMultiple = BasicTaskScheduler< +using DeprecatedTaskSchedulerMultiple KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueueMultiple< Space, Impl::default_tasking_memory_space_for_execution_space_t>>; template -using TaskScheduler = SimpleTaskScheduler< +using TaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::SingleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, Impl::TaskQueueTraitsLockBased>>; template -using TaskSchedulerMultiple = SimpleTaskScheduler< +using 
TaskSchedulerMultiple KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, @@ -172,7 +181,7 @@ using TaskSchedulerMultiple = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t>>>>; template -using ChaseLevTaskScheduler = SimpleTaskScheduler< +using ChaseLevTaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, @@ -182,7 +191,7 @@ using ChaseLevTaskScheduler = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t>>>>; template -void wait(BasicTaskScheduler const&); +KOKKOS_DEPRECATED void wait(BasicTaskScheduler const&); namespace Impl { @@ -204,6 +213,10 @@ struct TaskPolicyData; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ diff --git a/lib/kokkos/core/src/Kokkos_Timer.hpp b/lib/kokkos/core/src/Kokkos_Timer.hpp index a210b6ff18..ab31484d76 100644 --- a/lib/kokkos/core/src/Kokkos_Timer.hpp +++ b/lib/kokkos/core/src/Kokkos_Timer.hpp @@ -48,7 +48,7 @@ class Timer { inline Timer() { reset(); } - Timer(const Timer&) = delete; + Timer(const Timer&) = delete; Timer& operator=(const Timer&) = delete; inline double seconds() const { diff --git a/lib/kokkos/core/src/Kokkos_Tuners.hpp b/lib/kokkos/core/src/Kokkos_Tuners.hpp index f5ffc66af5..fcb061b378 100644 --- a/lib/kokkos/core/src/Kokkos_Tuners.hpp +++ b/lib/kokkos/core/src/Kokkos_Tuners.hpp @@ -52,6 +52,8 @@ VariableValue make_variable_value(size_t, int64_t); VariableValue make_variable_value(size_t, double); SetOrRange make_candidate_range(double lower, double upper, double step, bool openLower, bool openUpper); +SetOrRange make_candidate_range(int64_t lower, int64_t upper, int64_t step, + 
bool openLower, bool openUpper); size_t get_new_context_id(); void begin_context(size_t context_id); void end_context(size_t context_id); @@ -412,18 +414,19 @@ class TeamSizeTuner : public ExtendableTunerMixin { TunerType tuner; public: - TeamSizeTuner() = default; + TeamSizeTuner() = default; TeamSizeTuner& operator=(const TeamSizeTuner& other) = default; TeamSizeTuner(const TeamSizeTuner& other) = default; - TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; - TeamSizeTuner(TeamSizeTuner&& other) = default; + TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; + TeamSizeTuner(TeamSizeTuner&& other) = default; template TeamSizeTuner(const std::string& name, - Kokkos::TeamPolicy& policy, + const Kokkos::TeamPolicy& policy_in, const Functor& functor, const TagType& tag, ViableConfigurationCalculator calc) { - using PolicyType = Kokkos::TeamPolicy; + using PolicyType = Kokkos::TeamPolicy; + PolicyType policy(policy_in); auto initial_vector_length = policy.impl_vector_length(); if (initial_vector_length < 1) { policy.impl_set_vector_length(1); @@ -505,7 +508,8 @@ class TeamSizeTuner : public ExtendableTunerMixin { } template - void tune(Kokkos::TeamPolicy& policy) { + auto tune(const Kokkos::TeamPolicy& policy_in) { + Kokkos::TeamPolicy policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); auto team_size = std::get<1>(configuration); @@ -515,6 +519,111 @@ class TeamSizeTuner : public ExtendableTunerMixin { policy.impl_set_vector_length(vector_length); } } + return policy; + } + void end() { + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + tuner.end(); + } + } + + TunerType get_tuner() const { return tuner; } +}; +namespace Impl { +template +struct tuning_type_for; + +template <> +struct tuning_type_for { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_double; + static double get( + const 
Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.double_value; + } +}; +template <> +struct tuning_type_for { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64; + static int64_t get( + const Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.int_value; + } +}; +} // namespace Impl +template +class SingleDimensionalRangeTuner { + size_t id; + size_t context; + using tuning_util = Impl::tuning_type_for; + + Bound default_value; + + public: + SingleDimensionalRangeTuner() = default; + SingleDimensionalRangeTuner( + const std::string& name, + Kokkos::Tools::Experimental::StatisticalCategory category, + Bound default_val, Bound lower, Bound upper, Bound step = (Bound)0) { + default_value = default_val; + Kokkos::Tools::Experimental::VariableInfo info; + info.category = category; + info.candidates = make_candidate_range( + static_cast(lower), static_cast(upper), + static_cast(step), false, false); + info.valueQuantity = + Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_range; + info.type = tuning_util::value; + id = Kokkos::Tools::Experimental::declare_output_type(name, info); + } + + Bound begin() { + context = Kokkos::Tools::Experimental::get_new_context_id(); + Kokkos::Tools::Experimental::begin_context(context); + auto tuned_value = + Kokkos::Tools::Experimental::make_variable_value(id, default_value); + Kokkos::Tools::Experimental::request_output_values(context, 1, + &tuned_value); + return tuning_util::get(tuned_value); + } + + void end() { Kokkos::Tools::Experimental::end_context(context); } + + template + void with_tuned_value(Functor& func) { + func(begin()); + end(); + } +}; + +class RangePolicyOccupancyTuner { + private: + using TunerType = SingleDimensionalRangeTuner; + TunerType tuner; + + public: + RangePolicyOccupancyTuner() = default; + template + RangePolicyOccupancyTuner(const std::string& 
name, + const Kokkos::RangePolicy&, + const Functor&, const TagType&, + ViableConfigurationCalculator) + : tuner(TunerType(name, + Kokkos::Tools::Experimental::StatisticalCategory:: + kokkos_value_ratio, + 100, 5, 100, 5)) {} + + template + auto tune(const Kokkos::RangePolicy& policy_in) { + Kokkos::RangePolicy policy(policy_in); + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + auto occupancy = tuner.begin(); + policy.impl_set_desired_occupancy( + Kokkos::Experimental::DesiredOccupancy{static_cast(occupancy)}); + } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { @@ -578,11 +687,13 @@ struct MDRangeTuner : public ExtendableTunerMixin> { policy.impl_change_tile_size({std::get(tuple)...}); } template - void tune(Kokkos::MDRangePolicy& policy) { + auto tune(const Kokkos::MDRangePolicy& policy_in) { + Kokkos::MDRangePolicy policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); set_policy_tile(policy, configuration, std::make_index_sequence{}); } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { diff --git a/lib/kokkos/core/src/Kokkos_TypeInfo.hpp b/lib/kokkos/core/src/Kokkos_TypeInfo.hpp new file mode 100644 index 0000000000..e5710da2e3 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_TypeInfo.hpp @@ -0,0 +1,103 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TYPE_INFO_HPP +#define KOKKOS_TYPE_INFO_HPP + +#include +#include +#include + +#include + +// Intel C++ Compiler Classic version 2021.2.0 works but 2021.1.2 doesn't +// Both have __INTEL_COMPILER defined to 2021 so using +// __INTEL_COMPILER_BUILD_DATE to discriminate. +// Experimenting on the compiler explorer gave +// icc version | __INTEL_COMPILER | __INTEL_COMPILER_BUILD_DATE +// 2021.1.2 | 2021 | 20201208 +// 2021.2.0 | 2021 | 20210228 +// NVCC versions less than 11.3.0 segfault when that header is included +// NVCC+MSVC doesn't work at all - it simply reports "T" inside type_name +#if (!defined(KOKKOS_COMPILER_INTEL) || \ + (__INTEL_COMPILER_BUILD_DATE >= 20210228)) && \ + (!defined(KOKKOS_COMPILER_NVCC) || (KOKKOS_COMPILER_NVCC >= 1130)) && \ + (!(defined(KOKKOS_COMPILER_NVCC) && defined(KOKKOS_COMPILER_MSVC))) + +#define KOKKOS_ENABLE_IMPL_TYPEINFO + +namespace Kokkos::Impl { + +template +constexpr std::array to_array(std::string_view src) { + std::array dst{}; + for (size_t i = 0; i < N; ++i) { + dst[i] = src[i]; + } + return dst; +} + +template +constexpr auto type_name() { +#if defined(__clang__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(__GNUC__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[with T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(_MSC_VER) + constexpr std::string_view func = __FUNCSIG__; + constexpr std::string_view prefix{"type_name<"}; + constexpr std::string_view suffix{">(void)"}; +#else +#error bug +#endif + constexpr auto beg = func.find(prefix) + prefix.size(); + constexpr auto end = func.rfind(suffix); + static_assert(beg != std::string_view::npos); + static_assert(end != std::string_view::npos); + return to_array(func.substr(beg, end)); +} + +template +class 
TypeInfo { + static constexpr auto value_ = type_name(); + + public: + static constexpr std::string_view name() noexcept { + return {value_.data(), value_.size()}; + } +}; + +} // namespace Kokkos::Impl + +#else // out of luck, using Intel C++ Compiler Classic + +namespace Kokkos::Impl { + +template +class TypeInfo { + public: + static constexpr std::string_view name() noexcept { return "not supported"; } +}; + +} // namespace Kokkos::Impl + +#endif + +#endif diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index 04d1fcf151..d5b352876c 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -22,2016 +22,10 @@ static_assert(false, #ifndef KOKKOS_VIEW_HPP #define KOKKOS_VIEW_HPP -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -#include -#include -#include -#endif -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct ViewArrayAnalysis; - -template ::non_const_value_type> -struct ViewDataAnalysis; - -template -class ViewMapping { - public: - enum : bool { is_assignable_data_type = false }; - enum : bool { is_assignable = false }; -}; - -template -constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( - const IntType i0, const IntType i1, const IntType i2, const IntType i3, - const IntType i4, const IntType i5, const IntType i6, const IntType i7) { - static_assert(std::is_integral::value, - "count_valid_integers() must have integer arguments."); - - return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + - (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + - (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + - (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); -} - 
-// FIXME Ideally, we would not instantiate this function for every possible View -// type. We should be able to only pass "extent" when we use mdspan. -template -KOKKOS_INLINE_FUNCTION void runtime_check_rank( - const View&, const bool is_void_spec, const size_t i0, const size_t i1, - const size_t i2, const size_t i3, const size_t i4, const size_t i5, - const size_t i6, const size_t i7, const char* label) { - (void)(label); - - if (is_void_spec) { - const size_t num_passed_args = - count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); - // We either allow to pass as many extents as the dynamic rank is, or - // as many extents as the total rank is. In the latter case, the given - // extents for the static dimensions must match the - // compile-time extents. - constexpr int rank = View::rank(); - constexpr int dyn_rank = View::rank_dynamic(); - const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; - const bool n_args_is_rank = num_passed_args == rank; - - if constexpr (rank != dyn_rank) { - if (n_args_is_rank) { - size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; - for (int i = dyn_rank; i < rank; ++i) - if (new_extents[i] != View::static_extent(i)) { - KOKKOS_IF_ON_HOST( - const std::string message = - "The specified run-time extent for Kokkos::View '" + - std::string(label) + - "' does not match the compile-time extent in dimension " + - std::to_string(i) + ". The given extent is " + - std::to_string(new_extents[i]) + " but should be " + - std::to_string(View::static_extent(i)) + ".\n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "The specified run-time extents for a Kokkos::View " - "do not match the compile-time extents.");) - } - } - } - - if (!n_args_is_dyn_rank && !n_args_is_rank) { - KOKKOS_IF_ON_HOST( - const std::string message = - "Constructor for Kokkos::View '" + std::string(label) + - "' has mismatched number of arguments. 
The number " - "of arguments = " + - std::to_string(num_passed_args) + - " neither matches the dynamic rank = " + - std::to_string(dyn_rank) + - " nor the total rank = " + std::to_string(rank) + "\n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " - "mismatched number of arguments.");) - } - } -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -// Class to provide a uniform type -namespace Kokkos { -namespace Impl { -template -struct ViewUniformType; -} -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \class ViewTraits - * \brief Traits class for accessing attributes of a View. - * - * This is an implementation detail of View. It is only of interest - * to developers implementing a new specialization of View. - * - * Template argument options: - * - View< DataType > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , ArrayLayout > - * - View< DataType , ArrayLayout , Space > - * - View< DataType , ArrayLayout , MemoryTraits > - * - View< DataType , ArrayLayout , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - */ - -template -struct ViewTraits; - -template <> -struct ViewTraits { - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = void; - using specialize = void; - using hooks_policy = void; -}; - -template -struct ViewTraits { - // Ignore an extraneous 'void' - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = typename ViewTraits::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = 
typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits< - std::enable_if_t::value>, - HooksPolicy, Prop...> { - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = typename ViewTraits::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = HooksPolicy; -}; - -template -struct ViewTraits::value>, - ArrayLayout, Prop...> { - // Specify layout, keep subsequent space and memory traits arguments - - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = ArrayLayout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits::value>, Space, - Prop...> { - // Specify Space, memory traits should be the only subsequent argument. 
- - static_assert( - std::is_same::execution_space, - void>::value && - std::is_same::memory_space, - void>::value && - std::is_same::HostMirrorSpace, - void>::value && - std::is_same::array_layout, - void>::value, - "Only one View Execution or Memory Space template argument"); - - using execution_space = typename Space::execution_space; - using memory_space = typename Space::memory_space; - using HostMirrorSpace = - typename Kokkos::Impl::HostMirror::Space::memory_space; - using array_layout = typename execution_space::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits< - std::enable_if_t::value>, - MemoryTraits, Prop...> { - // Specify memory trait, should not be any subsequent arguments - - static_assert( - std::is_same::execution_space, - void>::value && - std::is_same::memory_space, - void>::value && - std::is_same::array_layout, - void>::value && - std::is_same::memory_traits, - void>::value && - std::is_same::hooks_policy, - void>::value, - "MemoryTrait is the final optional template argument for a View"); - - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = MemoryTraits; - using specialize = void; - using hooks_policy = void; -}; - -template -struct ViewTraits { - private: - // Unpack the properties arguments - using prop = ViewTraits; - - using ExecutionSpace = - std::conditional_t::value, - typename prop::execution_space, - Kokkos::DefaultExecutionSpace>; - - using MemorySpace = - std::conditional_t::value, - typename prop::memory_space, - typename ExecutionSpace::memory_space>; - - using ArrayLayout = - std::conditional_t::value, - typename prop::array_layout, - typename ExecutionSpace::array_layout>; - - using HostMirrorSpace = std::conditional_t< - !std::is_void::value, - typename 
prop::HostMirrorSpace, - typename Kokkos::Impl::HostMirror::Space>; - - using MemoryTraits = - std::conditional_t::value, - typename prop::memory_traits, - typename Kokkos::MemoryManaged>; - - using HooksPolicy = - std::conditional_t::value, - typename prop::hooks_policy, - Kokkos::Experimental::DefaultViewHooks>; - - // Analyze data type's properties, - // May be specialized based upon the layout and value type - using data_analysis = Kokkos::Impl::ViewDataAnalysis; - - public: - //------------------------------------ - // Data type traits: - - using data_type = typename data_analysis::type; - using const_data_type = typename data_analysis::const_type; - using non_const_data_type = typename data_analysis::non_const_type; - - //------------------------------------ - // Compatible array of trivial type traits: - - using scalar_array_type = typename data_analysis::scalar_array_type; - using const_scalar_array_type = - typename data_analysis::const_scalar_array_type; - using non_const_scalar_array_type = - typename data_analysis::non_const_scalar_array_type; - - //------------------------------------ - // Value type traits: - - using value_type = typename data_analysis::value_type; - using const_value_type = typename data_analysis::const_value_type; - using non_const_value_type = typename data_analysis::non_const_value_type; - - //------------------------------------ - // Mapping traits: - - using array_layout = ArrayLayout; - using dimension = typename data_analysis::dimension; - - using specialize = std::conditional_t< - std::is_void::value, - typename prop::specialize, - typename data_analysis::specialize>; /* mapping specialization tag */ - - static constexpr unsigned rank = dimension::rank; - static constexpr unsigned rank_dynamic = dimension::rank_dynamic; - - //------------------------------------ - // Execution space, memory space, memory access traits, and host mirror space. 
- - using execution_space = ExecutionSpace; - using memory_space = MemorySpace; - using device_type = Kokkos::Device; - using memory_traits = MemoryTraits; - using host_mirror_space = HostMirrorSpace; - using hooks_policy = HooksPolicy; - - using size_type = typename MemorySpace::size_type; - - enum { is_hostspace = std::is_same::value }; - enum { is_managed = MemoryTraits::is_unmanaged == 0 }; - enum { is_random_access = MemoryTraits::is_random_access == 1 }; - - //------------------------------------ -}; - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -namespace Impl { -struct UnsupportedKokkosArrayLayout; - -template -struct MDSpanViewTraits { - using mdspan_type = UnsupportedKokkosArrayLayout; -}; - -// "Natural" mdspan for a view if the View's ArrayLayout is supported. -template -struct MDSpanViewTraits::type>> { - using index_type = std::size_t; - using extents_type = - typename Impl::ExtentsFromDataType::type; - using mdspan_layout_type = - typename Impl::LayoutFromArrayLayout::type; - using accessor_type = Impl::SpaceAwareAccessor< - typename Traits::memory_space, - Kokkos::default_accessor>; - using mdspan_type = mdspan; -}; -} // namespace Impl -#endif // KOKKOS_ENABLE_IMPL_MDSPAN - -/** \class View - * \brief View to an array of data. - * - * A View represents an array of one or more dimensions. - * For details, please refer to Kokkos' tutorial materials. - * - * \section Kokkos_View_TemplateParameters Template parameters - * - * This class has both required and optional template parameters. The - * \c DataType parameter must always be provided, and must always be - * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are - * placeholders for different template parameters. The default value - * of the fifth template parameter \c Specialize suffices for most use - * cases. 
When explaining the template parameters, we won't refer to - * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer - * to the valid categories of template parameters, in whatever order - * they may occur. - * - * Valid ways in which template arguments may be specified: - * - View< DataType > - * - View< DataType , Layout > - * - View< DataType , Layout , Space > - * - View< DataType , Layout , Space , MemoryTraits > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - * - * \tparam DataType (required) This indicates both the type of each - * entry of the array, and the combination of compile-time and - * run-time array dimension(s). For example, double* - * indicates a one-dimensional array of \c double with run-time - * dimension, and int*[3] a two-dimensional array of \c int - * with run-time first dimension and compile-time second dimension - * (of 3). In general, the run-time dimensions (if any) must go - * first, followed by zero or more compile-time dimensions. For - * more examples, please refer to the tutorial materials. - * - * \tparam Space (required) The memory space. - * - * \tparam Layout (optional) The array's layout in memory. For - * example, LayoutLeft indicates a column-major (Fortran style) - * layout, and LayoutRight a row-major (C style) layout. If not - * specified, this defaults to the preferred layout for the - * Space. - * - * \tparam MemoryTraits (optional) Assertion of the user's intended - * access behavior. For example, RandomAccess indicates read-only - * access with limited spatial locality, and Unmanaged lets users - * wrap externally allocated memory in a View without automatic - * deallocation. - * - * \section Kokkos_View_MT MemoryTraits discussion - * - * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on - * Space - * - * Some \c MemoryTraits options may have different interpretations for - * different \c Space types. 
For example, with the Cuda device, - * \c RandomAccess tells Kokkos to fetch the data through the texture - * cache, whereas the non-GPU devices have no such hardware construct. - * - * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits - * - * Users should defer applying the optional \c MemoryTraits parameter - * until the point at which they actually plan to rely on it in a - * computational kernel. This minimizes the number of template - * parameters exposed in their code, which reduces the cost of - * compilation. Users may always assign a View without specified - * \c MemoryTraits to a compatible View with that specification. - * For example: - * \code - * // Pass in the simplest types of View possible. - * void - * doSomething (View out, - * View in) - * { - * // Assign the "generic" View in to a RandomAccess View in_rr. - * // Note that RandomAccess View objects must have const data. - * View in_rr = in; - * // ... do something with in_rr and out ... - * } - * \endcode - */ - -} // namespace Kokkos - -namespace Kokkos { - -template -struct is_always_assignable_impl; - -template -struct is_always_assignable_impl, - Kokkos::View> { - using mapping_type = Kokkos::Impl::ViewMapping< - typename Kokkos::View::traits, - typename Kokkos::View::traits, - typename Kokkos::View::traits::specialize>; - - constexpr static bool value = - mapping_type::is_assignable && - static_cast(Kokkos::View::rank_dynamic) >= - static_cast(Kokkos::View::rank_dynamic); -}; - -template -using is_always_assignable = is_always_assignable_impl< - std::remove_reference_t, - std::remove_const_t>>; - -template -inline constexpr bool is_always_assignable_v = - is_always_assignable::value; - -template -constexpr bool is_assignable(const Kokkos::View& dst, - const Kokkos::View& src) { - using DstTraits = typename Kokkos::View::traits; - using SrcTraits = typename Kokkos::View::traits; - using mapping_type = - Kokkos::Impl::ViewMapping; - - return is_always_assignable_v, - 
Kokkos::View> || - (mapping_type::is_assignable && - ((DstTraits::dimension::rank_dynamic >= 1) || - (dst.static_extent(0) == src.extent(0))) && - ((DstTraits::dimension::rank_dynamic >= 2) || - (dst.static_extent(1) == src.extent(1))) && - ((DstTraits::dimension::rank_dynamic >= 3) || - (dst.static_extent(2) == src.extent(2))) && - ((DstTraits::dimension::rank_dynamic >= 4) || - (dst.static_extent(3) == src.extent(3))) && - ((DstTraits::dimension::rank_dynamic >= 5) || - (dst.static_extent(4) == src.extent(4))) && - ((DstTraits::dimension::rank_dynamic >= 6) || - (dst.static_extent(5) == src.extent(5))) && - ((DstTraits::dimension::rank_dynamic >= 7) || - (dst.static_extent(6) == src.extent(6))) && - ((DstTraits::dimension::rank_dynamic >= 8) || - (dst.static_extent(7) == src.extent(7)))); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with -// the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp declare target +#if defined(KOKKOS_ENABLE_IMPL_MDSPAN) && !defined(KOKKOS_COMPILER_INTEL) +#include #endif -inline constexpr Kokkos::ALL_t ALL{}; +#include -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp end declare target -#endif - -inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; - -inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; - -inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; - -/** \brief Create View allocation parameter bundle from argument list. 
- * - * Valid argument list members are: - * 1) label as a "string" or std::string - * 2) memory space instance of the View::memory_space type - * 3) execution space instance compatible with the View::memory_space - * 4) Kokkos::WithoutInitializing to bypass initialization - * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory - * alignment - */ -template -inline Impl::ViewCtorProp::type...> -view_alloc(Args const&... args) { - using return_type = - Impl::ViewCtorProp::type...>; - - static_assert(!return_type::has_pointer, - "Cannot give pointer-to-memory for view allocation"); - - return return_type(args...); -} - -template -KOKKOS_INLINE_FUNCTION - Impl::ViewCtorProp::type...> - view_wrap(Args const&... args) { - using return_type = - Impl::ViewCtorProp::type...>; - - static_assert(!return_type::has_memory_space && - !return_type::has_execution_space && - !return_type::has_label && return_type::has_pointer, - "Must only give pointer-to-memory for view wrapping"); - - return return_type(args...); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -class View; - -template -struct is_view : public std::false_type {}; - -template -struct is_view> : public std::true_type {}; - -template -struct is_view> : public std::true_type {}; - -template -inline constexpr bool is_view_v = is_view::value; - -template -class View : public ViewTraits { - private: - template - friend class View; - template - friend class Kokkos::Impl::ViewMapping; - - using view_tracker_type = Kokkos::Impl::ViewTracker; - - public: - using traits = ViewTraits; - - private: - using map_type = - Kokkos::Impl::ViewMapping; - template - friend struct Kokkos::Impl::ViewTracker; - using hooks_policy = typename traits::hooks_policy; - - view_tracker_type m_track; - map_type m_map; - - public: - 
//---------------------------------------- - /** \brief Compatible view of array of scalar types */ - using array_type = - View; - - /** \brief Compatible view of const data type */ - using const_type = - View; - - /** \brief Compatible view of non-const data type */ - using non_const_type = - View; - - /** \brief Compatible HostMirror view */ - using HostMirror = - View, - typename traits::hooks_policy>; - - /** \brief Compatible HostMirror view */ - using host_mirror_type = - View; - - /** \brief Unified types */ - using uniform_type = typename Impl::ViewUniformType::type; - using uniform_const_type = - typename Impl::ViewUniformType::const_type; - using uniform_runtime_type = - typename Impl::ViewUniformType::runtime_type; - using uniform_runtime_const_type = - typename Impl::ViewUniformType::runtime_const_type; - using uniform_nomemspace_type = - typename Impl::ViewUniformType::nomemspace_type; - using uniform_const_nomemspace_type = - typename Impl::ViewUniformType::const_nomemspace_type; - using uniform_runtime_nomemspace_type = - typename Impl::ViewUniformType::runtime_nomemspace_type; - using uniform_runtime_const_nomemspace_type = - typename Impl::ViewUniformType::runtime_const_nomemspace_type; - - //---------------------------------------- - // Domain rank and extents - - static constexpr Impl::integral_constant - rank = {}; - static constexpr Impl::integral_constant - rank_dynamic = {}; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = - map_type::Rank}; -#endif - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - extent(const iType& r) const noexcept { - return m_map.extent(r); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return map_type::static_extent(r); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> - extent_int(const iType& r) const 
noexcept { - return static_cast(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() - const { - return m_map.layout(); - } - - //---------------------------------------- - /* Deprecate all 'dimension' functions in favor of - * ISO/C++ vocabulary 'extent'. - */ - - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); - } - - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping& - impl_map() const { - return m_map; - } - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::SharedAllocationTracker& impl_track() const { - return m_track.m_tracker; - } - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same::value; - - static constexpr bool is_layout_right = - std::is_same::value; - - static constexpr bool is_layout_stride = - std::is_same::value; - - static constexpr bool is_default_map = - std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); \ - Kokkos::Impl::view_verify_operator_bounds( \ - __VA_ARGS__); - -#else - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); - -#endif - - template - static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) 
{ - static_assert(rank <= sizeof...(Is)); - static_assert(sizeof...(Is) <= 8); - static_assert(Kokkos::Impl::are_integral::value); - } - - template - static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { - static_assert(rank == sizeof...(Is)); - static_assert(Kokkos::Impl::are_integral::value); - } - - public: - //------------------------------ - // Rank 1 default map operator() - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 1 operator[] - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.reference(i0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && - is_default_map && !is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && - is_default_map && is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return 
m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 default map operator() - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 == rank) && is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - if constexpr (is_layout_left) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - else - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } else if constexpr (is_layout_right) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - else - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined KOKKOS_COMPILER_INTEL - __builtin_unreachable(); -#endif - } - - // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which - // have "inlined" versions above - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) - return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - ((0 == rank) || !is_default_map)), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) 
- return m_map.reference(indices...); - } - - //------------------------------ - // Rank 0 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> - access(Is... extra) const { - check_access_member_function_valid_args(extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) - return m_map.reference(); - } - - //------------------------------ - // Rank 1 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && !is_default_map), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.reference(i0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (2 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) 
- return m_map.reference(i0, i1); - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == rank) && - is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - if constexpr (is_layout_left) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - else - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } else if constexpr (is_layout_right) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - else - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined KOKKOS_COMPILER_INTEL - __builtin_unreachable(); -#endif - } - - //------------------------------ - // Rank 3 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) 
- return m_map.reference(i0, i1, i2); - } - - //------------------------------ - // Rank 4 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == rank) && - is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == rank) && - !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.reference(i0, i1, i2, i3); - } - - //------------------------------ - // Rank 5 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (5 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (5 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) 
- return m_map.reference(i0, i1, i2, i3, i4); - } - - //------------------------------ - // Rank 6 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (6 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (6 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - //------------------------------ - // Rank 7 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (7 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (7 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) 
- return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - //------------------------------ - // Rank 8 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) - return m_map - .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) 
- return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); - } - -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - - //---------------------------------------- - // Standard destructor, constructors, and assignment operators - - KOKKOS_DEFAULTED_FUNCTION - ~View() = default; - - KOKKOS_DEFAULTED_FUNCTION - View() = default; - - KOKKOS_FUNCTION - View(const View& other) : m_track(other.m_track), m_map(other.m_map) { - KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View(View&& other) - : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { - KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View& operator=(const View& other) { - m_map = other.m_map; - m_track = other.m_track; - - KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) - - return *this; - } - - KOKKOS_FUNCTION - View& operator=(View&& other) { - m_map = std::move(other.m_map); - m_track = std::move(other.m_track); - - KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) - - return *this; - } - - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. 
- - template - KOKKOS_INLINE_FUNCTION View( - const View& rhs, - std::enable_if_t::traits, - typename traits::specialize>::is_assignable_data_type>* = nullptr) - : m_track(rhs), m_map() { - using SrcTraits = typename View::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - } - - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - Kokkos::Impl::ViewMapping< - traits, typename View::traits, - typename traits::specialize>::is_assignable_data_type, - View>& - operator=(const View& rhs) { - using SrcTraits = typename View::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - m_track.assign(rhs); - return *this; - } - - //---------------------------------------- - // Compatible subview constructor - // may assign unmanaged from managed. - - template - KOKKOS_INLINE_FUNCTION View(const View& src_view, const Arg0 arg0, - Args... 
args) - : m_track(src_view), m_map() { - using SrcType = View; - - using Mapping = Kokkos::Impl::ViewMapping; - - using DstType = typename Mapping::type; - - static_assert( - Kokkos::Impl::ViewMapping::is_assignable, - "Subview construction requires compatible view and subview arguments"); - - Mapping::assign(m_map, src_view.m_map, arg0, args...); - } - - //---------------------------------------- - // Allocation tracking properties - - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.m_tracker.use_count(); } - - inline const std::string label() const { - return m_track.m_tracker - .template get_label(); - } - - public: - //---------------------------------------- - // Allocation according to allocation properties and array layout - - template - explicit inline View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), m_map() { - // Copy the input allocation properties with possibly defaulted properties - // We need to split it in two to avoid MSVC compiler errors - auto prop_copy_tmp = - Impl::with_properties_if_unset(arg_prop, std::string{}); - auto prop_copy = Impl::with_properties_if_unset( - prop_copy_tmp, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing View and initializing data with uninitialized " - "execution space"); - } - -#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - const std::string& alloc_name = - Impl::get_property(prop_copy); - Impl::runtime_check_rank( - *this, std::is_same::value, i0, i1, - i2, i3, i4, i5, i6, i7, alloc_name.c_str()); - } -#endif - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.m_tracker.assign_allocated_record_to_uninitialized(record); - } - - KOKKOS_INLINE_FUNCTION - void assign_data(pointer_type arg_data) { - m_track.m_tracker.clear(); - m_map.assign_data(arg_data); - } - - // Wrap memory according to properties and array layout - template - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { - static_assert( - std::is_same::pointer_type>::value, - "Constructing View to wrap user memory must supply matching pointer " - "type"); - -#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - 
size_t i7 = arg_layout.dimension[7]; - - Impl::runtime_check_rank( - *this, std::is_same::value, i0, i1, - i2, i3, i4, i5, i6, i7, "UNMANAGED"); - } -#endif - } - - // Simple dimension-only layout - template - explicit inline View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - template - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - } - - // Allocate with label and layout - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, - typename traits::array_layout> const& arg_layout) - : View(Impl::ViewCtorProp(arg_label), arg_layout) {} - - // Allocate label and layout, must disambiguate from subview constructor. - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, const size_t> - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp(arg_label), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - // Construct view from ViewTracker and map - // This should be the preferred method because future extensions may need to - // use the ViewTracker class. 
- template - KOKKOS_INLINE_FUNCTION View( - const view_tracker_type& track, - const Kokkos::Impl::ViewMapping& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track.m_tracker); - } - - // Construct View from internal shared allocation tracker object and map - // This is here for backwards compatibility for classes that derive from - // Kokkos::View - template - KOKKOS_INLINE_FUNCTION View( - const typename view_tracker_type::track_type& track, - const Kokkos::Impl::ViewMapping& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track); - } - - //---------------------------------------- - // Memory span required to wrap these dimensions. - static constexpr size_t required_allocation_size( - typename traits::array_layout const& layout) { - return map_type::memory_span(layout); - } - - static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp(arg_ptr), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} - - //---------------------------------------- - // Shared scratch memory constructor - - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - const size_t num_passed_args = Impl::count_valid_integers( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); - - if (std::is_void::value && - num_passed_args != rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - - return View::shmem_size(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - private: - // Want to be able to align to minimum scratch alignment or sizeof or alignof - // elements - static constexpr size_t scratch_value_alignment = - max({sizeof(typename traits::value_type), - alignof(typename traits::value_type), - static_cast( - traits::execution_space::scratch_memory_space::ALIGN)}); - - public: - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(typename traits::array_layout const& arg_layout) { - return map_type::memory_span(arg_layout) + scratch_value_alignment; - } - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp(reinterpret_cast( - arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), - scratch_value_alignment))), - arg_layout) {} - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp( - reinterpret_cast(arg_space.get_shmem_aligned( - map_type::memory_span(typename traits::array_layout( - arg_N0, 
arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, - arg_N7)), - scratch_value_alignment))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - //---------------------------------------- - // MDSpan converting constructors -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN - template ::mdspan_type> - KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_ENABLE_CXX17 - explicit(traits::is_managed) -#endif - View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, - std::enable_if_t< - !std::is_same_v>* = - nullptr) - : View(mds.data_handle(), - Impl::array_layout_from_mapping< - typename traits::array_layout, - typename Impl::MDSpanViewTraits::mdspan_type>( - mds.mapping())) { - } - - template - KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_ENABLE_CXX17 - explicit(!std::is_convertible_v< - Kokkos::mdspan, - typename Impl::MDSpanViewTraits::mdspan_type>) -#endif - View(const Kokkos::mdspan& mds) - : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { - } - - //---------------------------------------- - // Conversion to MDSpan - template ::mdspan_type, - typename = std::enable_if_t, - std::false_type, - std::is_assignable, - ImplNaturalMDSpanType>>::value>> - KOKKOS_INLINE_FUNCTION constexpr operator mdspan< - OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { - using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; - return mdspan_type{data(), - Impl::mapping_from_view_mapping(m_map)}; - } - - template >, - typename = std::enable_if_t>> - KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( - const OtherAccessorType& other_accessor = - typename Impl::MDSpanViewTraits::accessor_type()) { - using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; - using ret_mdspan_type = - mdspan; - return ret_mdspan_type{data(), - 
Impl::mapping_from_view_mapping(m_map), - other_accessor}; - } -#endif // KOKKOS_ENABLE_IMPL_MDSPAN -}; - -template -KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { - return View::rank(); -} - -namespace Impl { - -template -struct RankDataType { - using type = typename RankDataType::type*; -}; - -template -struct RankDataType { - using type = ValueType; -}; - -template -KOKKOS_FUNCTION std::enable_if_t< - N == View::rank() && - std::is_same::specialize, void>::value, - View> -as_view_of_rank_n(View v) { - return v; -} - -// Placeholder implementation to compile generic code for DynRankView; should -// never be called -template -KOKKOS_FUNCTION std::enable_if_t< - N != View::rank() && - std::is_same::specialize, void>::value, - View::value_type, N>::type, - Args...>> -as_view_of_rank_n(View) { - Kokkos::abort("Trying to get at a View of the wrong rank"); - return {}; -} - -template -void apply_to_view_of_static_rank(Function&& f, View a) { - f(a); -} - -} // namespace Impl -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Impl { -template -struct TypeListToViewTraits; - -template -struct TypeListToViewTraits> { - using type = ViewTraits; -}; - -// It is not safe to assume that subviews of views with the Aligned memory trait -// are also aligned. Hence, just remove that attribute for subviews. 
-template -struct RemoveAlignedMemoryTrait { - private: - using type_list_in = Kokkos::Impl::type_list; - using memory_traits = typename ViewTraits::memory_traits; - using type_list_in_wo_memory_traits = - typename Kokkos::Impl::type_list_remove_first::type; - using new_memory_traits = - Kokkos::MemoryTraits; - using new_type_list = typename Kokkos::Impl::concat_type_list< - type_list_in_wo_memory_traits, - Kokkos::Impl::type_list>::type; - - public: - using type = typename TypeListToViewTraits::type; -}; -} // namespace Impl - -template -KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, - Args... 
args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - static_assert(Kokkos::is_memory_traits::value); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} -#endif - -template -using Subview = decltype(subview(std::declval(), std::declval()...)); - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, - const View& rhs) { - // Same data, layout, dimensions - using lhs_traits = ViewTraits; - using rhs_traits = ViewTraits; - - return std::is_same::value && - std::is_same::value && - std::is_same::value && - View::rank() == View::rank() && - lhs.data() == rhs.data() && lhs.span() == rhs.span() && - lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && - lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && - lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && - lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); -} - -template -KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, - const View& rhs) { - return !(operator==(lhs, rhs)); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct CommonViewValueType; - -template -struct CommonViewValueType { - using value_type = std::common_type_t; -}; - -template -struct CommonViewAllocProp; - -template -struct CommonViewAllocProp { - using value_type = ValueType; - using scalar_array_type = ValueType; - - template - KOKKOS_INLINE_FUNCTION 
CommonViewAllocProp(const Views&...) {} -}; - -template -struct DeduceCommonViewAllocProp; - -// Base case must provide types for: -// 1. specialize 2. value_type 3. is_view 4. prop_type -template -struct DeduceCommonViewAllocProp { - using specialize = typename FirstView::traits::specialize; - - using value_type = typename FirstView::traits::value_type; - - enum : bool { is_view = is_view::value }; - - using prop_type = CommonViewAllocProp; -}; - -template -struct DeduceCommonViewAllocProp { - using NextTraits = DeduceCommonViewAllocProp; - - using first_specialize = typename FirstView::traits::specialize; - using first_value_type = typename FirstView::traits::value_type; - - enum : bool { first_is_view = is_view::value }; - - using next_specialize = typename NextTraits::specialize; - using next_value_type = typename NextTraits::value_type; - - enum : bool { next_is_view = NextTraits::is_view }; - - // common types - - // determine specialize type - // if first and next specialize differ, but are not the same specialize, error - // out - static_assert(!(!std::is_same::value && - !std::is_void::value && - !std::is_void::value), - "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " - "specialize trait allowed"); - - // otherwise choose non-void specialize if either/both are non-void - using specialize = std::conditional_t< - std::is_same::value, first_specialize, - std::conditional_t<(std::is_void::value && - !std::is_void::value), - next_specialize, first_specialize>>; - - using value_type = typename CommonViewValueType::value_type; - - enum : bool { is_view = (first_is_view && next_is_view) }; - - using prop_type = CommonViewAllocProp; -}; - -} // end namespace Impl - -template -using DeducedCommonPropsType = - typename Impl::DeduceCommonViewAllocProp::prop_type; - -// This function is required in certain scenarios where users customize -// Kokkos View internals. One example are dynamic length embedded ensemble -// types. 
The function is used to propagate necessary information -// (like the ensemble size) when creating new views. -// However, most of the time it is called with a single view. -// Furthermore, the propagated information is not just for view allocations. -// From what I can tell, the type of functionality provided by -// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, -// a mechanism we will eventually use to replace this clunky approach here, when -// we are finally mdspan based. -// TODO: get rid of this when we have mdspan -template -KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( - Views const&... views) { - return DeducedCommonPropsType(views...); -} - -} // namespace Kokkos - -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_VIEW_HPP */ +#endif /* KOKKOS_VIEW_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp index efa56a086e..4d22634281 100644 --- a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -120,7 +120,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { (std::int32_t)BEGIN_TOKEN))) { // Attempt to claim ready work index succeeded, // update the hint and return work index - atomic_increment(begin_hint); + atomic_inc(begin_hint); return w; } // arrive here when ready_queue[i] == BEGIN_TOKEN @@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { void operator()(const TagCount, int i) const noexcept { std::int32_t* const count_queue = &m_queue[m_graph.numRows()]; - atomic_increment(count_queue + m_graph.entries[i]); + atomic_inc(count_queue + m_graph.entries[i]); } KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp 
index 99daf379b6..37fcfb7a1d 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -23,7 +23,19 @@ #include #include +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) +#include +#elif defined(KOKKOS_ARCH_AMD_GPU) +// FIXME_OPENACC - hip_runtime_api.h contains two implementations: one for AMD +// GPUs and the other for NVIDIA GPUs; below macro is needed to choose AMD GPUs. +#define __HIP_PLATFORM_AMD__ +#include +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) +#include +#endif + #include +#include Kokkos::Experimental::OpenACC::OpenACC() : m_space_instance( @@ -46,6 +58,8 @@ Kokkos::Experimental::OpenACC::OpenACC(int async_arg) void Kokkos::Experimental::OpenACC::impl_initialize( InitializationSettings const& settings) { + Impl::OpenACCInternal::m_concurrency = + 256000; // FIXME_OPENACC - random guess when cannot compute if (Impl::OpenACC_Traits::may_fallback_to_host && acc_get_num_devices(Impl::OpenACC_Traits::dev_type) == 0 && !settings.has_device_id()) { @@ -59,11 +73,46 @@ void Kokkos::Experimental::OpenACC::impl_initialize( acc_get_device_num(acc_device_host); } else { using Kokkos::Impl::get_visible_devices; + acc_set_device_type(Impl::OpenACC_Traits::dev_type); std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + cudaDeviceProp deviceProp; + cudaError error = cudaGetDeviceProperties(&deviceProp, dev_num); + if (error != cudaSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "CUDA device properties: (" << cudaGetErrorName(error) + << "): " << cudaGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + 
deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ARCH_AMD_GPU) + hipDeviceProp_t deviceProp; + hipError_t error = hipGetDeviceProperties(&deviceProp, dev_num); + if (error != hipSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "HIP device properties: (" << hipGetErrorName(error) + << "): " << hipGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + Impl::OpenACCInternal::m_concurrency = std::thread::hardware_concurrency(); + if (Impl::OpenACCInternal::m_concurrency == 0) { + Kokkos::Impl::host_abort( + "Error: During OpenACC backend initialization, failed to retrieve " + "CPU hardware concurrency"); + } +#else + // FIXME_OPENACC: Compute Impl::OpenACCInternal::m_concurrency correctly. +#endif } Impl::OpenACCInternal::singleton().initialize(); } @@ -86,6 +135,12 @@ void Kokkos::Experimental::OpenACC::print_configuration(std::ostream& os, os << "yes\n"; #else os << "no\n"; +#endif + os << " KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE: "; +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + os << "yes\n"; +#else + os << "no\n"; #endif m_space_instance->print_configuration(os, verbose); } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp index 5155bee33d..aee696bd34 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -42,6 +42,7 @@ static_assert(false, // LLVM/Clacc compiler does not need this. 
#ifndef KOKKOS_COMPILER_CLANG #define KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS +#define KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS #endif namespace Kokkos::Experimental::Impl { @@ -87,9 +88,9 @@ class OpenACC { static char const* name() { return "OpenACC"; } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency() { return 256000; } // FIXME_OPENACC + static int concurrency(); #else - int concurrency() const { return 256000; } // FIXME_OPENACC + int concurrency() const; #endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 KOKKOS_DEPRECATED static bool in_parallel() { diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp index 4e7170cbbd..75cef98a8d 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp @@ -85,16 +85,26 @@ class OpenACCSpace { template <> struct Kokkos::Impl::MemorySpaceAccess { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; template <> struct Kokkos::Impl::MemorySpaceAccess { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp index 82d38586eb..1373f8fa7a 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp @@ -38,7 +38,7 @@ class FunctorAdapter; \ KOKKOS_IMPL_ACC_PRAGMA(routine CLAUSE) \ template \ - KOKKOS_FUNCTION void operator()(Args &&... 
args) const { \ + KOKKOS_FUNCTION void operator()(Args &&...args) const { \ if constexpr (std::is_void_v) { \ m_functor(static_cast(args)...); \ } else { \ diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp index 10a76fbd31..1dad499c1b 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp @@ -27,6 +27,7 @@ // Arbitrary value to denote that we don't know yet what device to use. int Kokkos::Experimental::Impl::OpenACCInternal::m_acc_device_num = -1; +int Kokkos::Experimental::Impl::OpenACCInternal::m_concurrency = -1; Kokkos::Experimental::Impl::OpenACCInternal& Kokkos::Experimental::Impl::OpenACCInternal::singleton() { @@ -78,8 +79,18 @@ void Kokkos::Experimental::Impl::OpenACCInternal::fence( [&]() { acc_wait(m_async_arg); }); } -uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() const - noexcept { +uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() + const noexcept { return Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +int Kokkos::Experimental::OpenACC::concurrency() { + return Impl::OpenACCInternal::m_concurrency; +} +#else +int Kokkos::Experimental::OpenACC::concurrency() const { + return Impl::OpenACCInternal::m_concurrency; +} +#endif diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index c3d7236872..343d9921a9 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -30,11 +30,12 @@ namespace Kokkos::Experimental::Impl { class OpenACCInternal { bool m_is_initialized = false; - OpenACCInternal(const OpenACCInternal&) = default; + OpenACCInternal(const OpenACCInternal&) = default; OpenACCInternal& operator=(const OpenACCInternal&) = default; public: 
static int m_acc_device_num; + static int m_concurrency; int m_async_arg = acc_async_noval; OpenACCInternal() = default; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp index 550436fe7b..629d26928e 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp @@ -30,10 +30,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i1 = m / dim0 + begin1; + auto i0 = m % dim0 + begin0; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -42,6 +55,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, functor(i0, i1); } } +#endif } template @@ -50,10 +64,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// 
clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i0 = m / dim1 + begin0; + auto i1 = m % dim1 + begin1; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -62,6 +89,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, functor(i0, i1); } } +#endif } template @@ -71,12 +99,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1) copyin(functor) async(async_arg) // clang-format on @@ -94,12 +122,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; // clang-format off #pragma acc parallel loop gang vector tile(tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -116,12 +144,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + 
auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim1 * dim0; + auto i2 = m / tmp1 + begin2; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) async(async_arg) // clang-format on @@ -132,6 +177,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -140,12 +186,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim2 + begin1; + auto i2 = tmp2 % dim2 + begin2; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) 
async(async_arg) // clang-format on @@ -156,6 +219,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -165,15 +229,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2) copyin(functor) async(async_arg) // clang-format on @@ -193,15 +257,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; // clang-format off #pragma acc parallel loop gang vector tile(tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -220,14 +284,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto 
begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1 * dim0; + auto i3 = m / tmp1 + begin3; + auto tmp2 = m % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -240,6 +325,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -248,14 +334,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; 
m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + auto i2 = tmp2 / dim3 + begin2; + auto i3 = tmp2 % dim3 + begin3; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -268,6 +375,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -277,18 +385,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3) copyin(functor) async(async_arg) // clang-format on @@ -310,18 +418,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto 
end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; // clang-format off #pragma acc parallel loop gang vector tile(tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -342,16 +450,41 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = m / tmp1 + begin4; + auto tmp2 = m % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -366,6 +499,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -374,16 +508,41 @@ 
void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i3 = tmp2 / dim4 + begin3; + auto i4 = tmp2 % dim4 + begin4; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -398,6 +557,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -407,21 +567,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int begin4 = begin[4]; - int end4 = end[4]; - 
int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4) copyin(functor) async(async_arg) // clang-format on @@ -445,21 +605,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; // clang-format off #pragma acc parallel loop gang vector tile(tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -482,18 +642,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int 
begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; + auto i5 = m / tmp1 + begin5; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = tmp2 / tmp1 + begin4; + tmp2 = tmp2 % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -510,6 +699,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -518,18 +708,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - 
int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim5 * dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + auto i4 = tmp2 / dim5 + begin4; + auto i5 = tmp2 % dim5 + begin5; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -546,6 +765,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -555,24 +775,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int tile5 = tile[5]; - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - 
int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto tile5 = tile[5]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4,tile5) copyin(functor) async(async_arg) // clang-format on @@ -598,24 +818,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile5 = tile[5]; - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto tile5 = tile[5]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; // clang-format off #pragma acc parallel loop gang vector tile(tile5,tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp 
b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 5afb5e75d3..2b5631d6f8 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -113,6 +113,404 @@ class Kokkos::Impl::ParallelReduce \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i1 = m / dim0 + begin1; \ + auto i0 = m % dim0 + begin0; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i0 = m / dim1 + begin0; \ + auto i1 = m % dim1 + begin1; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void 
OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim1 * dim0; \ + auto i2 = m / tmp1 + begin2; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim2 + begin1; \ + auto i2 = tmp2 % dim2 + begin2; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + 
aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1 * dim0; \ + auto i3 = m / tmp1 + begin3; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) 
async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + auto i2 = tmp2 / dim3 + begin2; \ + auto i3 = tmp2 % dim3 + begin3; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = m / tmp1 + begin4; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + 
OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i3 = tmp2 / dim4 + begin3; \ + auto i4 = tmp2 % dim4 + begin4; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - 
begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; \ + auto i5 = m / tmp1 + begin5; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = tmp2 / tmp1 + begin4; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = 
dim5 * dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + auto i4 = tmp2 / dim5 + begin4; \ + auto i5 = tmp2 % dim5 + begin5; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + } // namespace Kokkos::Experimental::Impl + +#else + #define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, \ OPERATOR) \ namespace Kokkos::Experimental::Impl { \ @@ -124,10 +522,10 @@ class Kokkos::Impl::ParallelReduce \ diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index 430bdcb680..d4cb73164d 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -163,13 +163,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + wrapped_reducer.final(&tmp); result = tmp; } } @@ -180,15 +191,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const 
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -200,7 +221,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -208,6 +239,7 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -218,9 +250,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& 
loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -228,6 +268,8 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -239,7 +281,17 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -247,6 +299,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -273,10 +326,23 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { + using 
functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + ValueType tmp = ValueType(); #pragma acc loop worker reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); result = tmp; } @@ -314,11 +380,22 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } @@ -357,11 +434,23 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : 
tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); result = tmp; } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp index c6d3267bdb..b1c48baa1e 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp @@ -225,7 +225,7 @@ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector_length(chunk_size) KOKKOS_IMPL_ } #pragma acc exit data delete (functor, chunk_values, offset_values, \ - final_reducer)async(async_arg) + final_reducer)async(async_arg) acc_wait(async_arg); } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp index faa50aa7c3..95526aa784 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp @@ -28,8 +28,11 @@ struct OpenACC_Traits { #elif defined(KOKKOS_ARCH_AMD_GPU) static constexpr acc_device_t dev_type = acc_device_radeon; static constexpr bool may_fallback_to_host = false; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + static constexpr acc_device_t dev_type = acc_device_host; + static constexpr bool may_fallback_to_host = true; #else - static constexpr acc_device_t dev_type = acc_device_not_host; + static constexpr acc_device_t dev_type = acc_device_default; static constexpr bool may_fallback_to_host = true; #endif }; diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp index a403909f67..aa4be87ceb 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -30,7 +30,6 @@ static_assert(false, #include #include #include -#include #include #include #include @@ -93,11 +92,16 @@ class OpenMP { void fence(std::string const& name = 
"Kokkos::OpenMP::fence: Unnamed Instance Fence") const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Does the given instance return immediately after launching /// a parallel algorithm /// /// This always returns false on OpenMP - inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED inline static bool is_asynchronous( + OpenMP const& = OpenMP()) noexcept { + return false; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); @@ -154,10 +158,6 @@ inline int OpenMP::impl_thread_pool_rank() noexcept { KOKKOS_IF_ON_DEVICE((return -1;)) } -inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept { - return false; -} - inline int OpenMP::impl_thread_pool_size(int depth) const { return depth < 2 ? impl_thread_pool_size() : 1; } @@ -202,7 +202,9 @@ struct MemorySpaceAccess #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index 2877d940fa..6edcbff0c2 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -26,12 +26,19 @@ #include #include +#include + #include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -360,6 +367,10 @@ extern template class TaskQueue #include #include -#include #include #include #include @@ -148,7 +147,6 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { #include #include #include -#include 
/*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index ed625cfcc8..ec33d25b96 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -28,6 +28,7 @@ static_assert(false, #include #include +#include #ifdef KOKKOS_ENABLE_OPENMPTARGET @@ -91,9 +92,9 @@ class OpenMPTargetSpace { /**\brief Default memory space instance */ OpenMPTargetSpace(); - OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; - OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; - OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; + OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; + OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; + OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; OpenMPTargetSpace& operator=(const OpenMPTargetSpace&) = default; ~OpenMPTargetSpace() = default; @@ -141,79 +142,5 @@ class OpenMPTargetSpace { KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( Kokkos::Experimental::OpenMPTargetSpace); -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -// TODO: implement all possible deep_copies -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - // In the Release and RelWithDebInfo builds, the size of the memcpy should - // be greater than zero to avoid error. omp_target_memcpy returns zero on - // success. 
- if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence " - "before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } -}; - -} // namespace Impl -} // namespace Kokkos - #endif #endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp new file mode 100644 index 0000000000..aace09e266 --- /dev/null +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp @@ -0,0 +1,101 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_OPENMPTARGET_DEEP_COPY_HPP +#define KOKKOS_OPENMPTARGET_DEEP_COPY_HPP + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// TODO: implement all possible deep_copies +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + // In the Release and RelWithDebInfo builds, the size of the memcpy should + // be greater than zero to avoid error. omp_target_memcpy returns zero on + // success. 
+ if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence " + "before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } +}; + +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } +}; + +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_OPENMPTARGET_DEEP_COPY_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp deleted file mode 100644 index 6c5eb048e3..0000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ /dev/null @@ -1,130 +0,0 @@ -//@HEADER -// 
************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef KOKKOS_ENABLE_OPENMPTARGET - -// FIXME_OPENMPTARGET currently unused -/* -namespace Kokkos { -namespace Impl { -namespace { - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel(); - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel() { return omp_in_parallel(); } - -bool s_using_hwloc = false; - -} // namespace -} // namespace Impl -} // namespace Kokkos -*/ - -namespace Kokkos { -namespace Impl { - -void OpenMPTargetExec::verify_is_process(const char* const label) { - // Fails if the current task is in a parallel region or is not on the host. 
- if (omp_in_parallel() && (!omp_is_initial_device())) { - std::string msg(label); - msg.append(" ERROR: in parallel or on device"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void OpenMPTargetExec::verify_initialized(const char* const label) { - if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { - std::string msg(label); - msg.append(" ERROR: not initialized"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void* OpenMPTargetExec::m_scratch_ptr = nullptr; -int64_t OpenMPTargetExec::m_scratch_size = 0; -uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; -int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; -std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; - -void OpenMPTargetExec::clear_scratch() { - Kokkos::Experimental::OpenMPTargetSpace space; - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_ptr = nullptr; - m_scratch_size = 0; -} - -void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; } - -void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, - int64_t shmem_size_L1, - int64_t league_size) { - Kokkos::Experimental::OpenMPTargetSpace space; - // Level-0 scratch when using clang/17 and higher comes from their OpenMP - // extension, `ompx_dyn_cgroup_mem`. -#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) - shmem_size_L0 = 0; -#endif - const int64_t shmem_size = - shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. - const int64_t padding = shmem_size * 10 / 100; // Padding per team. - - // Maximum active teams possible. - // The number should not exceed the maximum in-flight teams possible or the - // league_size. - int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); - - // max_active_teams is the number of active teams on the given hardware. - // We set the number of teams to be twice the number of max_active_teams for - // the compiler to pick the right number in its case. 
- // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. -#if !defined(KOKKOS_COMPILER_CRAY_LLVM) - omp_set_num_teams(max_active_teams * 2); -#endif - - // Total amount of scratch memory allocated is depenedent - // on the maximum number of in-flight teams possible. - int64_t total_size = - (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * - max_active_teams * 2; - - if (total_size > m_scratch_size) { - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_size = total_size; - m_scratch_ptr = space.allocate(total_size); - } -} - -} // namespace Impl -} // namespace Kokkos - -#endif // KOKKOS_ENABLE_OPENMPTARGET diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp new file mode 100644 index 0000000000..13b509c0ad --- /dev/null +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp @@ -0,0 +1,48 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP +#define KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP + +#include +#include + +namespace Kokkos::Experimental::Impl { + +template +class FunctorAdapter { + Functor m_functor; + using WorkTag = typename Policy::work_tag; + + public: + FunctorAdapter() = default; + FunctorAdapter(Functor const &functor) : m_functor(functor) {} + + Functor get_functor() const { return m_functor; } + + template + KOKKOS_FUNCTION void operator()(Args &&...args) const { + if constexpr (std::is_void_v) { + m_functor(static_cast(args)...); + } else { + m_functor(WorkTag(), static_cast(args)...); + } + } +}; + +} // namespace Kokkos::Experimental::Impl + +#endif // KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 44e9119ea8..53e723882f 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -27,11 +27,11 @@ // constructor. undef'ed at the end #define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND +#include #include #include #include #include -#include #include @@ -105,18 +105,15 @@ void OpenMPTargetInternal::print_configuration(std::ostream& os, void OpenMPTargetInternal::impl_finalize() { m_is_initialized = false; - Kokkos::Impl::OpenMPTargetExec space; - if (space.m_uniquetoken_ptr != nullptr) + if (m_uniquetoken_ptr != nullptr) Kokkos::kokkos_free( - space.m_uniquetoken_ptr); + m_uniquetoken_ptr); } void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; - Kokkos::Impl::OpenMPTargetExec::MAX_ACTIVE_THREADS = concurrency(); - // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // from Pascal and upwards. // FIXME_OPENMPTARGTE: Cray compiler did not yet implement omp_set_num_teams. 
@@ -136,7 +133,75 @@ OpenMPTargetInternal* OpenMPTargetInternal::impl_singleton() { return &self; } -} // Namespace Impl +void OpenMPTargetInternal::verify_is_process(const char* const label) { + // Fails if the current task is in a parallel region or is not on the host. + if (omp_in_parallel() && (!omp_is_initial_device())) { + std::string msg(label); + msg.append(" ERROR: in parallel or on device"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::verify_initialized(const char* const label) { + if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { + std::string msg(label); + msg.append(" ERROR: not initialized"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::clear_scratch() { + Kokkos::Experimental::OpenMPTargetSpace space; + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_ptr = nullptr; + m_scratch_size = 0; +} + +void* OpenMPTargetInternal::get_scratch_ptr() { return m_scratch_ptr; } + +void OpenMPTargetInternal::resize_scratch(int64_t team_size, + int64_t shmem_size_L0, + int64_t shmem_size_L1, + int64_t league_size) { + Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif + const int64_t shmem_size = + shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. + const int64_t padding = shmem_size * 10 / 100; // Padding per team. + + // Maximum active teams possible. + // The number should not exceed the maximum in-flight teams possible or the + // league_size. + int max_active_teams = + std::min(OpenMPTargetInternal::concurrency() / team_size, league_size); + + // max_active_teams is the number of active teams on the given hardware. 
+ // We set the number of teams to be twice the number of max_active_teams for + // the compiler to pick the right number in its case. + // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. +#if !defined(KOKKOS_COMPILER_CRAY_LLVM) + omp_set_num_teams(max_active_teams * 2); +#endif + + // Total amount of scratch memory allocated is depenedent + // on the maximum number of in-flight teams possible. + int64_t total_size = + (shmem_size + + ::Kokkos::Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * + max_active_teams * 2; + + if (total_size > m_scratch_size) { + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_size = total_size; + m_scratch_ptr = space.allocate(total_size); + } +} + +} // namespace Impl OpenMPTarget::OpenMPTarget() : m_space_instance(Impl::OpenMPTargetInternal::impl_singleton()) {} @@ -206,9 +271,9 @@ namespace Experimental { UniqueToken:: - UniqueToken(Kokkos::Experimental::OpenMPTarget const&) { + UniqueToken(Kokkos::Experimental::OpenMPTarget const& space) { #ifdef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND - uint32_t* ptr = Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr; + uint32_t* ptr = space.impl_internal_space_instance()->m_uniquetoken_ptr; int count = Kokkos::Experimental::OpenMPTarget().concurrency(); if (ptr == nullptr) { int size = count * sizeof(uint32_t); @@ -221,7 +286,7 @@ UniqueTokenm_uniquetoken_ptr = ptr; } #else // FIXME_OPENMPTARGET - 2 versions of non-working implementations to fill `ptr` @@ -229,8 +294,7 @@ UniqueToken - namespace Kokkos { namespace Experimental { namespace Impl { @@ -27,9 +25,9 @@ enum class openmp_fence_is_static { yes, no }; class OpenMPTargetInternal { private: - OpenMPTargetInternal() = default; - OpenMPTargetInternal(const OpenMPTargetInternal&) = default; - OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = default; + OpenMPTargetInternal() = default; + OpenMPTargetInternal(const OpenMPTargetInternal&) = delete; + OpenMPTargetInternal& 
operator=(const OpenMPTargetInternal&) = delete; public: void fence(openmp_fence_is_static is_static = openmp_fence_is_static::no); @@ -55,6 +53,19 @@ class OpenMPTargetInternal { static OpenMPTargetInternal* impl_singleton(); + static void verify_is_process(const char* const); + static void verify_initialized(const char* const); + + void* get_scratch_ptr(); + void clear_scratch(); + void resize_scratch(int64_t team_reduce_bytes, int64_t team_shared_bytes, + int64_t thread_local_bytes, int64_t league_size); + + void* m_scratch_ptr = nullptr; + std::mutex m_mutex_scratch_ptr; + int64_t m_scratch_size = 0; + uint32_t* m_uniquetoken_ptr = nullptr; + private: bool m_is_initialized = false; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index e222d65250..f71f888713 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -116,8 +116,8 @@ class OpenMPTargetExecTeamMember { // FIXME_OPENMPTARGET this function currently ignores the reducer passed. 
template KOKKOS_INLINE_FUNCTION std::enable_if_t::value> - team_reduce(ReducerType const&, typename ReducerType::value_type& value) const - noexcept { + team_reduce(ReducerType const&, + typename ReducerType::value_type& value) const noexcept { #pragma omp barrier using value_type = typename ReducerType::value_type; @@ -741,43 +741,6 @@ struct TeamVectorRangeBoundariesStruct { } // namespace Impl -} // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -/** \brief Data for OpenMPTarget thread execution */ - -class OpenMPTargetExec { - public: - // FIXME_OPENMPTARGET - Currently the maximum number of - // teams possible is calculated based on NVIDIA's Volta GPU. In - // future this value should be based on the chosen architecture for the - // OpenMPTarget backend. 
- static int MAX_ACTIVE_THREADS; - - private: - static void* scratch_ptr; - - public: - static void verify_is_process(const char* const); - static void verify_initialized(const char* const); - - static void* get_scratch_ptr(); - static void clear_scratch(); - static void resize_scratch(int64_t team_reduce_bytes, - int64_t team_shared_bytes, - int64_t thread_local_bytes, int64_t league_size); - - static void* m_scratch_ptr; - static std::mutex m_mutex_scratch_ptr; - static int64_t m_scratch_size; - static uint32_t* m_uniquetoken_ptr; -}; - -} // namespace Impl } // namespace Kokkos #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp index bd7d3eef5d..38ed7c5681 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp @@ -20,6 +20,8 @@ #include #include #include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -31,38 +33,38 @@ template class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; + using Policy = Kokkos::MDRangePolicy; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + const FunctorAdapter m_functor; - const FunctorType m_functor; const Policy m_policy; public: inline void execute() const { - OpenMPTargetExec::verify_is_process( + 
Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); + Policy policy = m_policy; - typename Policy::point_type unused; static_assert(1 < Policy::rank && Policy::rank < 7); static_assert(Policy::inner_direction == Iterate::Left || Policy::inner_direction == Iterate::Right); execute_tile( - unused, functor, policy, + m_functor, policy, std::integral_constant()); } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -72,18 +74,14 @@ class ParallelFor, #pragma omp target teams distribute parallel for collapse(2) map(to : functor) for (auto i0 = begin_0; i0 < end_0; ++i0) for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + functor(i0, i1); } } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -96,10 +94,7 @@ class ParallelFor, for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, 
i2); + functor(i0, i1, i2); } } } @@ -107,9 +102,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -125,10 +119,7 @@ class ParallelFor, for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + functor(i0, i1, i2, i3); } } } @@ -137,9 +128,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -158,11 +148,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + functor(i0, i1, i2, i3, i4); } } } @@ -172,9 +158,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const 
Index begin_2 = policy.m_lower[2]; @@ -197,12 +182,7 @@ class ParallelFor, for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(i0, i1, i2, i3, i4, i5); } } } @@ -214,9 +194,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -226,18 +205,14 @@ class ParallelFor, #pragma omp target teams distribute parallel for collapse(2) map(to : functor) for (auto i1 = begin_1; i1 < end_1; ++i1) for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + functor(i0, i1); } } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -250,10 +225,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + functor(i0, i1, i2); } } } @@ -261,9 +233,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, 
OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -279,10 +250,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + functor(i0, i1, i2, i3); } } } @@ -291,9 +259,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -312,11 +279,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + functor(i0, i1, i2, i3, i4); } } } @@ -326,9 +289,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -351,12 +313,7 @@ class ParallelFor, for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { { - if constexpr (std::is_same::value) - functor(i0, i1, 
i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(i0, i1, i2, i3, i4, i5); } } } diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp index a674637a3b..502461cc5e 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp @@ -20,6 +20,8 @@ #include #include #include +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" namespace Kokkos { namespace Impl { @@ -28,36 +30,30 @@ template class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Policy = Kokkos::RangePolicy; + using Member = typename Policy::member_type; - const FunctorType m_functor; + Kokkos::Experimental::Impl::FunctorAdapter m_functor; const Policy m_policy; public: - void execute() const { execute_impl(); } + void execute() const { execute_impl(); } - template void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto begin = m_policy.begin(); const auto end = m_policy.end(); if (end <= begin) return; - FunctorType a_functor(m_functor); + auto const a_functor(m_functor); #pragma omp target teams distribute parallel for map(to : a_functor) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - a_functor(i); - } else { - a_functor(TagType(), i); - } + a_functor(i); } } diff --git 
a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 26085f1140..77dc71a87b 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace Kokkos { @@ -76,28 +77,27 @@ class ParallelFor, using Policy = Kokkos::Impl::TeamPolicyInternal; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Member = typename Policy::member_type; + + Kokkos::Experimental::Impl::FunctorAdapter m_functor; - const FunctorType m_functor; const Policy m_policy; const size_t m_shmem_size; public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - execute_impl(); + execute_impl(); } private: - template void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto league_size = m_policy.league_size(); const auto team_size = m_policy.team_size(); @@ -105,11 +105,12 @@ class ParallelFor, const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1, - league_size); + m_policy.space().impl_internal_space_instance()->resize_scratch( + team_size, shmem_size_L0, shmem_size_L1, 
league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - FunctorType a_functor(m_functor); + void* scratch_ptr = + m_policy.space().impl_internal_space_instance()->get_scratch_ptr(); + auto const a_functor(m_functor); // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the // scratch implementation does not work in the Release or RelWithDebugInfo @@ -122,7 +123,7 @@ class ParallelFor, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(m_policy.space().concurrency() / team_size, league_size); #endif // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the @@ -161,16 +162,13 @@ class ParallelFor, typename Policy::member_type team(league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #else #pragma omp target teams distribute firstprivate(a_functor) \ is_device_ptr(scratch_ptr) num_teams(max_active_teams) \ - thread_limit(team_size) + thread_limit(team_size) for (int i = 0; i < league_size; i++) { #pragma omp parallel { @@ -180,10 +178,7 @@ class ParallelFor, typename Policy::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index e86a121974..bee604834c 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -37,9 +37,8 @@ class ParallelReduce; + public: inline void execute() const { // 
Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = FunctorAdapter(m_functor_reducer.get_functor()); execute_tile( - m_functor_reducer.get_functor(), m_policy, m_result_ptr, + functor, m_policy, m_result_ptr, std::integral_constant()); } @@ -77,7 +81,7 @@ class ParallelReduce inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -90,32 +94,23 @@ class ParallelReduce::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } else { #pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } @@ -126,7 +121,7 @@ reduction(+:result) template inline std::enable_if_t 
execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -141,38 +136,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper ::init( \ - omp_priv)) +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } } else { #pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } @@ -184,7 +170,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const 
FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -201,40 +187,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } } } else { #pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } @@ -247,7 +222,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& 
functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -266,26 +241,18 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -293,18 +260,13 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + 
functor(i0, i1, i2, i3, i4, result); } } } @@ -318,7 +280,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -339,27 +301,19 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) for (auto i5 = begin_5; i5 < end_5; ++i5) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -368,19 +322,14 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i5 = begin_5; i5 < end_5; ++i5) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { 
for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -395,7 +344,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -408,32 +357,23 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } else { #pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename 
Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } @@ -444,7 +384,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -459,38 +399,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper ::init( \ - omp_priv)) +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } } else { #pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, 
result); + functor(i0, i1, i2, result); } } } @@ -502,7 +433,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -519,40 +450,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } } } else { #pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + 
functor(i0, i1, i2, i3, result); } } } @@ -565,7 +485,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -584,26 +504,18 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -611,18 +523,13 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr 
(std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -636,7 +543,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -657,27 +564,19 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -686,19 +585,14 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; 
++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 4a112ed11d..b7c8abcb44 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -33,8 +34,6 @@ class ParallelReduce, using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using WorkTag = typename Policy::work_tag; - using pointer_type = typename ReducerType::pointer_type; using reference_type = typename ReducerType::reference_type; @@ -55,14 +54,17 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; - using TagType = typename Policy::work_tag; public: void execute() const { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); - const FunctorType& functor = m_functor_reducer.get_functor(); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()); + if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. 
ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, @@ -75,26 +77,26 @@ class ParallelReduce, // Enter this loop if the reduction is on an array and the routine is // templated over the size of the array. if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<2>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<4>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<8>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<16>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<32>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else { Kokkos::abort("array reduction length must be <= 32"); } } else { // This loop handles the basic scalar reduction. 
- ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<1>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } } diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 16c0eedb81..b81e3aa7ed 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -59,7 +59,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier if constexpr (std::is_arithmetic::value) { -#pragma omp for reduction(+ : TeamThread_scratch[:1]) +#pragma omp for reduction(+ : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -68,7 +68,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for reduction(custom : TeamThread_scratch[:1]) +#pragma omp for reduction(custom : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -90,11 +90,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< const Lambda& lambda, ReducerType result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custominner:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custominner \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of // elements in the array <= 32. 
For reduction we allocate, 16 bytes per @@ -109,7 +108,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper::init(TeamThread_scratch[0]); #pragma omp barrier -#pragma omp for reduction(custominner : TeamThread_scratch[:1]) +#pragma omp for reduction(custominner : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamThread_scratch[0]); } @@ -132,11 +131,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< ValueType* TeamThread_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -145,8 +143,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for reduction(omp_red_teamthread_reducer \ - : TeamThread_scratch[:1]) schedule(static, 1) +#pragma omp for reduction( \ + omp_red_teamthread_reducer : TeamThread_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -259,11 +258,10 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< const Lambda& lambda, ReducerType const& result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + 
initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) ValueType vector_reduce; Impl::OpenMPTargetReducerWrapper::init(vector_reduce); @@ -329,7 +327,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( #pragma omp barrier if constexpr (std::is_arithmetic::value) { -#pragma omp for simd reduction(+ : TeamVector_scratch[:1]) +#pragma omp for simd reduction(+ : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -338,7 +336,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -363,11 +361,10 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< static_assert(sizeof(ValueType) <= Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) ValueType* TeamVector_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); @@ -376,7 +373,7 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper::init(TeamVector_scratch[0]); #pragma omp barrier -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamVector_scratch[0]); } @@ -400,11 +397,10 @@ parallel_reduce(const 
Impl::TeamVectorRangeBoundariesStruct< ValueType* TeamVector_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -413,8 +409,9 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for simd reduction(omp_red_teamthread_reducer \ - : TeamVector_scratch[:1]) schedule(static, 1) +#pragma omp for simd reduction( \ + omp_red_teamthread_reducer : TeamVector_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -443,8 +440,7 @@ class ParallelReduce scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); - const FunctorType& functor = m_functor_reducer.get_functor(); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()); if constexpr (FunctorHasJoin) { ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, m_result_ptr_on_device); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 29df0163c8..ec8a96cb2f 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -20,6 +20,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -30,7 +31,6 @@ class ParallelScan, 
protected: using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using idx_type = typename Policy::index_type; @@ -48,18 +48,8 @@ class ParallelScan, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(idx, val, is_final); - } - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(WorkTag(), idx, val, is_final); - } + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; public: void impl_execute( @@ -77,8 +67,10 @@ class ParallelScan, idx_type team_size = 128; auto a_functor_reducer = m_functor_reducer; -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) + auto a_functor = FunctorAdapter(m_functor_reducer.get_functor()); + +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -91,9 +83,8 @@ class ParallelScan, const idx_type idx = local_offset + i; value_type val; reducer.init(&val); - if (idx < N) - call_with_tag(a_functor_reducer.get_functor(), idx, val, - false); + if (idx < N) a_functor(idx, val, false); + element_values(team_id, i) = val; } #pragma omp barrier @@ -120,9 +111,8 @@ class ParallelScan, } } -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) \ - thread_limit(team_size) +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) thread_limit(team_size) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -145,12 +135,7 @@ class 
ParallelScan, #if defined(KOKKOS_ARCH_AMD_GPU) && !defined(KOKKOS_ARCH_AMD_GFX1030) && \ !defined(KOKKOS_ARCH_AMD_GFX1100) && !defined(KOKKOS_ARCH_AMD_GFX1103) if constexpr (Analysis::Reducer::has_join_member_function()) { - if constexpr (std::is_void_v) - a_functor_reducer.get_functor().join(local_offset_value, - offset_value); - else - a_functor_reducer.get_functor().join( - WorkTag{}, local_offset_value, offset_value); + a_functor.get_functor().join(local_offset_value, offset_value); } else local_offset_value += offset_value; #else @@ -158,9 +143,8 @@ class ParallelScan, #endif } else local_offset_value = offset_value; - if (idx < N) - call_with_tag(a_functor_reducer.get_functor(), idx, - local_offset_value, true); + if (idx < N) a_functor(idx, local_offset_value, true); + if (idx == N - 1 && m_result_ptr_device_accessible) *m_result_ptr = local_offset_value; } @@ -169,9 +153,9 @@ class ParallelScan, } void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const idx_type N = m_policy.end() - m_policy.begin(); const idx_type chunk_size = 128; @@ -179,7 +163,7 @@ class ParallelScan, // Only let one ParallelReduce instance at a time use the scratch memory. 
std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); // This could be scratch memory per team Kokkos::View, public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); const int chunk_size = 128; @@ -231,7 +215,9 @@ class ParallelScanWithTotal, if (N > 0) { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + base_t::m_policy.space() + .impl_internal_space_instance() + ->m_mutex_scratch_ptr); // This could be scratch memory per team Kokkos::View #include #include +#include namespace Kokkos { namespace Impl { @@ -72,7 +73,6 @@ template , ReducerType, PointerType, ValueType> { using PolicyType = Kokkos::RangePolicy; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t::value, FunctorType, ReducerType>; @@ -82,12 +82,15 @@ struct ParallelReduceSpecialize, using ParReduceCopy = ParallelReduceCopy; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " 
"parallel_reduce:reducer"); const auto begin = p.begin(); @@ -104,33 +107,27 @@ struct ParallelReduceSpecialize, return; } -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), ptr_on_device); } - template - static void execute_array(const FunctorType& f, const PolicyType& p, + template + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); const auto begin = p.begin(); @@ -150,27 +147,14 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. 
if constexpr (std::is_arithmetic::value) { -#pragma omp target teams distribute parallel for \ - map(to:f) reduction(+: result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result) + for (auto i = begin; i < end; ++i) f(i, result); } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) + for (auto i = begin; i < end; ++i) f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), @@ -186,13 +170,10 @@ struct ParallelReduceSpecialize, ptr_on_device); return; } -#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result[ : NumReductions]) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result( @@ -200,12 +181,12 @@ struct ParallelReduceSpecialize, } } - static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:init_join"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " 
"parallel_reduce:init_join"); const auto begin = p.begin(); @@ -219,23 +200,25 @@ struct ParallelReduceSpecialize, const auto size = end - begin; - // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently + // FIXME_OPENMPTARGET: The team size and concurrency are currently // based on NVIDIA-V100 and should be modifid to be based on the // architecture in the future. const int max_team_threads = 32; const int max_teams = - OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads; + p.space().impl_internal_space_instance()->concurrency() / + max_team_threads; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. Achieved by setting the first // parameter of `resize_scratch=1`. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - std::numeric_limits::max()); - ValueType* scratch_ptr = - static_cast(OpenMPTargetExec::get_scratch_ptr()); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), + std::numeric_limits::max()); + ValueType* scratch_ptr = static_cast( + p.space().impl_internal_space_instance()->get_scratch_ptr()); - typename FunctorAnalysis::Reducer final_reducer(f); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { #pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr) @@ -260,8 +243,7 @@ struct ParallelReduceSpecialize, } #pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ - map(to \ - : final_reducer) is_device_ptr(scratch_ptr) + map(to : final_reducer) is_device_ptr(scratch_ptr) { #pragma omp parallel { @@ -279,11 +261,7 @@ struct ParallelReduceSpecialize, // Accumulate partial results in thread specific storage. 
#pragma omp for simd for (auto i = team_begin; i < team_end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } // Reduce all paritial results within a team. @@ -304,8 +282,7 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ +#pragma omp target teams distribute parallel for simd map(to : f) \ is_device_ptr(scratch_ptr) for (int i = 0; i < max_teams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { @@ -344,7 +321,6 @@ template , ReducerType, PointerType, ValueType> { using PolicyType = TeamPolicyInternal; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t::value, FunctorType, ReducerType>; @@ -355,12 +331,15 @@ struct ParallelReduceSpecialize, using ParReduceCopy = ParallelReduceCopy; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); @@ -370,9 +349,11 @@ struct ParallelReduceSpecialize, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, 
shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); ValueType result = ValueType(); @@ -383,16 +364,15 @@ struct ParallelReduceSpecialize, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. if (max_active_teams <= 0) return; -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) KOKKOS_IMPL_OMPTARGET_PRAGMA( @@ -414,16 +394,13 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #else #pragma omp target teams distribute firstprivate(f) is_device_ptr(scratch_ptr) \ - num_teams(max_active_teams) thread_limit(team_size) reduction(custom \ - : result) + num_teams(max_active_teams) thread_limit(team_size) \ + reduction(custom : result) for (int i = 0; i < league_size; i++) { #pragma omp parallel reduction(custom : result) { @@ -433,10 +410,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #endif @@ -447,12 +421,12 @@ struct ParallelReduceSpecialize, } template - static void 
execute_array(const FunctorType& f, const PolicyType& p, + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); @@ -462,9 +436,11 @@ struct ParallelReduceSpecialize, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); // Maximum active teams possible. // FIXME_OPENMPTARGET: Cray compiler did not yet implement @@ -473,7 +449,7 @@ struct ParallelReduceSpecialize, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. @@ -504,19 +480,14 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } else { // Case where the reduction is on a non-native data type. 
#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) #pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - map(to \ - : f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + map(to : f) is_device_ptr(scratch_ptr) reduction(custom : result) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -531,10 +502,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } @@ -545,10 +513,10 @@ struct ParallelReduceSpecialize, } else { ValueType result[NumReductions] = {}; // Case where the reduction is on an array. -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) -#pragma omp parallel reduction(+ : result[:NumReductions]) +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ + map(to : f) is_device_ptr(scratch_ptr) \ + reduction(+ : result[ : NumReductions]) +#pragma omp parallel reduction(+ : result[ : NumReductions]) { if (omp_get_num_teams() > max_active_teams) Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); @@ -562,10 +530,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } @@ -577,12 +542,12 @@ struct ParallelReduceSpecialize, // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over // RangePolicy. Need a new implementation. 
- static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join "); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join"); using FunctorAnalysis = @@ -611,13 +576,14 @@ struct ParallelReduceSpecialize, const auto nteams = league_size; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - typename FunctorAnalysis::Reducer final_reducer(f); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { // If there is no work to be done, copy back the initialized values and @@ -661,11 +627,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, team_num, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) { - f(team, result); - } else { - f(TagType(), team, result); - } + f(team, result); } } // end parallel } // end target @@ -673,7 +635,7 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { #pragma omp target teams distribute parallel for simd firstprivate( \ - final_reducer) 
is_device_ptr(scratch_ptr) + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 9b578aca11..4308fb042a 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -34,9 +34,6 @@ struct OpenMPTargetReducerWrapper { KOKKOS_INLINE_FUNCTION static void join(value_type&, const value_type&) = delete; - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type&, const volatile value_type&) = delete; - KOKKOS_INLINE_FUNCTION static void init(value_type&) = delete; }; @@ -51,11 +48,6 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest += src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest += src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::sum(); @@ -72,11 +64,6 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest *= src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest *= src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::prod(); @@ -95,11 +82,6 @@ struct OpenMPTargetReducerWrapper> { if (src < dest) dest = src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src < dest) dest = src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::min(); @@ -118,11 +100,6 @@ struct OpenMPTargetReducerWrapper> { if (src > dest) dest = src; } - 
KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src > dest) dest = src; - } - // Required KOKKOS_INLINE_FUNCTION static void init(value_type& val) { @@ -141,11 +118,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest && src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest && src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::land(); @@ -166,11 +138,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest || src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest || src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::lor(); @@ -189,11 +156,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest & src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest & src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::band(); @@ -212,11 +174,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest | src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest | src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::bor(); @@ -236,12 +193,12 @@ struct OpenMPTargetReducerWrapper> { // Required KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val < dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -263,12 +220,12 @@ struct OpenMPTargetReducerWrapper> { 
KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val > dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -298,16 +255,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity::max(); @@ -331,22 +278,16 @@ struct OpenMPTargetReducerWrapper> { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity::min()) { dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity::min()) { + dest.max_loc = src.max_loc; } } @@ -385,15 +326,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (dest.val < src.val) { - dest = src; - } else if (!(src.val < dest.val)) { - dest.loc = (src.loc < dest.loc) ? 
src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity::max(); @@ -428,15 +360,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) { - dest = src; - } else if (!(dest.val < src.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity::min(); @@ -480,23 +403,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } else if (!(dest.min_val < src.min_val)) { - dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; - } - - if (dest.max_val < src.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } else if (!(src.max_val < dest.max_val)) { - dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity::max(); @@ -531,13 +437,6 @@ struct OpenMPTargetReducerWrapper> { : dest.min_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) - ? src.min_loc_true - : dest.min_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_true = reduction_identity::min(); @@ -569,13 +468,6 @@ struct OpenMPTargetReducerWrapper> { : dest.max_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) - ? 
src.max_loc_true - : dest.max_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = reduction_identity::max(); @@ -611,17 +503,6 @@ struct OpenMPTargetReducerWrapper> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = ::Kokkos::reduction_identity::max(); @@ -654,13 +535,6 @@ struct OpenMPTargetReducerWrapper> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_false = ::Kokkos::reduction_identity::min(); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp deleted file mode 100644 index 458c4c9a43..0000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp +++ /dev/null @@ -1,251 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ENABLE_TASKPOLICY) - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template class TaskQueue; - -//---------------------------------------------------------------------------- - -TaskExec::TaskExec() - : m_self_exec(0), - m_team_exec(0), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(0), - m_team_rank(0), - m_team_size(1) {} - -TaskExec::TaskExec( - Kokkos::Impl::OpenMPTargetExec &arg_exec, int const arg_team_size) - : m_self_exec(&arg_exec), - m_team_exec(arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size)), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(arg_exec.pool_rank_rev() / arg_team_size), - m_team_rank(arg_exec.pool_rank_rev() % arg_team_size), - m_team_size(arg_team_size) { - // This team spans - // m_self_exec->pool_rev( team_size * group_rank ) - // m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 ) - - int64_t volatile *const sync = (int64_t *)m_self_exec->scratch_reduce(); - - sync[0] = int64_t(0); - sync[1] = int64_t(0); - - for (int i = 0; i < m_team_size; ++i) { - m_sync_value |= int64_t(1) << (8 * i); - m_sync_mask |= int64_t(3) << (8 * i); - } - - Kokkos::memory_fence(); -} - -void TaskExec::team_barrier_impl() const { - if (m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t))) { - Kokkos::abort("TaskQueue scratch_reduce memory too small"); - } - - // Use team shared memory to synchronize. - // Alternate memory locations between barriers to avoid a sequence - // of barriers overtaking one another. 
- - int64_t volatile *const sync = - ((int64_t *)m_team_exec->scratch_reduce()) + (m_sync_step & 0x01); - - // This team member sets one byte within the sync variable - int8_t volatile *const sync_self = ((int8_t *)sync) + m_team_rank; - - *sync_self = int8_t(m_sync_value & 0x03); // signal arrival - - while (m_sync_value != *sync) - ; // wait for team to arrive - - ++m_sync_step; - - if (0 == (0x01 & m_sync_step)) { // Every other step - m_sync_value ^= m_sync_mask; - if (1000 < m_sync_step) m_sync_step = 0; - } -} - -//---------------------------------------------------------------------------- - -void TaskQueueSpecialization::execute( - TaskQueue *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue; - using task_root_type = TaskBase; - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - using Member = TaskExec; - - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - // Required: team_size <= 8 - - const int team_size = PoolExec::pool_size(2); // Threads per core - // const int team_size = PoolExec::pool_size(1); // Threads per NUMA - - if (8 < team_size) { - Kokkos::abort("TaskQueue unsupported team size"); - } - -#pragma omp parallel - { - PoolExec &self = *PoolExec::get_thread_omp(); - - Member single_exec; - Member team_exec(self, team_size); - - // Team shared memory - task_root_type *volatile *const task_shared = - (task_root_type **)team_exec.m_team_exec->scratch_thread(); - -// Barrier across entire OpenMPTarget thread pool to insure initialization -#pragma omp barrier - - // Loop until all queues are empty and no tasks in flight - - do { - task_root_type *task = 0; - - // Each team lead attempts to acquire either a thread team task - // or a single thread task for the team. - - if (0 == team_exec.team_rank()) { - task = 0 < *((volatile int *)&queue->m_ready_count) ? 
end : 0; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - } - - // Team lead broadcast acquired task to team members: - - if (1 < team_exec.team_size()) { - if (0 == team_exec.team_rank()) *task_shared = task; - - // Fence to be sure task_shared is stored before the barrier - Kokkos::memory_fence(); - - // Whole team waits for every team member to reach this statement - team_exec.team_barrier(); - - // Fence to be sure task_shared is stored - Kokkos::memory_fence(); - - task = *task_shared; - } - - if (0 == task) break; // 0 == m_ready_count - - if (end == task) { - // All team members wait for whole team to reach this statement. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. - team_exec.team_barrier(); - } else if (task_root_type::TaskTeam == task->m_task_type) { - // Thread Team Task - (*task->m_apply)(task, &team_exec); - - // The m_apply function performs a barrier - - if (0 == team_exec.team_rank()) { - // team member #0 completes the task, which may delete the task - queue->complete(task); - } - } else { - // Single Thread Task - - if (0 == team_exec.team_rank()) { - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - } - - // All team members wait for whole team to reach this statement. - // Not necessary to complete the task. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. 
- team_exec.team_barrier(); - } - } while (1); - } - // END #pragma omp parallel -} - -void TaskQueueSpecialization:: - iff_single_thread_recursive_execute( - TaskQueue *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue; - using task_root_type = TaskBase; - using Member = TaskExec; - - if (1 == omp_get_num_threads()) { - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - Member single_exec; - - task_root_type *task = end; - - do { - task = end; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - - if (end == task) break; - - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - - } while (1); - } -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( \ - KOKKOS_ENABLE_TASKPOLICY ) */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp deleted file mode 100644 index c9aa7b128f..0000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp +++ /dev/null @@ -1,319 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP -#define KOKKOS_IMPL_OPENMP_TASK_HPP - -#if defined(KOKKOS_ENABLE_TASKPOLICY) - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <> -class TaskQueueSpecialization { - public: - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = Kokkos::Impl::TaskQueue; - using task_base_type = Kokkos::Impl::TaskBase; - - // Must specify memory space - using memory_space = Kokkos::HostSpace; - - static void iff_single_thread_recursive_execute(queue_type* const); - - // Must provide task queue execution function - static void execute(queue_type* const); - - // Must provide mechanism to set function pointer in - // execution space from the host process. - template - static void proc_set_apply(task_base_type::function_type* ptr) { - using TaskType = TaskBase; - *ptr = TaskType::apply; - } -}; - -extern template class TaskQueue; - -//---------------------------------------------------------------------------- - -template <> -class TaskExec { - private: - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; - TaskExec& operator=(TaskExec const&) = delete; - - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - - friend class Kokkos::Impl::TaskQueue; - friend class Kokkos::Impl::TaskQueueSpecialization< - Kokkos::Experimental::OpenMPTarget>; - - PoolExec* const m_self_exec; ///< This thread's thread pool data structure - PoolExec* const m_team_exec; ///< Team thread's thread pool data structure - int64_t m_sync_mask; - int64_t mutable m_sync_value; - int mutable m_sync_step; - int m_group_rank; ///< Which "team" subset of thread pool - int m_team_rank; ///< Which thread within a team - int m_team_size; - - TaskExec(); - 
TaskExec(PoolExec& arg_exec, int arg_team_size); - - void team_barrier_impl() const; - - public: - KOKKOS_FUNCTION void* team_shared() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread() : nullptr;)) - - KOKKOS_IF_ON_DEVICE((return nullptr;)) - } - - KOKKOS_FUNCTION int team_shared_size() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread_size() : 0;)) - - KOKKOS_IF_ON_DEVICE((return 0;)) - } - - /**\brief Whole team enters this function call - * before any teeam member returns from - * this function call. - */ - KOKKOS_FUNCTION void team_barrier() const { - KOKKOS_IF_ON_HOST((if (1 < m_team_size) { team_barrier_impl(); })) - } - - KOKKOS_INLINE_FUNCTION - int team_rank() const { return m_team_rank; } - - KOKKOS_INLINE_FUNCTION - int team_size() const { return m_team_size; } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec > -TeamThreadRange(Impl::TaskExec& thread, - const iType& count) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >(thread, - count); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec > -TeamThreadRange(Impl::TaskExec& thread, - const iType& start, const iType& end) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >(thread, start, - end); -} - -/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all threads of the the calling thread team. 
- */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) { - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i); - } -} - -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - shared[0] += shared[i]; - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - 
loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - join(shared[0], shared[i]); - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) {} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { -} - -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) { - ValueType accum = 0; - ValueType val, local_total; - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - int team_size = loop_boundaries.thread.team_size(); - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - - // Intra-member scan - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } - - shared[team_rank] = accum; - loop_boundaries.thread.team_barrier(); - - // Member 0 do scan on accumulated totals - if (team_rank == 0) { - for (iType i = 1; i < team_size; i += 1) { - shared[i] += shared[i - 1]; - } - accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan - } - - loop_boundaries.thread.team_barrier(); - - // Inter-member scan adding in accumulated totals - if (team_rank != 0) { - accum = 
shared[team_rank - 1]; - } - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } -} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) {} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ -#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */ diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 4de6931918..2583a1cdc0 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -46,7 +46,6 @@ struct Container { } // namespace namespace Kokkos { -namespace Experimental { SYCL::SYCL() : m_space_instance(&Impl::SYCLInternal::singleton(), [](Impl::SYCLInternal*) {}) { @@ -100,6 +99,11 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif +#ifdef KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : defined\n"; +#else + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : undefined\n"; +#endif #ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; #else @@ -172,8 +176,7 @@ void SYCL::fence(const std::string& name) const { } void SYCL::impl_static_fence(const std::string& name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event( name, 
Kokkos::Tools::Experimental::SpecialSynchronizationCases:: GlobalDeviceSynchronization, @@ -261,8 +264,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, << device.get_info() << "\nImage Max Buffer Size: " << device.get_info() - << "\nImage Max Array Size: " - << device.get_info() << "\nMax Samplers: " << device.get_info() << "\nMax Parameter Size: " << device.get_info() @@ -317,5 +318,4 @@ int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory("170_SYCL"); } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp index 0f3d1f0994..937dcceab4 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp @@ -39,7 +39,6 @@ static_assert(false, #include namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal; } @@ -91,9 +90,8 @@ class SYCL { /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); - void fence( - const std::string& name = - "Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence") const; + void fence(const std::string& name = + "Kokkos::SYCL::fence: Unnamed Instance Fence") const; /// \brief Print configuration information to the given output stream. 
void print_configuration(std::ostream& os, bool verbose = false) const; @@ -131,15 +129,13 @@ class SYCL { Kokkos::Impl::HostSharedPtr m_space_instance; }; -} // namespace Experimental - namespace Tools { namespace Experimental { template <> -struct DeviceTypeTraits { +struct DeviceTypeTraits { /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling static constexpr DeviceType id = DeviceType::SYCL; - static int device_id(const Kokkos::Experimental::SYCL& exec) { + static int device_id(const Kokkos::SYCL& exec) { return exec.impl_internal_space_instance()->m_syclDev; } }; @@ -185,10 +181,11 @@ std::vector partition_space(const SYCL& sycl_space, return instances; } +} // namespace Experimental + namespace Impl { std::vector get_sycl_devices(); } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp index afc7eebd38..a9e2eca4fb 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp @@ -28,37 +28,34 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n); -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n); +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n); void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n); template -struct DeepCopy::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template -struct DeepCopy::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, 
void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template -struct DeepCopy::value && is_sycl_type_space::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; @@ -66,10 +63,9 @@ struct DeepCopy struct DeepCopy< MemSpace1, MemSpace2, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + is_sycl_type_space::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -93,9 +89,8 @@ struct DeepCopy< template struct DeepCopy< MemSpace, HostSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -118,9 +113,8 @@ struct DeepCopy< template struct DeepCopy< HostSpace, MemSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp index 9c39df9415..54ca645995 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp @@ -32,30 +32,29 @@ namespace Impl { template -class GraphNodeKernelImpl - : public PatternImplSpecializationFromTag< - PatternTag, Functor, PolicyType, Args..., 
- Kokkos::Experimental::SYCL>::type { +class GraphNodeKernelImpl + : public PatternImplSpecializationFromTag::type { public: using Policy = PolicyType; using graph_kernel = GraphNodeKernelImpl; - using base_t = typename PatternImplSpecializationFromTag< - PatternTag, Functor, Policy, Args..., Kokkos::Experimental::SYCL>::type; + using base_t = + typename PatternImplSpecializationFromTag::type; // TODO use the name and executionspace template - GraphNodeKernelImpl(std::string, Kokkos::Experimental::SYCL const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + GraphNodeKernelImpl(std::string, Kokkos::SYCL const&, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...) {} template - GraphNodeKernelImpl(Kokkos::Experimental::SYCL const& exec_space, - Functor arg_functor, PolicyDeduced&& arg_policy) + GraphNodeKernelImpl(Kokkos::SYCL const& exec_space, Functor arg_functor, + PolicyDeduced&& arg_policy) : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} + (PolicyDeduced&&)arg_policy) {} void set_sycl_graph_ptr( sycl::ext::oneapi::experimental::command_graph< @@ -102,14 +101,14 @@ template ::type> struct get_graph_node_kernel_type - : type_identity> {}; + : type_identity< + GraphNodeKernelImpl> {}; template struct get_graph_node_kernel_type : type_identity, Kokkos::ParallelReduceTag>> {}; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp index 6bbe6711a2..828f1cacb4 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp @@ -28,7 +28,7 @@ namespace Kokkos { namespace Impl { template <> -struct GraphNodeBackendSpecificDetails { +struct 
GraphNodeBackendSpecificDetails { std::optional node; explicit GraphNodeBackendSpecificDetails() = default; @@ -38,16 +38,16 @@ struct GraphNodeBackendSpecificDetails { }; template -struct GraphNodeBackendDetailsBeforeTypeErasure { +struct GraphNodeBackendDetailsBeforeTypeErasure { protected: GraphNodeBackendDetailsBeforeTypeErasure( - Kokkos::Experimental::SYCL const &, Kernel &, PredecessorRef const &, - GraphNodeBackendSpecificDetails &) noexcept {} + Kokkos::SYCL const &, Kernel &, PredecessorRef const &, + GraphNodeBackendSpecificDetails &) noexcept {} GraphNodeBackendDetailsBeforeTypeErasure( - Kokkos::Experimental::SYCL const &, _graph_node_is_root_ctor_tag, - GraphNodeBackendSpecificDetails &) noexcept {} + Kokkos::SYCL const &, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails &) noexcept {} }; } // namespace Impl diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp index 1dc4a9c997..dc63052dd7 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp @@ -31,29 +31,28 @@ namespace Kokkos { namespace Impl { template <> -class GraphImpl { +class GraphImpl { public: - using node_details_t = - GraphNodeBackendSpecificDetails; - using root_node_impl_t = GraphNodeImpl; + using node_details_t = GraphNodeBackendSpecificDetails; + using root_node_impl_t = + GraphNodeImpl; using aggregate_kernel_impl_t = SYCLGraphNodeAggregateKernel; using aggregate_node_impl_t = - GraphNodeImpl; // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. 
- GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); - explicit GraphImpl(Kokkos::Experimental::SYCL instance); + explicit GraphImpl(Kokkos::SYCL instance); void add_node(std::shared_ptr const& arg_node_ptr); @@ -63,19 +62,25 @@ class GraphImpl { template void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::SYCL& exec); - Kokkos::Experimental::SYCL const& get_execution_space() const noexcept; + Kokkos::SYCL const& get_execution_space() const noexcept; auto create_root_node_ptr(); template auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { m_graph_exec = m_graph.finalize(); } + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec.has_value()); + m_graph_exec = m_graph.finalize(); + } - Kokkos::Experimental::SYCL m_execution_space; + auto& sycl_graph() { return m_graph; } + auto& sycl_graph_exec() { return m_graph_exec; } + + private: + Kokkos::SYCL m_execution_space; sycl::ext::oneapi::experimental::command_graph< sycl::ext::oneapi::experimental::graph_state::modifiable> m_graph; @@ -84,17 +89,16 @@ class GraphImpl { m_graph_exec; }; -inline GraphImpl::~GraphImpl() { +inline GraphImpl::~GraphImpl() { m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); } -inline GraphImpl::GraphImpl( - Kokkos::Experimental::SYCL instance) +inline GraphImpl::GraphImpl(Kokkos::SYCL instance) : m_execution_space(std::move(instance)), m_graph(m_execution_space.sycl_queue().get_context(), m_execution_space.sycl_queue().get_device()) {} -inline void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { // add an empty node that 
needs to be set up before finalizing the graph arg_node_ptr->node_details_t::node = m_graph.add(); @@ -103,7 +107,7 @@ inline void GraphImpl::add_node( // Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl // Also requires that the kernel has the graph node tag in its policy template -inline void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(arg_node_ptr); @@ -122,7 +126,7 @@ inline void GraphImpl::add_node( // already been added to this graph and NodeImpl is a specialization of // GraphNodeImpl that has already been added to this graph. template -inline void GraphImpl::add_predecessor( +inline void GraphImpl::add_predecessor( NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { KOKKOS_EXPECTS(arg_node_ptr); auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); @@ -137,19 +141,19 @@ inline void GraphImpl::add_predecessor( m_graph.make_edge(*pred_node, *node); } -inline void GraphImpl::submit() { +inline void GraphImpl::submit(const Kokkos::SYCL& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - m_execution_space.sycl_queue().ext_oneapi_graph(*m_graph_exec); + exec.sycl_queue().ext_oneapi_graph(*m_graph_exec); } -inline Kokkos::Experimental::SYCL const& -GraphImpl::get_execution_space() const noexcept { +inline Kokkos::SYCL const& GraphImpl::get_execution_space() + const noexcept { return m_execution_space; } -inline auto GraphImpl::create_root_node_ptr() { +inline auto GraphImpl::create_root_node_ptr() { KOKKOS_EXPECTS(!m_graph_exec); auto rv = std::make_shared(get_execution_space(), _graph_node_is_root_ctor_tag{}); @@ -158,7 +162,7 @@ inline auto GraphImpl::create_root_node_ptr() { } template -inline auto GraphImpl::create_aggregate_ptr( +inline auto GraphImpl::create_aggregate_ptr( PredecessorRefs&&...) 
{ // The attachment to predecessors, which is all we really need, happens // in the generic layer, which calls through to add_predecessor for diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 5843dca812..5af1330d93 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -24,14 +24,12 @@ #include namespace Kokkos { -namespace Experimental { namespace Impl { namespace { // FIXME_SYCL Should be a multiple of the maximum subgroup size. -static constexpr auto sizeScratchGrain = - sizeof(Kokkos::Experimental::SYCL::size_type[32]); +static constexpr auto sizeScratchGrain = sizeof(Kokkos::SYCL::size_type[32]); std::size_t scratch_count(const std::size_t size) { return (size + sizeScratchGrain - 1) / sizeScratchGrain; @@ -55,8 +53,8 @@ Kokkos::View sycl_global_unique_token_locks( SYCLInternal::~SYCLInternal() { if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { - std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " - "Kokkos::Experimental::SYCL::finalize()" + std::cerr << "Kokkos::SYCL ERROR: Failed to call " + "Kokkos::SYCL::finalize()" << std::endl; std::cerr.flush(); } @@ -64,7 +62,7 @@ SYCLInternal::~SYCLInternal() { int SYCLInternal::verify_is_initialized(const char* const label) const { if (!is_initialized()) { - Kokkos::abort((std::string("Kokkos::Experimental::SYCL::") + label + + Kokkos::abort((std::string("Kokkos::SYCL::") + label + " : ERROR device not initialized\n") .c_str()); } @@ -171,12 +169,12 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
- auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (m_team_scratch_current_size[scratch_pool_id] == 0 && bytes > 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && @@ -184,9 +182,9 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -255,7 +253,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchSpace) mem_space.deallocate(m_scratchSpace, @@ -265,8 +263,8 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - m_scratchSpace = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchSpace", 
alloc_size)); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -276,7 +274,7 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchHost) mem_space.deallocate(m_scratchHost, @@ -286,8 +284,8 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchHostCount, sizeScratchGrain); - m_scratchHost = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size)); + m_scratchHost = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchHost", alloc_size)); } return m_scratchHost; @@ -297,7 +295,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) mem_space.deallocate(m_scratchFlags, @@ -307,8 +305,8 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - m_scratchFlags = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchFlags", alloc_size)); // We only zero-initialize the allocation when we actually allocate. 
// It's the responsibility of the features using scratch_flags, @@ -326,8 +324,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( template void SYCLInternal::fence_helper(WAT& wat, const std::string& name, uint32_t instance_id) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event( name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{instance_id}, [&]() { try { @@ -364,8 +361,7 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { AllocationSpace alloc_space(*m_q); if (m_data) alloc_space.deallocate(m_data, m_capacity); - m_data = - alloc_space.allocate("Kokkos::Experimental::SYCL::USMObjectMem", n); + m_data = alloc_space.allocate("Kokkos::SYCL::USMObjectMem", n); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); @@ -396,5 +392,4 @@ template class SYCLInternal::USMObjectMem; template class SYCLInternal::USMObjectMem; } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 2d784ef8a5..c982154a9a 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -28,7 +28,6 @@ #include #include namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal { @@ -38,10 +37,10 @@ class SYCLInternal { SYCLInternal() = default; ~SYCLInternal(); - SYCLInternal(const SYCLInternal&) = delete; + SYCLInternal(const SYCLInternal&) = delete; SYCLInternal& operator=(const SYCLInternal&) = delete; - SYCLInternal& operator=(SYCLInternal&&) = delete; - SYCLInternal(SYCLInternal&&) = delete; + SYCLInternal& operator=(SYCLInternal&&) = delete; + SYCLInternal(SYCLInternal&&) = delete; Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); Kokkos::Impl::sycl_device_ptr scratch_flags(const std::size_t size); @@ -76,8 +75,9 
@@ class SYCLInternal { mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; - uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< - Kokkos::Experimental::SYCL>(reinterpret_cast(this)); + uint32_t m_instance_id = + Kokkos::Tools::Experimental::Impl::idForInstance( + reinterpret_cast(this)); std::optional m_queue; // Using std::vector> reveals a compiler bug when @@ -102,9 +102,9 @@ class SYCLInternal { explicit USMObjectMem(sycl::queue q, uint32_t instance_id) noexcept : m_q(std::move(q)), m_instance_id(instance_id) {} - USMObjectMem(USMObjectMem const&) = delete; - USMObjectMem(USMObjectMem&&) = delete; - USMObjectMem& operator=(USMObjectMem&&) = delete; + USMObjectMem(USMObjectMem const&) = delete; + USMObjectMem(USMObjectMem&&) = delete; + USMObjectMem& operator=(USMObjectMem&&) = delete; USMObjectMem& operator=(USMObjectMem const&) = delete; ~USMObjectMem() { reset(); }; @@ -119,12 +119,12 @@ class SYCLInternal { size_t reserve(size_t n); private: - using AllocationSpace = std::conditional_t< - Kind == sycl::usm::alloc::device, - Kokkos::Experimental::SYCLDeviceUSMSpace, - std::conditional_t>; + using AllocationSpace = + std::conditional_t>; public: // Performs either sycl::memcpy (for USM device memory) or std::memcpy @@ -144,11 +144,10 @@ class SYCLInternal { } void fence() { - SYCLInternal::fence( - m_last_event, - "Kokkos::Experimental::SYCLInternal::USMObject fence to wait for " - "last event to finish", - m_instance_id); + SYCLInternal::fence(m_last_event, + "Kokkos::SYCLInternal::USMObject fence to wait for " + "last event to finish", + m_instance_id); } void register_event(sycl::event event) { @@ -324,13 +323,12 @@ auto make_sycl_function_wrapper(const Functor& functor, Storage& storage) { return SYCLFunctionWrapper(functor, storage); } } // namespace Impl -} // namespace Experimental } // namespace Kokkos #if defined(SYCL_DEVICE_COPYABLE) && defined(KOKKOS_ARCH_INTEL_GPU) 
template struct sycl::is_device_copyable< - Kokkos::Experimental::Impl::SYCLFunctionWrapper> + Kokkos::Impl::SYCLFunctionWrapper> : std::true_type {}; #if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ @@ -352,8 +350,7 @@ static_assert( template struct sycl::is_device_copyable< - const Kokkos::Experimental::Impl::SYCLFunctionWrapper, + const Kokkos::Impl::SYCLFunctionWrapper, std::enable_if_t>>> : std::true_type {}; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp index d212e2dacc..9498513a3e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp @@ -22,13 +22,13 @@ namespace Kokkos { template <> -struct default_outer_direction { +struct default_outer_direction { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; template <> -struct default_inner_direction { +struct default_inner_direction { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; @@ -37,8 +37,8 @@ namespace Impl { // Settings for MDRangePolicy template <> -inline TileSizeProperties get_tile_size_properties( - const Kokkos::Experimental::SYCL& space) { +inline TileSizeProperties get_tile_size_properties( + const Kokkos::SYCL& space) { TileSizeProperties properties; properties.max_threads = space.impl_internal_space_instance()->m_maxWorkgroupSize; @@ -50,8 +50,7 @@ inline TileSizeProperties get_tile_size_properties( // Settings for TeamMDRangePolicy template -struct ThreadAndVectorNestLevel +struct ThreadAndVectorNestLevel : AcceleratorBasedNestLevel {}; } // namespace Impl diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index cb7b1048da..3dbd63d81a 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -25,7 
+25,7 @@ template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy; @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelFor, const typename Policy::index_type m_num_tiles; static constexpr Iterate inner_direction = Policy::inner_direction; } m_policy; - const Kokkos::Experimental::SYCL& m_space; + const Kokkos::SYCL& m_space; sycl::nd_range<3> compute_ranges() const { const auto& m_tile = m_policy.m_tile; @@ -180,12 +180,11 @@ class Kokkos::Impl::ParallelFor, } void execute() const { - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = - m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 8ef43d392c..da75f3e901 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -67,7 +67,7 @@ struct FunctorWrapperRangePolicyParallelForCustom { template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy; @@ -82,8 +82,8 @@ class Kokkos::Impl::ParallelFor, sycl::event sycl_direct_launch(const Policy& policy, const Functor& functor, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); - sycl::queue& q = space.sycl_queue(); + const 
Kokkos::SYCL& space = policy.space(); + sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -160,13 +160,13 @@ class Kokkos::Impl::ParallelFor, void execute() const { if (m_policy.begin() == m_policy.end()) return; - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = m_policy.space() - .impl_internal_space_instance() - ->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_policy.space() + .impl_internal_space_instance() + ->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index cf7f582bc7..d8859cda9f 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -27,11 +27,11 @@ template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = TeamPolicy; using functor_type = FunctorType; - using size_type = ::Kokkos::Experimental::SYCL::size_type; + using size_type = ::Kokkos::SYCL::size_type; private: using member_type = typename Policy::member_type; @@ -52,8 +52,8 @@ class Kokkos::Impl::ParallelFor, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - sycl::queue& q = space.sycl_queue(); + const Kokkos::SYCL& space = m_policy.space(); + sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -146,11 +146,11 @@ class Kokkos::Impl::ParallelFor, 
scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); @@ -164,10 +164,14 @@ class Kokkos::Impl::ParallelFor, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()) { - // FIXME_SYCL optimize - if (m_team_size < 0) + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended(arg_functor, ParallelForTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 0774b24bca..1e31354975 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -30,7 +30,7 @@ template class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -76,7 +76,7 @@ class Kokkos::Impl::ParallelReduce::accessible) {} private: @@ -85,7 +85,7 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( instance.m_mutexScratchSpace); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using 
IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch( functor_reducer_wrapper, functor_reducer_wrapper.get_copy_event()); @@ -370,7 +368,7 @@ class Kokkos::Impl::ParallelReduce template -class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { +class Kokkos::Impl::ParallelReduce< + CombinedFunctorReducerType, Kokkos::RangePolicy, Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -49,7 +48,7 @@ class Kokkos::Impl::ParallelReduce::accessible) {} private: @@ -59,8 +58,8 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( instance.m_mutexScratchSpace); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_reducer_wrapper, diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index b443bcbf90..8f5310cbb2 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -29,7 +29,7 @@ template class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = TeamPolicy; using FunctorType = typename 
CombinedFunctorReducerType::functor_type; @@ -46,7 +46,7 @@ class Kokkos::Impl::ParallelReduce(m_scratch_size[1]) * m_league_size)); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_reducer_wrapper, @@ -436,16 +434,21 @@ class Kokkos::Impl::ParallelReduce::accessible), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()) { - // FIXME_SYCL optimize - if (m_team_size < 0) + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended( m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(), ParallelReduceTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } + // Must be a power of two greater than two, get the one not bigger than the // requested one. 
if ((m_team_size & m_team_size - 1) || m_team_size < 2) { @@ -461,7 +464,7 @@ class Kokkos::Impl::ParallelReduce(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index bdb5b88377..ed7cee2805 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -145,7 +145,7 @@ class ParallelScanSYCLBase { using value_type = typename Analysis::value_type; using reference_type = typename Analysis::reference_type; using functor_type = FunctorType; - using size_type = Kokkos::Experimental::SYCL::size_type; + using size_type = Kokkos::SYCL::size_type; using index_type = typename Policy::index_type; protected: @@ -161,8 +161,8 @@ class ParallelScanSYCLBase { sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, sycl::event memcpy_event) { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - Kokkos::Experimental::Impl::SYCLInternal& instance = + const Kokkos::SYCL& space = m_policy.space(); + Kokkos::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -374,11 +374,11 @@ class ParallelScanSYCLBase { std::scoped_lock scratch_buffers_lock( instance.m_mutexScratchSpace); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor_reducer, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); @@ -399,7 +399,7 @@ class ParallelScanSYCLBase { template 
class Kokkos::Impl::ParallelScan, - Kokkos::Experimental::SYCL> + Kokkos::SYCL> : private ParallelScanSYCLBase { public: using Base = ParallelScanSYCLBase; @@ -417,13 +417,12 @@ class Kokkos::Impl::ParallelScan, template class Kokkos::Impl::ParallelScanWithTotal< - FunctorType, Kokkos::RangePolicy, ReturnType, - Kokkos::Experimental::SYCL> + FunctorType, Kokkos::RangePolicy, ReturnType, Kokkos::SYCL> : public ParallelScanSYCLBase { public: using Base = ParallelScanSYCLBase; - const Kokkos::Experimental::SYCL& m_exec; + const Kokkos::SYCL& m_exec; inline void execute() { Base::impl_execute([&]() { @@ -445,7 +444,7 @@ class Kokkos::Impl::ParallelScanWithTotal< const typename Base::Policy& arg_policy, const ViewType& arg_result_view) : Base(arg_functor, arg_policy, arg_result_view.data(), - MemorySpaceAccess::accessible), m_exec(arg_policy.space()) {} }; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp index 19fad29150..022f88e0a8 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -33,11 +33,11 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); } -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n) { sycl::queue& q = *instance.impl_internal_space_instance()->m_queue; auto event = q.memcpy(dst, src, n); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -46,9 +46,8 @@ void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, } void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); - Experimental::SYCL().fence( - 
"Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + SYCL().fence("Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); } } // namespace Impl @@ -60,12 +59,9 @@ namespace { std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { switch (allocation_kind) { - case sycl::usm::alloc::host: - return Kokkos::Experimental::SYCLHostUSMSpace::name(); - case sycl::usm::alloc::device: - return Kokkos::Experimental::SYCLDeviceUSMSpace::name(); - case sycl::usm::alloc::shared: - return Kokkos::Experimental::SYCLSharedUSMSpace::name(); + case sycl::usm::alloc::host: return Kokkos::SYCLHostUSMSpace::name(); + case sycl::usm::alloc::device: return Kokkos::SYCLDeviceUSMSpace::name(); + case sycl::usm::alloc::shared: return Kokkos::SYCLSharedUSMSpace::name(); default: Kokkos::abort("bug: unknown sycl allocation type"); return "unreachable"; @@ -75,7 +71,6 @@ std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { } // namespace namespace Kokkos { -namespace Experimental { SYCLDeviceUSMSpace::SYCLDeviceUSMSpace() : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {} @@ -114,12 +109,12 @@ void* allocate_sycl(const char* arg_label, const size_t arg_alloc_size, return hostPtr; } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const size_t arg_alloc_size) const { return allocate(exec_space, "[unlabeled]", arg_alloc_size); } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { @@ -244,7 +239,6 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, Kokkos::Tools::make_space_handle(name()), m_queue); } -} // namespace Experimental } // namespace Kokkos 
//============================================================================== @@ -253,11 +247,11 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, #include KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLDeviceUSMSpace); + Kokkos::SYCLDeviceUSMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLSharedUSMSpace); + Kokkos::SYCLSharedUSMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLHostUSMSpace); + Kokkos::SYCLHostUSMSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp index b86cfca413..5a37da130c 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -39,8 +39,6 @@ template struct is_sycl_type_space : public std::false_type {}; } // namespace Impl -namespace Experimental { - class SYCLDeviceUSMSpace { public: using execution_space = SYCL; @@ -154,45 +152,40 @@ class SYCLHostUSMSpace { sycl::queue m_queue; }; -} // namespace Experimental - namespace Impl { template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type { +}; template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type { +}; template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type {}; -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - 
Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // HostSpace::execution_space != SYCLSharedUSMSpace::execution_space enum : bool { assignable = false }; enum : bool { accessible = true }; @@ -200,26 +193,24 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { +struct MemorySpaceAccess { // HostSpace::execution_space == - // Experimental::SYCLHostUSMSpace::execution_space + // SYCLHostUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // SYCLDeviceUSMSpace::execution_space == SYCLSharedUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; @@ -227,14 +218,11 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - // Experimental::SYCLDeviceUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess { + // SYCLDeviceUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLDeviceUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLDeviceUSMSpace::execution_space enum : bool { deepcopy = true }; }; @@ -243,16 
+231,15 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; // SYCL cannot access HostSpace enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // SYCLSharedUSMSpace::execution_space == SYCLDeviceUSMSpace::execution_space // Can access SYCLSharedUSMSpace from Host but cannot access // SYCLDeviceUSMSpace from Host @@ -264,47 +251,38 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - // Experimental::SYCLSharedUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess { + // SYCLSharedUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLSharedUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLSharedUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // Cannot access from SYCL - enum : bool { - accessible = true - }; // Experimental::SYCLHostUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLHostUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // Cannot access from Host enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // different execution_space enum : bool { accessible = true }; // same accessibility enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::ScratchMemorySpace> { +struct MemorySpaceAccess> { enum : bool { assignable = false }; enum : bool { accessible = true }; enum : bool 
{ deepcopy = false }; @@ -315,11 +293,9 @@ struct MemorySpaceAccess< } // namespace Kokkos KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLDeviceUSMSpace); -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLSharedUSMSpace); -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLHostUSMSpace); + Kokkos::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLHostUSMSpace); #endif #endif diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp index 1e42faa5a8..6359e4a2d9 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -34,7 +34,7 @@ namespace Impl { */ class SYCLTeamMember { public: - using execution_space = Kokkos::Experimental::SYCL; + using execution_space = Kokkos::SYCL; using scratch_memory_space = execution_space::scratch_memory_space; using team_handle = SYCLTeamMember; @@ -126,6 +126,20 @@ class SYCLTeamMember { team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + using value_type = typename WrappedReducerType::value_type; auto sg = m_item.get_sub_group(); const auto sub_group_range = sg.get_local_range()[0]; @@ -139,7 +153,7 @@ class SYCLTeamMember { if (vector_range * shift < sub_group_range) { const value_type tmp = Kokkos::Impl::SYCLReduction::shift_group_left( sg, value, 
vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } }; shuffle_combine(1); @@ -153,14 +167,13 @@ class SYCLTeamMember { shift <<= 1) { auto tmp = Kokkos::Impl::SYCLReduction::shift_group_left( sg, value, vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } #endif value = Kokkos::Impl::SYCLReduction::select_from_group(sg, value, 0); const int n_subgroups = sg.get_group_range()[0]; if (n_subgroups == 1) { - reducer.reference() = value; return; } @@ -187,16 +200,15 @@ class SYCLTeamMember { for (int start = step_width; start < n_subgroups; start += step_width) { if (id_in_sg == 0 && group_id >= start && group_id < std::min(start + step_width, n_subgroups)) - reducer.join(reduction_array[group_id - start], value); + wrapped_reducer.join(&reduction_array[group_id - start], &value); sycl::group_barrier(m_item.get_group()); } // Do the final reduction for all threads redundantly value = reduction_array[0]; for (int i = 1; i < std::min(step_width, n_subgroups); ++i) - reducer.join(value, reduction_array[i]); + wrapped_reducer.join(&value, &reduction_array[i]); - reducer.reference() = value; // Make sure that every thread is done using the reduction array. sycl::group_barrier(m_item.get_group()); } @@ -271,8 +283,8 @@ class SYCLTeamMember { const auto update = Kokkos::Impl::SYCLReduction::shift_group_right(sg, value, vector_range); - Type intermediate = (group_id > 0 ? base_data[group_id - 1] : 0) + - (id_in_sg >= vector_range ? update : 0); + Type intermediate = (group_id > 0 ? base_data[group_id - 1] : Type{0}) + + (id_in_sg >= vector_range ? 
update : Type{0}); if (global_accum) { if (id_in_sg == sub_group_range - 1 && @@ -311,6 +323,19 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION std::enable_if_t::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const { + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const { const auto tidx1 = m_item.get_local_id(1); const auto grange1 = m_item.get_local_range(1); @@ -319,13 +344,13 @@ class SYCLTeamMember { if (grange1 == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = grange1; (i >>= 1);) { tmp2 = Kokkos::Impl::SYCLReduction::shift_group_left(sg, tmp, i); if (static_cast(tidx1) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -336,8 +361,7 @@ class SYCLTeamMember { tmp2 = Kokkos::Impl::SYCLReduction::select_from_group( sg, tmp, (sg.get_local_id() / grange1) * grange1); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; } //---------------------------------------- @@ -531,8 +555,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + 
Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); @@ -541,7 +573,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel_reduce assuming summation. @@ -557,20 +591,28 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); i < loop_boundaries.end; i += loop_boundaries.member.item().get_local_range(0)) { - closure(i, val); + closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + result = value; } /** \brief Inter-thread parallel exclusive prefix sum. 
@@ -657,8 +699,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -670,8 +720,11 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< i < loop_boundaries.end; i += grange0 * grange1) closure(i, value); - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; } template @@ -679,10 +732,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = 
loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -692,11 +751,13 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1; i < loop_boundaries.end; i += grange0 * grange1) - closure(i, val); + closure(i, value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- @@ -746,16 +807,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const iType grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, reducer.reference()); + closure(i, value); - loop_boundaries.member.vector_reduce(reducer); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Intra-thread vector parallel_reduce. 
@@ -774,16 +846,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const int grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, result); + closure(i, value); - loop_boundaries.member.vector_reduce(Kokkos::Sum(result)); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp index 17ce59058b..556ca0d281 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp @@ -22,8 +22,7 @@ #include template -class Kokkos::Impl::TeamPolicyInternal +class Kokkos::Impl::TeamPolicyInternal : public PolicyTraits { public: using execution_policy = TeamPolicyInternal; @@ -45,7 +44,7 @@ class Kokkos::Impl::TeamPolicyInternal TeamPolicyInternal(TeamPolicyInternal const& p) { diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp index d55fc6a84b..79d9e8a8d4 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp +++ 
b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp @@ -22,13 +22,14 @@ #include namespace Kokkos { -namespace Experimental { namespace Impl { Kokkos::View sycl_global_unique_token_locks( bool deallocate = false); } +namespace Experimental { + // both global and instance Unique Tokens are implemented in the same way // the global version has one shared static lock array underneath // but it can't be a static member variable since we need to acces it on device @@ -42,7 +43,7 @@ class UniqueToken { using size_type = int32_t; explicit UniqueToken(execution_space const& = execution_space()) - : m_locks(Impl::sycl_global_unique_token_locks()) {} + : m_locks(Kokkos::Impl::sycl_global_unique_token_locks()) {} KOKKOS_DEFAULTED_FUNCTION UniqueToken(const UniqueToken&) = default; @@ -75,11 +76,15 @@ class UniqueToken { /// \brief acquire value such that 0 <= value < size() KOKKOS_INLINE_FUNCTION size_type impl_acquire() const { +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20250000 + auto item = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); +#else auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); +#endif std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; @@ -122,11 +127,11 @@ class UniqueToken public: UniqueToken() : UniqueToken( - Kokkos::Experimental::SYCL().concurrency()) {} + Kokkos::SYCL().concurrency()) {} explicit UniqueToken(execution_space const& arg) : UniqueToken( - Kokkos::Experimental::SYCL().concurrency(), arg) {} + Kokkos::SYCL().concurrency(), arg) {} explicit UniqueToken(size_type max_size) : UniqueToken(max_size) {} diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 61db6b34aa..2905733a4d 
100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -23,12 +23,11 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst) { - auto event = exec_space.impl_internal_space_instance()->m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View::value_type)); +template <> +struct ZeroMemset { + ZeroMemset(const Kokkos::SYCL& exec_space, void* dst, size_t cnt) { + auto event = + exec_space.impl_internal_space_instance()->m_queue->memset(dst, 0, cnt); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES exec_space.impl_internal_space_instance() ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp index 81d43b31b3..a1fa9e43e0 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -34,7 +34,6 @@ static_assert(false, #include #include #include -#include #include #include #include @@ -267,7 +266,7 @@ template std::vector partition_space(const Serial&, std::vector const& weights) { static_assert( - std::is_arithmetic::value, + std::is_arithmetic_v, "Kokkos Error: partitioning arguments must be integers or floats"); // We only care about the number of instances to create and ignore weights @@ -284,7 +283,9 @@ std::vector partition_space(const Serial&, #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #endif // defined( KOKKOS_ENABLE_SERIAL ) diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 34e115eca9..addcaba009 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -44,11 +44,16 @@ class ParallelFor, public: inline void 
execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads auto* internal_instance = m_iter.m_rp.space().impl_internal_space_instance(); std::lock_guard lock(internal_instance->m_instance_mutex); +#endif this->exec(); } template @@ -112,10 +117,15 @@ class ParallelReduce instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 80faec9041..2ab7b7f803 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -31,7 +31,7 @@ class ParallelFor, Kokkos::Serial> { const Policy m_policy; template - std::enable_if_t::value> exec() const { + std::enable_if_t> exec() const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { m_functor(i); @@ -39,7 +39,7 @@ class ParallelFor, Kokkos::Serial> { } template - std::enable_if_t::value> exec() const { + std::enable_if_t> exec() const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -49,10 +49,15 @@ class ParallelFor, Kokkos::Serial> { public: inline void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads auto* internal_instance = 
m_policy.space().impl_internal_space_instance(); std::lock_guard lock(internal_instance->m_instance_mutex); +#endif this->template exec(); } @@ -79,7 +84,7 @@ class ParallelReduce, const pointer_type m_result_ptr; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -88,7 +93,7 @@ class ParallelReduce, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; @@ -108,10 +113,15 @@ class ParallelReduce, auto* internal_instance = m_policy.space().impl_internal_space_instance(); + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -166,7 +176,7 @@ class ParallelScan, const Policy m_policy; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -175,7 +185,7 @@ class ParallelScan, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -194,10 +204,16 @@ class ParallelScan, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, 
especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, @@ -235,7 +251,7 @@ class ParallelScanWithTotal, const pointer_type m_result_ptr; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -244,7 +260,7 @@ class ParallelScanWithTotal, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -262,10 +278,16 @@ class ParallelScanWithTotal, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index a523cc86c9..7a6faf3d9f 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -223,7 +223,7 @@ class ParallelFor, const size_t 
m_shared; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor(Member(data, ileague, m_league)); @@ -231,7 +231,7 @@ class ParallelFor, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data) const { const TagType t{}; for (int ileague = 0; ileague < m_league; ++ileague) { @@ -247,10 +247,16 @@ class ParallelFor, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, @@ -293,7 +299,7 @@ class ParallelReduce - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data, reference_type update) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor_reducer.get_functor()(Member(data, ileague, m_league), update); @@ -301,7 +307,7 @@ class ParallelReduce - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data, reference_type update) const { const TagType t{}; @@ -321,10 +327,16 @@ class ParallelReduce instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp index 5905d6d32e..678d182504 100644 --- 
a/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp @@ -25,10 +25,16 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -102,9 +108,8 @@ class TaskQueueSpecialization> { template class TaskQueueSpecializationConstrained< - Scheduler, - std::enable_if_t::value>> { + Scheduler, std::enable_if_t>> { public: // Note: Scheduler may be an incomplete type at class scope (but not inside // of the methods, obviously) @@ -215,6 +220,10 @@ extern template class TaskQueue, FunctorType m_functor; template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 6ad6aabc5a..527e094079 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -31,15 +31,11 @@ namespace Impl { // parallel execution space since the specialization for // DefaultHostExecutionSpace is defined elsewhere. 
struct DummyExecutionSpace; -template +template <> struct ZeroMemset< - std::conditional_t::value, - Serial, DummyExecutionSpace>, - View> { - ZeroMemset(const Serial&, const View& dst) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } + std::conditional_t, + Serial, DummyExecutionSpace>> { + ZeroMemset(const Serial&, void* dst, size_t cnt) { std::memset(dst, 0, cnt); } }; } // namespace Impl diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp index 3842966cd7..edc9489f67 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -67,8 +67,9 @@ std::pair int s_thread_pool_size[3] = {0, 0, 0}; -void (*volatile s_current_function)(ThreadsInternal &, const void *); -const void *volatile s_current_function_arg = nullptr; +using s_current_function_type = void (*)(ThreadsInternal &, const void *); +std::atomic s_current_function; +std::atomic s_current_function_arg = nullptr; inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); @@ -79,7 +80,7 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } -void wait_yield(volatile ThreadState &flag, const ThreadState value) { +void wait_yield(std::atomic &flag, const ThreadState value) { while (value == flag) { std::this_thread::yield(); } @@ -135,11 +136,12 @@ ThreadsInternal::ThreadsInternal() ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding - const int entry = reinterpret_cast(s_current_function_arg) < - size_t(s_thread_pool_size[0]) - ? 
reinterpret_cast(s_current_function_arg) - : size_t(Kokkos::hwloc::bind_this_thread( - s_thread_pool_size[0], s_threads_coord)); + const int entry = + reinterpret_cast(s_current_function_arg.load()) < + size_t(s_thread_pool_size[0]) + ? reinterpret_cast(s_current_function_arg.load()) + : size_t(Kokkos::hwloc::bind_this_thread(s_thread_pool_size[0], + s_threads_coord)); // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && @@ -543,7 +545,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. ThreadsInternal *const th = - ((ThreadsInternal * volatile *)s_threads_exec)[ith]; + ((ThreadsInternal *volatile *)s_threads_exec)[ith]; if (th) { wait_yield(th->m_pool_state, ThreadState::Active); } else { diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp index a5eb231cb0..130b3433d0 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -60,7 +60,7 @@ class ThreadsInternal { int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - ThreadState volatile m_pool_state; ///< State for global synchronizations + std::atomic m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -96,7 +96,7 @@ class ThreadsInternal { return reinterpret_cast(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION ThreadState volatile &state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION auto &state() { return m_pool_state; } KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } @@ -225,7 +225,7 @@ class ThreadsInternal { // to inactive triggers another thread to exit a spinwait // and read the 'reduce_memory'. 
// Must 'memory_fence()' to guarantee that storing the update to - // 'reduce_memory()' will complete before storing the the update to + // 'reduce_memory()' will complete before storing the update to // 'm_pool_state'. memory_fence(); @@ -403,7 +403,7 @@ class ThreadsInternal { static void start(void (*)(ThreadsInternal &, const void *), const void *); #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED static int in_parallel(); + static int in_parallel(); #endif static void fence(); static void fence(const std::string &); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 59577609ab..711b1b6926 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -51,7 +51,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); @@ -65,7 +65,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 4a89c4fad8..25aab9ebfb 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -35,7 +35,7 @@ class ParallelFor, const Policy m_policy; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -47,7 +47,7 @@ class 
ParallelFor, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { const TagType t{}; #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -64,7 +64,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); @@ -77,7 +77,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index f927d7c6a6..40be3884c3 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -36,8 +36,8 @@ class ParallelFor, const size_t m_shared; template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { for (; member.valid_static(); member.next_static()) { functor(member); @@ -45,8 +45,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -55,8 +55,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { for (; member.valid_dynamic(); member.next_dynamic()) { functor(member); @@ -64,8 +64,8 @@ class ParallelFor, } template - inline 
static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_dynamic(); member.next_dynamic()) { @@ -88,8 +88,12 @@ class ParallelFor, policy.impl_set_vector_length(1); } if (policy.team_size() < 0) { - policy.impl_set_team_size( - policy.team_size_recommended(m_functor, ParallelForTag{})); + int team_size = policy.team_size_recommended(m_functor, ParallelForTag{}); + if (team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index fa63215a9e..9f28f9bbfc 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -59,7 +59,7 @@ class ParallelReduce - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); @@ -76,7 +76,7 @@ class ParallelReduce - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); @@ -91,8 +91,8 @@ class ParallelReduce(instance.reduce_memory())); + reference_type update = + reducer.init(static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? 
begin + 1 : num_tiles; @@ -100,7 +100,7 @@ class ParallelReduce, const pointer_type m_result_ptr; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -55,7 +55,7 @@ class ParallelReduce, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { const TagType t{}; @@ -73,7 +73,7 @@ class ParallelReduce, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, instance.pool_rank(), @@ -89,7 +89,7 @@ class ParallelReduce, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, instance.pool_rank(), diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index 4db310701f..69527ee3e6 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -42,7 +42,7 @@ class ParallelReduce - inline static std::enable_if_t::value> exec_team( + inline static std::enable_if_t> exec_team( const FunctorType &functor, Member member, reference_type update) { for (; member.valid_static(); member.next_static()) { functor(member, update); @@ -50,7 +50,7 @@ class ParallelReduce - inline static std::enable_if_t::value> exec_team( + inline static std::enable_if_t> exec_team( const FunctorType &functor, Member member, 
reference_type update) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -106,9 +106,14 @@ class ParallelReduce could not find " + "a valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp index 62f34d741f..d54f4ca952 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp @@ -39,7 +39,7 @@ class ParallelScan, const Policy m_policy; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -52,7 +52,7 @@ class ParallelScan, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; @@ -119,7 +119,7 @@ class ParallelScanWithTotal, const pointer_type m_result_ptr; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -132,7 +132,7 @@ class ParallelScanWithTotal, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp index 3df9dc07bf..0f9a77f2af 
100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -108,7 +108,7 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } -void spinwait_while_equal(ThreadState const volatile& flag, +void spinwait_while_equal(std::atomic const& flag, ThreadState const value) { Kokkos::store_fence(); uint32_t i = 0; diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp index b98b6dbb73..7ab43cdb7a 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -20,6 +20,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -34,7 +35,7 @@ enum class WaitMode : int { void host_thread_yield(const uint32_t i, const WaitMode mode); -void spinwait_while_equal(ThreadState const volatile& flag, +void spinwait_while_equal(std::atomic const& flag, ThreadState const value); } // namespace Impl diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp index a3501a437d..f627e0d47a 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp @@ -143,8 +143,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); @@ -164,8 +164,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; f(value); if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); memory_fence(); @@ -186,7 +186,7 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make 
sure there is enough scratch space: using type = - typename if_c::type; + std::conditional_t; if (team_rank() != team_size() - 1) * ((volatile type*)m_instance->scratch_memory()) = value; @@ -215,52 +215,65 @@ class ThreadsExecTeamMember { } template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - team_reduce(const ReducerType& reducer, - const typename ReducerType::value_type contribution) const { + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(const ReducerType& reducer, + typename ReducerType::value_type& contribution) const { KOKKOS_IF_ON_DEVICE(((void)reducer; (void)contribution;)) - KOKKOS_IF_ON_HOST(( - using value_type = typename ReducerType::value_type; - // Make sure there is enough scratch space: - using type = typename if_c::type; + KOKKOS_IF_ON_HOST( + (using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), contribution); + reducer.reference() = contribution;)) + } - type* const local_value = ((type*)m_instance->scratch_memory()); + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + const WrappedReducerType& wrapped_reducer, + typename WrappedReducerType::value_type& contribution) const { + using value_type = typename WrappedReducerType::value_type; + // Make sure there is enough scratch space: + using type = std::conditional_t; - // Set this thread's contribution - if (team_rank() != team_size() - 1) { *local_value = contribution; } + type* const local_value = ((type*)m_instance->scratch_memory()); - // Fence to make sure the base team member has access: - memory_fence(); + // Set this thread's contribution + if (team_rank() != team_size() - 1) { + *local_value = contribution; + } - if (team_fan_in()) { - // The last thread to synchronize returns true, all other threads - // wait for team_fan_out() - 
type* const team_value = ((type*)m_team_base[0]->scratch_memory()); + // Fence to make sure the base team member has access: + memory_fence(); - *team_value = contribution; - // Join to the team value: - for (int i = 1; i < m_team_size; ++i) { - reducer.join(*team_value, - *((type*)m_team_base[i]->scratch_memory())); - } + if (team_fan_in()) { + // The last thread to synchronize returns true, all other threads + // wait for team_fan_out() + type* const team_value = ((type*)m_team_base[0]->scratch_memory()); - // Team base thread may "lap" member threads so copy out to their - // local value. - for (int i = 1; i < m_team_size; ++i) { - *((type*)m_team_base[i]->scratch_memory()) = *team_value; - } + *team_value = contribution; + // Join to the team value: + for (int i = 1; i < m_team_size; ++i) { + wrapped_reducer.join(team_value, + ((type*)m_team_base[i]->scratch_memory())); + } - // Fence to make sure all team members have access - memory_fence(); - } + // Team base thread may "lap" member threads so copy out to their + // local value. 
+ for (int i = 1; i < m_team_size; ++i) { + *((type*)m_team_base[i]->scratch_memory()) = *team_value; + } - team_fan_out(); + // Fence to make sure all team members have access + memory_fence(); + } - // Value was changed by the team base - reducer.reference() = *local_value;)) + team_fan_out(); + + contribution = *local_value; } /** \brief Intra-team exclusive prefix sum with team_rank() ordering @@ -278,8 +291,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_DEVICE(((void)global_accum; return value;)) KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; volatile type* const work_value = ((type*)m_instance->scratch_memory()); @@ -887,19 +900,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType intermediate; - Sum sum(intermediate); - sum.init(intermediate); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - ValueType tmp = ValueType(); - lambda(i, tmp); - intermediate += tmp; + lambda(i, value); } - loop_boundaries.thread.team_reduce(sum, intermediate); - result = sum.reference(); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } template @@ -907,15 +926,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const 
Lambda& lambda, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { lambda(i, value); } - loop_boundaries.thread.team_reduce(reducer, value); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } } // namespace Kokkos @@ -950,11 +979,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } template @@ -962,11 +1004,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using 
functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel exclusive prefix sum. Executes @@ -1049,7 +1104,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( typename Impl::FunctorAnalysis, FunctorType, void>::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of closure and return type"); ValueType scan_val = ValueType(); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index c88d66db5f..5fed92db26 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -36,13 +36,13 @@ class ParallelFor, FunctorType m_functor; template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/lib/kokkos/core/src/View/Kokkos_BasicView.hpp b/lib/kokkos/core/src/View/Kokkos_BasicView.hpp new file mode 100644 index 0000000000..29eafca62e --- /dev/null +++ b/lib/kokkos/core/src/View/Kokkos_BasicView.hpp @@ -0,0 +1,652 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_BASIC_VIEW_HPP +#define KOKKOS_BASIC_VIEW_HPP +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// FIXME: we need to make this work for not using our mdspan impl +#define KOKKOS_IMPL_NO_UNIQUE_ADDRESS _MDSPAN_NO_UNIQUE_ADDRESS +namespace Kokkos::Impl { + +constexpr inline struct SubViewCtorTag { + explicit SubViewCtorTag() = default; +} subview_ctor_tag{}; + +template +struct KokkosSliceToMDSpanSliceImpl { + using type = T; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(const T &s) { return s; } +}; + +template <> +struct KokkosSliceToMDSpanSliceImpl { + using type = full_extent_t; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(Kokkos::ALL_t) { + return full_extent; + } +}; + +template +using kokkos_slice_to_mdspan_slice = + typename KokkosSliceToMDSpanSliceImpl::type; + +template +KOKKOS_INLINE_FUNCTION constexpr decltype(auto) +transform_kokkos_slice_to_mdspan_slice(const T &s) { + return KokkosSliceToMDSpanSliceImpl::transform(s); +} + +// We do have implementation detail versions of these in our mdspan impl +// However they are not part of the public standard interface +template +struct is_layout_right_padded : public std::false_type {}; + +template +struct is_layout_right_padded> + : public std::true_type {}; + +template +struct is_layout_left_padded : public std::false_type {}; + +template +struct is_layout_left_padded> + : public std::true_type {}; + +template +class BasicView { + public: 
+ using mdspan_type = + mdspan; + using extents_type = typename mdspan_type::extents_type; + using layout_type = typename mdspan_type::layout_type; + using accessor_type = typename mdspan_type::accessor_type; + using mapping_type = typename mdspan_type::mapping_type; + using element_type = typename mdspan_type::element_type; + using value_type = typename mdspan_type::value_type; + using index_type = typename mdspan_type::index_type; + using size_type = typename mdspan_type::size_type; + using rank_type = typename mdspan_type::rank_type; + using data_handle_type = typename mdspan_type::data_handle_type; + using reference = typename mdspan_type::reference; + using memory_space = typename accessor_type::memory_space; + using execution_space = typename memory_space::execution_space; + + // For now View and BasicView will have a restriction that the data handle + // needs to be convertible to element_type* and vice versa + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + + KOKKOS_FUNCTION static constexpr rank_type rank() noexcept { + return extents_type::rank(); + } + KOKKOS_FUNCTION static constexpr rank_type rank_dynamic() noexcept { + return extents_type::rank_dynamic(); + } + KOKKOS_FUNCTION static constexpr size_t static_extent(rank_type r) noexcept { + return extents_type::static_extent(r); + } + KOKKOS_FUNCTION constexpr index_type extent(rank_type r) const noexcept { + return m_map.extents().extent(r); + }; + + protected: + // These are pre-condition checks which are unconditionally (i.e. 
in release + // mode) enabled in Kokkos::View 4.4 + template + KOKKOS_FUNCTION static constexpr void check_basic_view_constructibility( + [[maybe_unused]] const OtherMapping &rhs) { + using src_t = typename OtherMapping::layout_type; + using dst_t = layout_type; + constexpr size_t rnk = mdspan_type::rank(); + if constexpr (!std::is_same_v) { + if constexpr (Impl::is_layout_left_padded::value) { + if constexpr (std::is_same_v) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == 0 ? rhs.stride(1) : rhs.extents().extent(r)); + } + } + } + if constexpr (Impl::is_layout_right_padded::value) { + if constexpr (std::is_same_v) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == rnk ? 
rhs.stride(r - 2) + : rhs.extents().extent(r - 1)); + } + } + } + } + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r); + } + } else if constexpr (Impl::is_layout_left_padded::value && + rnk > 1) { + if (rhs.stride(1) != rhs.extents().extent(0)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r - 1); + } + } + } else if constexpr (Impl::is_layout_right_padded::value && + rnk > 1) { + if (rhs.stride(rnk - 2) != rhs.extents().extent(rnk - 1)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + } + } + + public: + KOKKOS_DEFAULTED_FUNCTION constexpr BasicView() = default; + + KOKKOS_FUNCTION constexpr BasicView(const mdspan_type &other) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()){}; + KOKKOS_FUNCTION constexpr BasicView(mdspan_type &&other) + : m_ptr(std::move(other.data_handle())), + m_map(std::move(other.mapping())), + m_acc(std::move(other.accessor())){}; + + template + // requires(std::is_constructible_v) + KOKKOS_FUNCTION explicit constexpr BasicView( + std::enable_if_t, + data_handle_type> + p, + OtherIndexTypes... 
exts) + : m_ptr(std::move(p)), + m_map(extents_type(static_cast(std::move(exts))...)), + m_acc{} {} + + template + // When doing C++20 we should switch to this, the conditional explicit we + // can't do in 17 + // requires(std::is_constructible_v>) + // explicit(Size != rank_dynamic()) + KOKKOS_FUNCTION constexpr BasicView( + std::enable_if_t< + std::is_constructible_v>, + data_handle_type> + p, + const Array &exts) + : m_ptr(std::move(p)), m_map(extents_type(exts)), m_acc{} {} + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, + const extents_type &exts) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v && + std::is_constructible_v) +#endif + : m_ptr(std::move(p)), m_map(exts), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v) +#endif + : m_ptr(std::move(p)), m_map(m), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m, + const accessor_type &a) + : m_ptr(std::move(p)), m_map(m), m_acc(a) {} + + template +// requires(std::is_constructible_v::mdspan_type>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v &, + mapping_type> || + !std::is_convertible_v) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const BasicView &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, typename BasicView::mdspan_type>, + void *> = nullptr) + : m_ptr(other.m_ptr), m_map(other.m_map), m_acc(other.m_acc) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v, + "Kokkos::View: 
incompatible extents for View construction"); + } + + template +// requires(std::is_constructible_v>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v &, + mapping_type> || + !std::is_convertible_v) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const mdspan &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, mdspan>, + void *> = nullptr) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v, + "Kokkos::View: incompatible extents for View construction"); + } + + // Allocating constructors specific to BasicView + /// + /// Construct from a given mapping + /// + explicit constexpr BasicView(const std::string &label, + const mapping_type &mapping) + : BasicView(view_alloc(label), mapping) {} + + /// + /// Construct from a given extents + /// + explicit constexpr BasicView(const std::string &label, + const extents_type &ext) + : BasicView(view_alloc(label), mapping_type{ext}) {} + + private: + template + data_handle_type create_data_handle( + const Impl::ViewCtorProp &arg_prop, + const typename mdspan_type::mapping_type &arg_mapping) { + constexpr bool has_exec = Impl::ViewCtorProp::has_execution_space; + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, memory_space{}, execution_space{}); + using alloc_prop = decltype(prop_copy); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. 
+ Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + return data_handle_type(Impl::make_shared_allocation_record( + arg_mapping.required_span_size(), + Impl::get_property(prop_copy), + Impl::get_property(prop_copy), + has_exec ? std::optional{Impl::get_property< + Impl::ExecutionSpaceTag>(prop_copy)} + : std::optional{std::nullopt}, + std::integral_constant(), + std::integral_constant())); + } + + public: + template + // requires(!Impl::ViewCtorProp::has_pointer) + explicit inline BasicView( + const Impl::ViewCtorProp &arg_prop, + std::enable_if_t::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView(create_data_handle(arg_prop, arg_mapping), arg_mapping) {} + + template + // requires(Impl::ViewCtorProp::has_pointer) + KOKKOS_FUNCTION explicit inline BasicView( + const Impl::ViewCtorProp &arg_prop, + std::enable_if_t::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView( + data_handle_type(Impl::get_property(arg_prop)), + arg_mapping) {} + + protected: + template + KOKKOS_INLINE_FUNCTION BasicView( + Impl::SubViewCtorTag, + const BasicView &src_view, + SliceSpecifiers... slices) + : BasicView(submdspan( + src_view.to_mdspan(), + Impl::transform_kokkos_slice_to_mdspan_slice(slices)...)) {} + + public: + //---------------------------------------- + // Conversion to MDSpan + template , + mdspan_type>>> + KOKKOS_INLINE_FUNCTION constexpr + operator mdspan() const { + return mdspan_type(m_ptr, m_map, m_acc); + } + + // Here we use an overload instead of a default parameter as a workaround + // to a potential compiler bug with clang 17. 
It may be present in other + // compilers + template >> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan() const { + using ret_mdspan_type = + mdspan; + return ret_mdspan_type( + static_cast( + data_handle()), + mapping(), static_cast(accessor())); + } + + template < + class OtherAccessorType = AccessorPolicy, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType &other_accessor) const { + using ret_mdspan_type = + mdspan; + return ret_mdspan_type( + static_cast( + data_handle()), + mapping(), other_accessor); + } + + KOKKOS_FUNCTION void assign_data(element_type *ptr) { m_ptr = ptr; } + + // ========================= mdspan ================================= + + // [mdspan.mdspan.members], members + +// Introducing the C++20 and C++23 variants of the operators already +#ifndef KOKKOS_ENABLE_CXX17 +#ifndef KOKKOS_ENABLE_CXX20 + // C++23 only operator[] + template + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator[]( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator[]( + const Array &indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator[]( + std::span indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } +#endif + + // C++20 operator() + template + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) 
&& + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator()( + const Array &indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator()( + std::span indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } +#else + // C++17 variant of operator() + + // Some weird unexplained issue in compiling the SFINAE version with CUDA/MSVC + // So we just use post factor check here with static_assert +#if defined(KOKKOS_ENABLE_CUDA) && defined(_WIN32) + template + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + static_assert((std::is_convertible_v && ...)); + static_assert( + (std::is_nothrow_constructible_v && ...)); + static_assert((sizeof...(OtherIndexTypes)) == rank()); + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } +#else + template + KOKKOS_FUNCTION constexpr std::enable_if_t< + ((std::is_convertible_v && ...)) && + ((std::is_nothrow_constructible_v && + ...)) && + ((sizeof...(OtherIndexTypes)) == rank()), + reference> + operator()(OtherIndexTypes... 
indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } +#endif +#endif + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside size() + template + KOKKOS_FUNCTION constexpr size_type size_impl( + std::index_sequence) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return 0u; + return ((static_cast(m_map.extents().extent(Idxs))) * ... * + size_type(1)); + } + + public: + KOKKOS_FUNCTION constexpr size_type size() const noexcept { + return size_impl(std::make_index_sequence()); + } + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside empty() + template + KOKKOS_FUNCTION constexpr bool empty_impl( + std::index_sequence) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return true; + return (rank() > 0) && + ((m_map.extents().extent(Idxs) == index_type(0)) || ... 
|| false); + } + + public: + [[nodiscard]] KOKKOS_FUNCTION constexpr bool empty() const noexcept { + return empty_impl(std::make_index_sequence()); + } + + KOKKOS_FUNCTION friend constexpr void swap(BasicView &x, + BasicView &y) noexcept { + kokkos_swap(x.m_ptr, y.m_ptr); + kokkos_swap(x.m_map, y.m_map); + kokkos_swap(x.m_acc, y.m_acc); + } + + KOKKOS_FUNCTION constexpr const extents_type &extents() const noexcept { + return m_map.extents(); + }; + KOKKOS_FUNCTION constexpr const data_handle_type &data_handle() + const noexcept { + return m_ptr; + }; + KOKKOS_FUNCTION constexpr const mapping_type &mapping() const noexcept { + return m_map; + }; + KOKKOS_FUNCTION constexpr const accessor_type &accessor() const noexcept { + return m_acc; + }; + + KOKKOS_FUNCTION static constexpr bool is_always_unique() noexcept { + return mapping_type::is_always_unique(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return mapping_type::is_always_exhaustive(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_strided() noexcept { + return mapping_type::is_always_strided(); + }; + + KOKKOS_FUNCTION constexpr bool is_unique() const { + return m_map.is_unique(); + }; + KOKKOS_FUNCTION constexpr bool is_exhaustive() const { + return m_map.is_exhaustive(); + }; + KOKKOS_FUNCTION constexpr bool is_strided() const { + return m_map.is_strided(); + }; + KOKKOS_FUNCTION constexpr index_type stride(rank_type r) const { + return m_map.stride(r); + }; + + protected: +#ifndef __NVCC__ + KOKKOS_IMPL_NO_UNIQUE_ADDRESS data_handle_type m_ptr{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS mapping_type m_map{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS accessor_type m_acc{}; +#else + data_handle_type m_ptr{}; + mapping_type m_map{}; + accessor_type m_acc{}; +#endif + + template + friend class BasicView; +}; +} // namespace Kokkos::Impl + +#endif diff --git a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp index 1ade75692f..eb11630b21 
100644 --- a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp @@ -26,6 +26,7 @@ static_assert(false, #include #include #include +#include #include #include @@ -41,22 +42,8 @@ bool is_zero_byte(const T& x) { return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; } -//---------------------------------------------------------------------------- - -/* - * The construction, assignment to default, and destruction - * are merged into a single functor. - * Primarily to work around an unresolved CUDA back-end bug - * that would lose the destruction cuda device function when - * called from the shared memory tracking destruction. - * Secondarily to have two fewer partial specializations. - */ -template ::value> -struct ViewValueFunctor; - template -struct ViewValueFunctor { +struct ViewValueFunctor { using ExecSpace = typename DeviceType::execution_space; struct DestroyTag {}; @@ -68,20 +55,31 @@ struct ViewValueFunctor { std::string name; bool default_exec_space; - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - operator()(ConstructTag const&, const size_t i) const { + template + KOKKOS_FUNCTION + std::enable_if_t> + operator()(ConstructTag, const size_t i) const { new (ptr + i) ValueType(); } - KOKKOS_INLINE_FUNCTION void operator()(DestroyTag const&, - const size_t i) const { + KOKKOS_FUNCTION void operator()(DestroyTag, const size_t i) const { + // When instantiating a View on host execution space with a host only + // destructor the workaround for CUDA device symbol instantiation tries to + // still compile a destruction kernel for the device, and issues a warning + // for host from host-device +#ifdef KOKKOS_ENABLE_CUDA + if constexpr (std::is_same_v) { + KOKKOS_IF_ON_DEVICE(((ptr + i)->~ValueType();)) + } else { + KOKKOS_IF_ON_HOST(((ptr + i)->~ValueType();)) + } +#else (ptr + i)->~ValueType(); +#endif } - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; + 
ViewValueFunctor() = default; + ViewValueFunctor(const ViewValueFunctor&) = default; ViewValueFunctor& operator=(const ViewValueFunctor&) = default; ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, @@ -104,49 +102,6 @@ struct ViewValueFunctor { functor_instantiate_workaround(); } - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_dispatch() { - ValueType value{}; -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_dispatch() { - parallel_for_implementation(); - } - template void parallel_for_implementation() { using PolicyType = @@ -172,24 +127,62 @@ struct ViewValueFunctor { const Kokkos::Impl::ParallelFor closure( *this, policy); closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); if (Kokkos::Profiling::profileLibraryLoaded()) { 
Kokkos::Profiling::endParallelFor(kpID); } + if (default_exec_space || std::is_same_v) { + space.fence(std::is_same_v + ? "Kokkos::View::destruction before deallocate" + : "Kokkos::View::initialization"); + } } - void construct_shared_allocation() { construct_dispatch(); } + // Shortcut for zero initialization + void zero_memset_implementation() { + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + // We are not really using parallel_for here but using beginParallelFor + // instead of begin_parallel_for (and adding "via memset") is the best + // we can do to indicate that this is not supposed to be tunable (and + // doesn't really execute a parallel_for). + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "] via memset", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } + + (void)ZeroMemset(space, ptr, n * sizeof(ValueType)); + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + if (default_exec_space) { + space.fence("Kokkos::View::initialization via memset"); + } + } + + void construct_shared_allocation() { +// On A64FX memset seems to do the wrong thing with regards to first touch +// leading to the significant performance issues +#ifndef KOKKOS_ARCH_A64FX + if constexpr (std::is_trivial_v) { + // value-initialization is equivalent to filling with zeros + zero_memset_implementation(); + } else +#endif + parallel_for_implementation(); + } void destroy_shared_allocation() { + if constexpr (std::is_trivially_destructible_v) { + // do nothing, don't bother calling the destructor + } else { #ifdef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND - if constexpr (std::is_same_v) - for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); - else + if constexpr (std::is_same_v) + for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); + else #endif - { - parallel_for_implementation(); + parallel_for_implementation(); } } @@ 
-206,114 +199,6 @@ struct ViewValueFunctor { } }; -template -struct ViewValueFunctor { - using ExecSpace = typename DeviceType::execution_space; - using PolicyType = Kokkos::RangePolicy>; - - ExecSpace space; - ValueType* ptr; - size_t n; - std::string name; - bool default_exec_space; - - KOKKOS_INLINE_FUNCTION - void operator()(const size_t i) const { ptr[i] = ValueType(); } - - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; - ViewValueFunctor& operator=(const ViewValueFunctor&) = default; - - ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, - size_t const arg_n, std::string arg_name) - : space(arg_space), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(false) {} - - ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, - std::string arg_name) - : space(ExecSpace{}), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(true) {} - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_shared_allocation() { - // Shortcut for zero initialization -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - ValueType value{}; - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). 
- Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_shared_allocation() { - parallel_for_implementation(); - } - - void parallel_for_implementation() { - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } -#endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } - - void destroy_shared_allocation() {} -}; - template struct ViewValueFunctorSequentialHostInit { using ExecSpace = typename DeviceType::execution_space; @@ -358,6 +243,63 @@ struct ViewValueFunctorSequentialHostInit { } }; +template +Kokkos::Impl::SharedAllocationRecord* make_shared_allocation_record( + const size_t& required_span_size, std::string_view label, + const MemorySpace& memory_space, + const std::optional exec_space, + std::bool_constant, std::bool_constant) { + static_assert(SpaceAccessibility::accessible); + + // Use this for constructing and destroying 
the view + using device_type = Kokkos::Device; + using functor_type = std::conditional_t< + SequentialInit, + ViewValueFunctorSequentialHostInit, + ViewValueFunctor>; + using record_type = + Kokkos::Impl::SharedAllocationRecord; + + /* Force alignment of allocations on 8 byte boundaries even for + * element types smaller than 8 bytes */ + static constexpr std::size_t align_mask = 0x7; + + // Calculate the total size of the memory, in bytes, and make sure it is + // 8-byte aligned + const std::size_t alloc_size = + (required_span_size * sizeof(ElementType) + align_mask) & ~align_mask; + + auto* record = + exec_space + ? record_type::allocate(*exec_space, memory_space, std::string{label}, + alloc_size) + : record_type::allocate(memory_space, std::string{label}, alloc_size); + + auto ptr = static_cast(record->data()); + + auto functor = + exec_space ? functor_type(*exec_space, ptr, required_span_size, + std::string{label}) + : functor_type(ptr, required_span_size, std::string{label}); + + // Only initialize if the allocation is non-zero. + // May be zero if one of the dimensions is zero. + if constexpr (Initialize) { + if (alloc_size) { + // Assume destruction is only required when construction is requested. + // The ViewValueFunctor has both value construction and destruction + // operators. 
+ record->m_destroy = std::move(functor); + + // Construct values + record->m_destroy.construct_shared_allocation(); + } + } + + return record; +} + } // namespace Kokkos::Impl #endif // KOKKOS_VIEW_ALLOC_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/lib/kokkos/core/src/View/Kokkos_ViewAtomic.hpp similarity index 96% rename from lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewAtomic.hpp index 23d4c2524c..f77066b70f 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewAtomic.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_ATOMIC_VIEW_HPP -#define KOKKOS_ATOMIC_VIEW_HPP +#ifndef KOKKOS_VIEWATOMIC_HPP +#define KOKKOS_VIEWATOMIC_HPP #include #include @@ -44,10 +44,10 @@ class AtomicDataElement { } KOKKOS_INLINE_FUNCTION - void inc() const { Kokkos::atomic_increment(ptr); } + void inc() const { Kokkos::atomic_inc(ptr); } KOKKOS_INLINE_FUNCTION - void dec() const { Kokkos::atomic_decrement(ptr); } + void dec() const { Kokkos::atomic_dec(ptr); } KOKKOS_INLINE_FUNCTION const_value_type operator++() const { @@ -215,7 +215,7 @@ class AtomicViewDataHandle { } KOKKOS_INLINE_FUNCTION - operator typename ViewTraits::value_type*() const { return ptr; } + operator typename ViewTraits::value_type *() const { return ptr; } }; } // namespace Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp similarity index 84% rename from lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp index 379180ae64..f080474717 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp @@ -72,8 +72,8 @@ struct ViewCtorProp {}; */ template struct ViewCtorProp> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + 
ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = CommonViewAllocProp; @@ -92,8 +92,8 @@ struct ViewCtorProp || std::is_same_v || std::is_same_v>, P> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = P; @@ -106,14 +106,14 @@ struct ViewCtorProp || /* Map input label type to std::string */ template struct ViewCtorProp::value>, Label> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = std::string; ViewCtorProp(const type &arg) : value(arg) {} - ViewCtorProp(type &&arg) : value(arg) {} + ViewCtorProp(type &&arg) : value(std::move(arg)) {} type value; }; @@ -122,8 +122,8 @@ template struct ViewCtorProp::value || Kokkos::is_execution_space::value>, Space> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = Space; @@ -135,8 +135,8 @@ struct ViewCtorProp::value || template struct ViewCtorProp { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = T *; @@ -213,14 +213,19 @@ struct ViewCtorProp : public ViewCtorProp... { using execution_space = typename var_execution_space::type; using pointer_type = typename var_pointer::type; - /* Copy from a matching argument list. - * Requires std::is_same< P , ViewCtorProp< void , Args >::value ... - */ - template - inline ViewCtorProp(Args const &... args) : ViewCtorProp(args)... 
{} + // Construct from a matching argument list. + // + // Note that if P is empty, this constructor is the default constructor. + // On the other hand, if P is not empty, the constraint implies that + // there is no default constructor. + template , Args &&>...>>> + ViewCtorProp(Args &&...args) + : ViewCtorProp(std::forward(args))... {} template - KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &... args) + KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &...args) : ViewCtorProp(arg0), ViewCtorProp::type>(args)... {} @@ -252,7 +257,7 @@ auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop) { template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, [[maybe_unused]] const Property &property, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -302,7 +307,7 @@ template struct WithPropertiesIfUnset, Property, Properties...> { static constexpr auto apply_prop(const ViewCtorProp &view_ctor_prop, const Property &prop, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -328,7 +333,7 @@ struct WithPropertiesIfUnset, Property, Properties...> { template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, - const Properties &... properties) { + const Properties &...properties) { return WithPropertiesIfUnset, Properties...>::apply_prop( view_ctor_prop, properties...); } @@ -437,6 +442,48 @@ using ViewAllocateWithoutInitializing = Impl::ViewCtorProp; +inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; + +inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; + +inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; + +/** \brief Create View allocation parameter bundle from argument list. 
+ * + * Valid argument list members are: + * 1) label as a "string" or std::string + * 2) memory space instance of the View::memory_space type + * 3) execution space instance compatible with the View::memory_space + * 4) Kokkos::WithoutInitializing to bypass initialization + * 5) Kokkos::AllowPadding to allow allocation to pad dimensions for memory + * alignment + */ +template +auto view_alloc(Args &&...args) { + using return_type = Impl::ViewCtorProp>::type...>; + + static_assert(!return_type::has_pointer, + "Cannot give pointer-to-memory for view allocation"); + + return return_type(std::forward(args)...); +} + +template +KOKKOS_INLINE_FUNCTION + Impl::ViewCtorProp::type...> + view_wrap(Args const &...args) { + using return_type = + Impl::ViewCtorProp::type...>; + + static_assert(!return_type::has_memory_space && + !return_type::has_execution_space && + !return_type::has_label && return_type::has_pointer, + "Must only give pointer-to-memory for view wrapping"); + + return return_type(args...); +} + } /* namespace Kokkos */ //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp b/lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp similarity index 96% rename from lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp index 04c0c9aeed..37b6e2802f 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp @@ -60,8 +60,8 @@ struct rank_dynamic { static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? 
V : 1); \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ ViewDimension##R& operator=(const ViewDimension##R&) = default; \ }; \ template \ @@ -72,8 +72,8 @@ struct rank_dynamic { struct ViewDimension##R<0u, RD> { \ static constexpr size_t ArgN##R = 0; \ std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ ViewDimension##R& operator=(const ViewDimension##R&) = default; \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ }; \ @@ -149,8 +149,8 @@ struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension static constexpr unsigned rank = sizeof...(Vals); static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; ViewDimension& operator=(const ViewDimension&) = default; KOKKOS_INLINE_FUNCTION @@ -370,8 +370,7 @@ struct ViewDataAnalysis { // ValueType is opportunity for partial specialization. // Must match array analysis when this default template is used. static_assert( - std::is_same::value); + std::is_same_v); public: using specialize = void; // No specialization diff --git a/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp b/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp new file mode 100644 index 0000000000..fd406d58cc --- /dev/null +++ b/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp @@ -0,0 +1,1604 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWLEGACY_HPP +#define KOKKOS_VIEWLEGACY_HPP + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#include +#include +#endif +#include + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \class View + * \brief View to an array of data. + * + * A View represents an array of one or more dimensions. + * For details, please refer to Kokkos' tutorial materials. + * + * \section Kokkos_View_TemplateParameters Template parameters + * + * This class has both required and optional template parameters. The + * \c DataType parameter must always be provided, and must always be + * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are + * placeholders for different template parameters. The default value + * of the fifth template parameter \c Specialize suffices for most use + * cases. When explaining the template parameters, we won't refer to + * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer + * to the valid categories of template parameters, in whatever order + * they may occur. 
+ * + * Valid ways in which template arguments may be specified: + * - View< DataType > + * - View< DataType , Layout > + * - View< DataType , Layout , Space > + * - View< DataType , Layout , Space , MemoryTraits > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + * + * \tparam DataType (required) This indicates both the type of each + * entry of the array, and the combination of compile-time and + * run-time array dimension(s). For example, double* + * indicates a one-dimensional array of \c double with run-time + * dimension, and int*[3] a two-dimensional array of \c int + * with run-time first dimension and compile-time second dimension + * (of 3). In general, the run-time dimensions (if any) must go + * first, followed by zero or more compile-time dimensions. For + * more examples, please refer to the tutorial materials. + * + * \tparam Space (required) The memory space. + * + * \tparam Layout (optional) The array's layout in memory. For + * example, LayoutLeft indicates a column-major (Fortran style) + * layout, and LayoutRight a row-major (C style) layout. If not + * specified, this defaults to the preferred layout for the + * Space. + * + * \tparam MemoryTraits (optional) Assertion of the user's intended + * access behavior. For example, RandomAccess indicates read-only + * access with limited spatial locality, and Unmanaged lets users + * wrap externally allocated memory in a View without automatic + * deallocation. + * + * \section Kokkos_View_MT MemoryTraits discussion + * + * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on + * Space + * + * Some \c MemoryTraits options may have different interpretations for + * different \c Space types. For example, with the Cuda device, + * \c RandomAccess tells Kokkos to fetch the data through the texture + * cache, whereas the non-GPU devices have no such hardware construct. 
+ * + * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits + * + * Users should defer applying the optional \c MemoryTraits parameter + * until the point at which they actually plan to rely on it in a + * computational kernel. This minimizes the number of template + * parameters exposed in their code, which reduces the cost of + * compilation. Users may always assign a View without specified + * \c MemoryTraits to a compatible View with that specification. + * For example: + * \code + * // Pass in the simplest types of View possible. + * void + * doSomething (View out, + * View in) + * { + * // Assign the "generic" View in to a RandomAccess View in_rr. + * // Note that RandomAccess View objects must have const data. + * View in_rr = in; + * // ... do something with in_rr and out ... + * } + * \endcode + */ + +} // namespace Kokkos + +namespace Kokkos { + +template +struct is_always_assignable_impl; + +template +struct is_always_assignable_impl, + Kokkos::View> { + using mapping_type = Kokkos::Impl::ViewMapping< + typename Kokkos::View::traits, + typename Kokkos::View::traits, + typename Kokkos::View::traits::specialize>; + + constexpr static bool value = + mapping_type::is_assignable && + static_cast(Kokkos::View::rank_dynamic) >= + static_cast(Kokkos::View::rank_dynamic); +}; + +template +using is_always_assignable = is_always_assignable_impl< + std::remove_reference_t, + std::remove_const_t>>; + +template +inline constexpr bool is_always_assignable_v = + is_always_assignable::value; + +template +constexpr bool is_assignable(const Kokkos::View& dst, + const Kokkos::View& src) { + using DstTraits = typename Kokkos::View::traits; + using SrcTraits = typename Kokkos::View::traits; + using mapping_type = + Kokkos::Impl::ViewMapping; + + return is_always_assignable_v, + Kokkos::View> || + (mapping_type::is_assignable && + ((DstTraits::dimension::rank_dynamic >= 1) || + (dst.static_extent(0) == src.extent(0))) && + ((DstTraits::dimension::rank_dynamic >= 
2) || + (dst.static_extent(1) == src.extent(1))) && + ((DstTraits::dimension::rank_dynamic >= 3) || + (dst.static_extent(2) == src.extent(2))) && + ((DstTraits::dimension::rank_dynamic >= 4) || + (dst.static_extent(3) == src.extent(3))) && + ((DstTraits::dimension::rank_dynamic >= 5) || + (dst.static_extent(4) == src.extent(4))) && + ((DstTraits::dimension::rank_dynamic >= 6) || + (dst.static_extent(5) == src.extent(5))) && + ((DstTraits::dimension::rank_dynamic >= 7) || + (dst.static_extent(6) == src.extent(6))) && + ((DstTraits::dimension::rank_dynamic >= 8) || + (dst.static_extent(7) == src.extent(7)))); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template +class View; + +template +struct is_view : public std::false_type {}; + +template +struct is_view> : public std::true_type {}; + +template +struct is_view> : public std::true_type {}; + +template +inline constexpr bool is_view_v = is_view::value; + +template +class View : public ViewTraits { + private: + template + friend class View; + template + friend class Kokkos::Impl::ViewMapping; + + using view_tracker_type = Kokkos::Impl::ViewTracker; + + public: + using traits = ViewTraits; + + private: + using map_type = + Kokkos::Impl::ViewMapping; + template + friend struct Kokkos::Impl::ViewTracker; + using hooks_policy = typename traits::hooks_policy; + + view_tracker_type m_track; + map_type m_map; + + public: + //---------------------------------------- + /** \brief Compatible view of array of scalar types */ + using array_type = + View; + + /** \brief Compatible view of const data type */ + using const_type = + View; + + /** \brief Compatible view of non-const data type 
*/ + using non_const_type = + View; + + /** \brief Compatible host mirror view */ + using host_mirror_type = + View, + typename traits::hooks_policy>; + + /** \brief Compatible host mirror view */ + using HostMirror = host_mirror_type; + + /** \brief Unified types */ + using uniform_type = typename Impl::ViewUniformType::type; + using uniform_const_type = + typename Impl::ViewUniformType::const_type; + using uniform_runtime_type = + typename Impl::ViewUniformType::runtime_type; + using uniform_runtime_const_type = + typename Impl::ViewUniformType::runtime_const_type; + using uniform_nomemspace_type = + typename Impl::ViewUniformType::nomemspace_type; + using uniform_const_nomemspace_type = + typename Impl::ViewUniformType::const_nomemspace_type; + using uniform_runtime_nomemspace_type = + typename Impl::ViewUniformType::runtime_nomemspace_type; + using uniform_runtime_const_nomemspace_type = + typename Impl::ViewUniformType::runtime_const_nomemspace_type; + + using reference_type = typename map_type::reference_type; + using pointer_type = typename map_type::pointer_type; + + // Typedefs from mdspan + // using extents_type -> not applicable + // Defining layout_type here made MSVC+CUDA fail + // using layout_type = typename traits::array_layout; + // using accessor_type -> not applicable + // using mapping_type -> not applicable + using element_type = typename traits::value_type; + // using value_type -> conflicts with traits::value_type + using index_type = typename traits::memory_space::size_type; + // using size_type -> already from traits::size_type; where it is + // memory_space::size_type + using rank_type = size_t; + using data_handle_type = pointer_type; + using reference = reference_type; + + //---------------------------------------- + // Domain rank and extents + + static constexpr Impl::integral_constant + rank = {}; + static constexpr Impl::integral_constant + rank_dynamic = {}; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + enum {Rank 
KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = + map_type::Rank}; +#endif + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> + extent(const iType& r) const noexcept { + return m_map.extent(r); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return map_type::static_extent(r); + } + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + int> + extent_int(const iType& r) const noexcept { + return static_cast(m_map.extent(r)); + } + + KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() + const { + return m_map.layout(); + } + + //---------------------------------------- + /* Deprecate all 'dimension' functions in favor of + * ISO/C++ vocabulary 'extent'. + */ + + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { + return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * + m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * + m_map.dimension_6() * m_map.dimension_7(); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { + return m_map.stride_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { + return m_map.stride_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { + return m_map.stride_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { + return m_map.stride_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { + return m_map.stride_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { + return m_map.stride_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { + return m_map.stride_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { + return m_map.stride_7(); + } + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> + stride(iType r) const { + return ( + r == 0 + ? m_map.stride_0() + : (r == 1 + ? m_map.stride_1() + : (r == 2 + ? m_map.stride_2() + : (r == 3 + ? 
m_map.stride_3() + : (r == 4 + ? m_map.stride_4() + : (r == 5 + ? m_map.stride_5() + : (r == 6 + ? m_map.stride_6() + : m_map.stride_7()))))))); + } + + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + m_map.stride(s); + } + + //---------------------------------------- + // Range span is the span which contains all members. + + enum { + reference_type_is_lvalue_reference = + std::is_lvalue_reference_v + }; + + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { + return m_map.span_is_contiguous(); + } + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return m_map.data() != nullptr; + } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { + return m_map.data(); + } + + //---------------------------------------- + // Allow specializations to query their specialized map + + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::ViewMapping& + impl_map() const { + return m_map; + } + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::SharedAllocationTracker& impl_track() const { + return m_track.m_tracker; + } + //---------------------------------------- + + private: + static constexpr bool is_layout_left = + std::is_same_v; + + static constexpr bool is_layout_right = + std::is_same_v; + + static constexpr bool is_layout_stride = + std::is_same_v; + + static constexpr bool is_default_map = + std::is_void_v && + (is_layout_left || is_layout_right || is_layout_stride); + +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); \ + Kokkos::Impl::view_verify_operator_bounds( \ + __VA_ARGS__); + +#else + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) 
\ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); + +#endif + + template + static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); + } + + template + static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); + } + + public: + //------------------------------ + // Rank 1 default map operator() + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 1 operator[] + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.reference(i0); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && !is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return 
m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 default map operator() + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 == rank) && is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + operator()(I0 i0, I1 i1) const { + check_operator_parens_valid_args(i0, i1); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which + // have "inlined" versions above + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + ((0 == rank) || !is_default_map)), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) + return m_map.reference(indices...); + } + + //------------------------------ + // Rank 0 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> + access(Is... extra) const { + check_access_member_function_valid_args(extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) + return m_map.reference(); + } + + //------------------------------ + // Rank 1 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && !is_default_map), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.reference(i0); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (2 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) + return m_map.reference(i0, i1); + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (2 == rank) && + is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + //------------------------------ + // Rank 3 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (3 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (3 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) + return m_map.reference(i0, i1, i2); + } + + //------------------------------ + // Rank 4 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (4 == rank) && + is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (4 == rank) && + !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.reference(i0, i1, i2, i3); + } + + //------------------------------ + // Rank 5 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (5 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (5 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) + return m_map.reference(i0, i1, i2, i3, i4); + } + + //------------------------------ + // Rank 6 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (6 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (6 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5); + } + + //------------------------------ + // Rank 7 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (7 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (7 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + } + + //------------------------------ + // Rank 8 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (8 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) + return m_map + .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (8 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) 
+ return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); + } + +#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY + + //---------------------------------------- + // Standard destructor, constructors, and assignment operators + + KOKKOS_DEFAULTED_FUNCTION + ~View() = default; + + KOKKOS_DEFAULTED_FUNCTION + View() = default; + + KOKKOS_FUNCTION + View(const View& other) : m_track(other.m_track), m_map(other.m_map) { + KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View(View&& other) + : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { + KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View& operator=(const View& other) { + m_map = other.m_map; + m_track = other.m_track; + + KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) + + return *this; + } + + KOKKOS_FUNCTION + View& operator=(View&& other) { + m_map = std::move(other.m_map); + m_track = std::move(other.m_track); + + KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) + + return *this; + } + + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. 
+ + template + KOKKOS_INLINE_FUNCTION View( + const View& rhs, + std::enable_if_t::traits, + typename traits::specialize>::is_assignable_data_type>* = nullptr) + : m_track(rhs), m_map() { + using SrcTraits = typename View::traits; + using Mapping = Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t< + Kokkos::Impl::ViewMapping< + traits, typename View::traits, + typename traits::specialize>::is_assignable_data_type, + View>& + operator=(const View& rhs) { + using SrcTraits = typename View::traits; + using Mapping = Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + m_track.assign(rhs); + return *this; + } + + //---------------------------------------- + // Compatible subview constructor + // may assign unmanaged from managed. + + template + KOKKOS_INLINE_FUNCTION View(const View& src_view, const Arg0 arg0, + Args... 
args) + : m_track(src_view), m_map() { + using SrcType = View; + + using Mapping = Kokkos::Impl::ViewMapping; + + using DstType = typename Mapping::type; + + static_assert( + Kokkos::Impl::ViewMapping::is_assignable, + "Subview construction requires compatible view and subview arguments"); + + Mapping::assign(m_map, src_view.m_map, arg0, args...); + } + + //---------------------------------------- + // Allocation tracking properties + + KOKKOS_INLINE_FUNCTION + int use_count() const { return m_track.m_tracker.use_count(); } + + inline const std::string label() const { + return m_track.m_tracker + .template get_label(); + } + + public: + //---------------------------------------- + // Allocation according to allocation properties and array layout + + template + explicit inline View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track(), m_map() { + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, typename traits::device_type::memory_space{}, + typename traits::device_type::execution_space{}); + using alloc_prop = decltype(prop_copy); + + static_assert(traits::is_managed, + "View allocation constructor requires managed memory"); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. 
+ Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + const std::string& alloc_name = + Impl::get_property(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif + + Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( + prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); + + // Setup and initialization complete, start tracking + m_track.m_tracker.assign_allocated_record_to_uninitialized(record); + } + + KOKKOS_INLINE_FUNCTION + void assign_data(pointer_type arg_data) { + m_track.m_tracker.clear(); + m_map.assign_data(arg_data); + } + + // Wrap memory according to properties and array layout + template + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track() // No memory tracking + , + m_map(arg_prop, arg_layout) { + static_assert( + std::is_same::pointer_type>::value, + "Constructing View to wrap user memory must supply matching pointer " + "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + 
size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif + } + + // Simple dimension-only layout + template + explicit inline View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + template + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + } + + // Allocate with label and layout + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, + typename traits::array_layout> const& arg_layout) + : View(Impl::ViewCtorProp(arg_label), arg_layout) {} + + // Allocate label and layout, must disambiguate from subview constructor. + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, const size_t> + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp(arg_label), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + // Construct view from ViewTracker and map + // This should be the preferred method because future extensions may need to + // use the ViewTracker class. 
+ template + KOKKOS_INLINE_FUNCTION View( + const view_tracker_type& track, + const Kokkos::Impl::ViewMapping& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track.m_tracker); + } + + // Construct View from internal shared allocation tracker object and map + // This is here for backwards compatibility for classes that derive from + // Kokkos::View + template + KOKKOS_INLINE_FUNCTION View( + const typename view_tracker_type::track_type& track, + const Kokkos::Impl::ViewMapping& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track); + } + + //---------------------------------------- + // Memory span required to wrap these dimensions. + static constexpr size_t required_allocation_size( + typename traits::array_layout const& layout) { + return map_type::memory_span(layout); + } + + static constexpr size_t required_allocation_size( + const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, + const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, + const size_t arg_N6 = 0, const size_t arg_N7 = 0) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + return map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp(arg_ptr), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} + + //---------------------------------------- + // Shared scratch memory constructor + + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + const size_t num_passed_args = Impl::count_valid_integers( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); + + if (std::is_void_v && + num_passed_args != rank_dynamic) { + Kokkos::abort( + "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); + } + + return View::shmem_size(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + private: + // Want to be able to align to minimum scratch alignment or sizeof or alignof + // elements + static constexpr size_t scratch_value_alignment = + max({sizeof(typename traits::value_type), + alignof(typename traits::value_type), + static_cast( + traits::execution_space::scratch_memory_space::ALIGN)}); + + public: + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(typename traits::array_layout const& arg_layout) { + return map_type::memory_span(arg_layout) + scratch_value_alignment; + } + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp(reinterpret_cast( + arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), + scratch_value_alignment))), + arg_layout) {} + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp( + reinterpret_cast(arg_space.get_shmem_aligned( + map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, 
arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, + arg_N7)), + scratch_value_alignment))), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + //---------------------------------------- + // MDSpan converting constructors +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN + template ::mdspan_type> + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(traits::is_managed) +#endif + View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, + std::enable_if_t< + !std::is_same_v>* = + nullptr) + : View(mds.data_handle(), + Impl::array_layout_from_mapping< + typename traits::array_layout, + typename Impl::MDSpanViewTraits::mdspan_type>( + mds.mapping())) { + } + + template + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(!std::is_convertible_v< + Kokkos::mdspan, + typename Impl::MDSpanViewTraits::mdspan_type>) +#endif + View(const Kokkos::mdspan& mds) + : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { + } + + //---------------------------------------- + // Conversion to MDSpan + template ::mdspan_type, + typename = std::enable_if_t, + std::false_type, + std::is_assignable, + ImplNaturalMDSpanType>>::value>> + KOKKOS_INLINE_FUNCTION constexpr operator mdspan< + OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + return mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map)}; + } + + template >, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType& other_accessor = + typename Impl::MDSpanViewTraits::accessor_type()) { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + using ret_mdspan_type = + mdspan; + return ret_mdspan_type{data(), + 
Impl::mapping_from_view_mapping(m_map), + other_accessor}; + } +#endif // KOKKOS_ENABLE_IMPL_MDSPAN +}; + +template +KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { + return View::rank(); +} + +namespace Impl { + +template +struct RankDataType { + using type = typename RankDataType::type*; +}; + +template +struct RankDataType { + using type = ValueType; +}; + +template +KOKKOS_FUNCTION std::enable_if_t< + N == View::rank() && + std::is_same_v::specialize, void>, + View> +as_view_of_rank_n(View v) { + return v; +} + +// Placeholder implementation to compile generic code for DynRankView; should +// never be called +template +KOKKOS_FUNCTION std::enable_if_t< + N != View::rank() && + std::is_same_v::specialize, void>, + View::value_type, N>::type, + Args...>> +as_view_of_rank_n(View) { + Kokkos::abort("Trying to get at a View of the wrong rank"); + return {}; +} + +template +void apply_to_view_of_static_rank(Function&& f, View a) { + f(a); +} + +} // namespace Impl + +template +KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { + static_assert(View::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); +} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, + Args... 
args) { + static_assert(View::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + static_assert(Kokkos::is_memory_traits::value); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); +} +#endif + +template +using Subview = decltype(subview(std::declval(), std::declval()...)); + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template +KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, + const View& rhs) { + // Same data, layout, dimensions + using lhs_traits = ViewTraits; + using rhs_traits = ViewTraits; + + return std::is_same_v && + std::is_same_v && + std::is_same_v && + View::rank() == View::rank() && + lhs.data() == rhs.data() && lhs.span() == rhs.span() && + lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && + lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && + lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && + lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); +} + +template +KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, + const View& rhs) { + return !(operator==(lhs, rhs)); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template +struct CommonViewValueType; + +template +struct CommonViewValueType { + using value_type = std::common_type_t; +}; + +template +struct CommonViewAllocProp; + +template +struct CommonViewAllocProp { + using value_type = ValueType; + using scalar_array_type = ValueType; + + template + KOKKOS_INLINE_FUNCTION CommonViewAllocProp(const 
Views&...) {} +}; + +template +struct DeduceCommonViewAllocProp; + +// Base case must provide types for: +// 1. specialize 2. value_type 3. is_view 4. prop_type +template +struct DeduceCommonViewAllocProp { + using specialize = typename FirstView::traits::specialize; + + using value_type = typename FirstView::traits::value_type; + + enum : bool { is_view = is_view::value }; + + using prop_type = CommonViewAllocProp; +}; + +template +struct DeduceCommonViewAllocProp { + using NextTraits = DeduceCommonViewAllocProp; + + using first_specialize = typename FirstView::traits::specialize; + using first_value_type = typename FirstView::traits::value_type; + + enum : bool { first_is_view = is_view::value }; + + using next_specialize = typename NextTraits::specialize; + using next_value_type = typename NextTraits::value_type; + + enum : bool { next_is_view = NextTraits::is_view }; + + // common types + + // determine specialize type + // if first and next specialize differ, but are not the same specialize, error + // out + static_assert(!(!std::is_same_v && + !std::is_void_v && + !std::is_void_v), + "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " + "specialize trait allowed"); + + // otherwise choose non-void specialize if either/both are non-void + using specialize = + std::conditional_t, + first_specialize, + std::conditional_t<(std::is_void_v && + !std::is_void_v), + next_specialize, first_specialize>>; + + using value_type = typename CommonViewValueType::value_type; + + enum : bool { is_view = (first_is_view && next_is_view) }; + + using prop_type = CommonViewAllocProp; +}; + +} // end namespace Impl + +template +using DeducedCommonPropsType = + typename Impl::DeduceCommonViewAllocProp::prop_type; + +// This function is required in certain scenarios where users customize +// Kokkos View internals. One example are dynamic length embedded ensemble +// types. 
The function is used to propagate necessary information +// (like the ensemble size) when creating new views. +// However, most of the time it is called with a single view. +// Furthermore, the propagated information is not just for view allocations. +// From what I can tell, the type of functionality provided by +// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, +// a mechanism we will eventually use to replace this clunky approach here, when +// we are finally mdspan based. +// TODO: get rid of this when we have mdspan +template +KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( + Views const&... views) { + return DeducedCommonPropsType(views...); +} + +} // namespace Kokkos + +#include +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWLEGACY_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp similarity index 90% rename from lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp index 10aaa63b7c..ecc19eaf5e 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp @@ -28,61 +28,41 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include #include #include #include -#include +#include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { - -struct ALL_t { - KOKKOS_INLINE_FUNCTION - constexpr const ALL_t& operator()() const { return *this; } - - KOKKOS_INLINE_FUNCTION - constexpr bool operator==(const ALL_t&) const { return true; } -}; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -namespace Impl { -// TODO This alias 
declaration forces us to fully qualify ALL_t inside the -// Kokkos::Impl namespace to avoid deprecation warnings. Replace the -// fully-qualified name when we remove Kokkos::Impl::ALL_t. -using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = - Kokkos::ALL_t; -} // namespace Impl -#endif -} // namespace Kokkos - namespace Kokkos { namespace Impl { template struct is_integral_extent_type { - enum : bool { value = std::is_same::value ? 1 : 0 }; + enum : bool { value = std::is_same_v ? 1 : 0 }; }; template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; // Assuming '2 == initializer_list::size()' template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 
1 : 0 }; }; template @@ -93,8 +73,7 @@ struct is_integral_extent { enum : bool { value = is_integral_extent_type::value }; - static_assert(value || std::is_integral::value || - std::is_void::value, + static_assert(value || std::is_integral_v || std::is_void_v, "subview argument must be either integral or integral extent"); }; @@ -112,16 +91,16 @@ struct SubviewLegalArgsCompileTime { enum { - value = (((CurrentArg == RankDest - 1) && - (Kokkos::Impl::is_integral_extent_type::value)) || - ((CurrentArg >= RankDest) && (std::is_integral::value)) || - ((CurrentArg < RankDest) && - (std::is_same::value)) || - ((CurrentArg == 0) && - (Kokkos::Impl::is_integral_extent_type::value))) && - (SubviewLegalArgsCompileTime::value) + value = + (((CurrentArg == RankDest - 1) && + (Kokkos::Impl::is_integral_extent_type::value)) || + ((CurrentArg >= RankDest) && (std::is_integral_v)) || + ((CurrentArg < RankDest) && (std::is_same_v)) || + ((CurrentArg == 0) && + (Kokkos::Impl::is_integral_extent_type::value))) && + (SubviewLegalArgsCompileTime::value) }; }; @@ -129,7 +108,7 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankDest - 1) || (std::is_integral::value)) && + value = ((CurrentArg == RankDest - 1) || (std::is_integral_v)) && (CurrentArg == RankSrc - 1) }; }; @@ -144,10 +123,9 @@ struct SubviewLegalArgsCompileTime::value)) || - ((CurrentArg < RankSrc - RankDest) && - (std::is_integral::value)) || + ((CurrentArg < RankSrc - RankDest) && (std::is_integral_v)) || ((CurrentArg >= RankSrc - RankDest) && - (std::is_same::value))) && + (std::is_same_v))) && (SubviewLegalArgsCompileTime::value) @@ -158,8 +136,8 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankSrc - 1) && - (std::is_same::value)) + value = + ((CurrentArg == RankSrc - 1) && (std::is_same_v)) }; }; @@ -392,7 +370,7 @@ struct SubviewExtents { const int n = snprintf(buffer, LEN, "Kokkos::subview bounds error ("); error(buffer + n, LEN - n, 0, 0, dim, 
args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE(((void)dim; Kokkos::abort("Kokkos::subview bounds error"); @@ -718,8 +696,8 @@ struct ViewOffset< return *this; } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -885,14 +863,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1071,8 +1052,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1086,7 +1067,11 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride(Padding::stride(arg_layout.dimension[0])) {} + m_stride( + arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG + ? arg_layout.stride + : Padding::stride(arg_layout.dimension[0])) { + } template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1407,8 +1392,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1565,14 +1550,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? 
m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1614,8 +1602,8 @@ struct ViewOffset< } KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * - m_dim.N2 * m_dim.N1; + return m_stride == static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * + m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1; } /* Strides of dimensions */ @@ -1624,19 +1612,21 @@ struct ViewOffset< return m_dim.N7; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { - return m_dim.N7 * m_dim.N6; + return static_cast(m_dim.N7) * m_dim.N6; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3 * m_dim.N2; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride; @@ -1749,13 +1739,31 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif /* Enable padding for trivial scalar types with non-zero trivial scalar size. 
*/ + + private: + template + KOKKOS_FUNCTION constexpr size_type compute_stride( + const Kokkos::LayoutRight& arg_layout) { + if (arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG) + return arg_layout.stride; + size_type value = m_dim.N1; + if constexpr (dimension_type::rank > 2) value *= m_dim.N2; + if constexpr (dimension_type::rank > 3) value *= m_dim.N3; + if constexpr (dimension_type::rank > 4) value *= m_dim.N4; + if constexpr (dimension_type::rank > 5) value *= m_dim.N5; + if constexpr (dimension_type::rank > 6) value *= m_dim.N6; + if constexpr (dimension_type::rank > 7) value *= m_dim.N7; + return Padding::stride(value); + } + + public: template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( std::integral_constant const&, @@ -1764,37 +1772,7 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride( - Padding:: - stride(/* 2 <= rank */ - m_dim.N1 * - (dimension_type::rank == 2 - ? size_t(1) - : m_dim.N2 * - (dimension_type::rank == 3 - ? size_t(1) - : m_dim.N3 * - (dimension_type::rank == 4 - ? size_t(1) - : m_dim.N4 * - (dimension_type::rank == - 5 - ? size_t(1) - : m_dim.N5 * - (dimension_type:: - rank == - 6 - ? size_t( - 1) - : m_dim.N6 * - (dimension_type:: - rank == - 7 - ? 
size_t( - 1) - : m_dim - .N7)))))))) { - } + m_stride(compute_stride(arg_layout)) {} template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1886,8 +1864,8 @@ struct ViewStride<0> { static constexpr size_t S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1901,8 +1879,8 @@ struct ViewStride<1> { static constexpr size_t S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1916,8 +1894,8 @@ struct ViewStride<2> { size_t S0, S1; static constexpr size_t S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1931,8 +1909,8 @@ struct ViewStride<3> { size_t S0, S1, S2; static constexpr size_t S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1946,8 +1924,8 @@ struct ViewStride<4> { size_t S0, S1, S2, S3; static constexpr size_t S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1961,8 +1939,8 @@ struct ViewStride<5> { size_t S0, S1, S2, S3, S4; static constexpr size_t S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - 
ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1976,8 +1954,8 @@ struct ViewStride<6> { size_t S0, S1, S2, S3, S4, S5; static constexpr size_t S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1991,8 +1969,8 @@ struct ViewStride<7> { size_t S0, S1, S2, S3, S4, S5, S6; static constexpr size_t S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2005,8 +1983,8 @@ template <> struct ViewStride<8> { size_t S0, S1, S2, S3, S4, S5, S6, S7; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2283,8 +2261,8 @@ struct ViewOffset { } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -2398,9 +2376,9 @@ struct ViewDataHandle { template struct ViewDataHandle< Traits, - std::enable_if_t<(std::is_same::value && - std::is_void::value && + std::enable_if_t<(std::is_same_v && + std::is_void_v && Traits::memory_traits::is_atomic)>> { using value_type = typename Traits::value_type; using handle_type = typename Kokkos::Impl::AtomicViewDataHandle; @@ -2422,11 +2400,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - (!Traits::memory_traits::is_aligned) && - Traits::memory_traits::is_restrict && - 
(!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + (!Traits::memory_traits::is_aligned) && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; using handle_type = typename Traits::value_type* KOKKOS_RESTRICT; using return_type = typename Traits::value_type& KOKKOS_RESTRICT; @@ -2446,11 +2423,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - Traits::memory_traits::is_aligned && - (!Traits::memory_traits::is_restrict) && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + Traits::memory_traits::is_aligned && + (!Traits::memory_traits::is_restrict) && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2485,11 +2461,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - Traits::memory_traits::is_aligned && - Traits::memory_traits::is_restrict && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + Traits::memory_traits::is_aligned && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2533,11 +2508,10 @@ namespace Impl { /** \brief View mapping for non-specialized data type and standard layout */ template class ViewMapping< - Traits, - std::enable_if_t<( - std::is_void::value && - ViewOffset::is_mapping_plugin::value)>> { + Traits, std::enable_if_t<(std::is_void_v && + ViewOffset::is_mapping_plugin::value)>> { public: using offset_type = ViewOffset; @@ -2680,28 +2654,26 @@ class ViewMapping< reference_type reference() const { return m_impl_handle[0]; } 
template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral::value && - // if layout is neither stride nor irregular, - // then just use the handle directly - !(std::is_same::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v && + // if layout is neither stride nor irregular, + // then just use the handle directly + !(std::is_same_v || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral::value && - // if the layout is strided or irregular, then - // we have to use the offset - (std::is_same::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v && + // if the layout is strided or irregular, then + // we have to use the offset + (std::is_same_v || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[m_impl_offset(i0)]; } @@ -2780,7 +2752,7 @@ class ViewMapping< KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(const ViewMapping&) = default; - KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; + KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(ViewMapping&&) = default; //---------------------------------------- @@ -2894,29 +2866,34 @@ template class ViewMapping< DstTraits, SrcTraits, std::enable_if_t<( - !(std::is_same:: - value) && // Added to have a new specialization for SrcType of - // LayoutStride + !(std::is_same_v)&& // Added to have a new + // specialization for + // SrcType of + // LayoutStride // default mappings - std::is_void::value && - std::is_void::value && + std::is_void_v && + std::is_void_v && ( // same layout - std::is_same::value || + std::is_same_v || // known layout - 
((std::is_same::value || - std::is_same::value || - std::is_same::value) && - (std::is_same::value || - std::is_same::value || - std::is_same::value))))>> { + ((std::is_same_v || + std::is_same_v || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>))))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -2926,10 +2903,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { @@ -2939,12 +2916,12 @@ class ViewMapping< }; enum { - is_assignable_layout = - std::is_same::value || - std::is_same::value || - (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1) + is_assignable_layout = std::is_same_v || + std::is_same_v || + (DstTraits::dimension::rank == 0) || + (DstTraits::dimension::rank == 1) }; public: @@ -3032,22 +3009,21 @@ class ViewMapping< template class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<( - std::is_same::value && - std::is_void::value && - std::is_void::value && - ( - // same layout - std::is_same::value || - // known layout - (std::is_same::value || - std::is_same::value || - std::is_same::value)))>> { + std::enable_if_t<(std::is_same_v && + std::is_void_v && + std::is_void_v && + ( + // same layout + std::is_same_v || + // known layout + (std::is_same_v || + std::is_same_v || + std::is_same_v)))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -3057,10 +3033,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { @@ -3091,8 +3067,7 @@ class ViewMapping< bool assignable = true; src.stride(strides); size_t exp_stride = 1; - if (std::is_same::value) { + if (std::is_same_v) { for 
(int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(i - 1); if (strides[i] != exp_stride) { @@ -3100,8 +3075,8 @@ class ViewMapping< break; } } - } else if (std::is_same::value) { + } else if (std::is_same_v) { for (int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(src.Rank - i); if (strides[src.Rank - 1 - i] != exp_stride) { @@ -3197,8 +3172,8 @@ struct SubViewDataTypeImpl> { template struct SubViewDataTypeImpl< - std::enable_if_t>::value>, - ValueType, Kokkos::Experimental::Extents, Integral, Args...> + std::enable_if_t>>, ValueType, + Kokkos::Experimental::Extents, Integral, Args...> : SubViewDataTypeImpl, Args...> {}; @@ -3230,13 +3205,13 @@ struct SubViewDataType : SubViewDataTypeImpl {}; template class ViewMapping< - std::enable_if_t<(std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>, + std::enable_if_t<( + std::is_void_v && + (std::is_same_v || + std::is_same_v || + std::is_same_v))>, SrcTraits, Args...> { private: static_assert(SrcTraits::rank == sizeof...(Args), @@ -3292,14 +3267,14 @@ class ViewMapping< // OutputRank 1 or 2, InputLayout Left, Interval 0 // because single stride one or second index has a stride. (rank <= 2 && R0 && - std::is_same::value) // replace with input rank + std::is_same_v) // replace with input rank || // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1] // because single stride one or second index has a stride. 
(rank <= 2 && R0_rev && - std::is_same::value) // replace input rank + std::is_same_v) // replace input rank ), typename SrcTraits::array_layout, Kokkos::LayoutStride>; diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTracker.hpp b/lib/kokkos/core/src/View/Kokkos_ViewTracker.hpp similarity index 100% rename from lib/kokkos/core/src/impl/Kokkos_ViewTracker.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewTracker.hpp diff --git a/lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp b/lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp new file mode 100644 index 0000000000..5eddfc68e0 --- /dev/null +++ b/lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp @@ -0,0 +1,457 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWTRAITS_HPP +#define KOKKOS_VIEWTRAITS_HPP + +#include +#include +#include +#include +#include +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#include +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +struct ALL_t { + KOKKOS_FUNCTION + constexpr const ALL_t& operator()() const { return *this; } + + KOKKOS_FUNCTION + constexpr bool operator==(const ALL_t&) const { return true; } +}; + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +// TODO This alias declaration forces us to fully qualify ALL_t inside the +// Kokkos::Impl namespace to avoid deprecation warnings. Replace the +// fully-qualified name when we remove Kokkos::Impl::ALL_t. 
+using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = + Kokkos::ALL_t; +} // namespace Impl +#endif + +// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with +// the OpenMPTarget backend +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp declare target +#endif + +inline constexpr Kokkos::ALL_t ALL{}; + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp end declare target +#endif + +namespace Impl { + +template +struct ViewArrayAnalysis; + +template ::non_const_value_type> +struct ViewDataAnalysis; + +template +class ViewMapping { + public: + enum : bool { is_assignable_data_type = false }; + enum : bool { is_assignable = false }; +}; + +template +constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( + const IntType i0, const IntType i1, const IntType i2, const IntType i3, + const IntType i4, const IntType i5, const IntType i6, const IntType i7) { + static_assert(std::is_integral_v, + "count_valid_integers() must have integer arguments."); + + return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + + (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + + (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + + (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); +} + +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. 
+template +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { + (void)(label); + + if (is_void_spec) { + const size_t num_passed_args = + count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. + constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; + + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } + + if (!n_args_is_dyn_rank && !n_args_is_rank) { + KOKKOS_IF_ON_HOST( + const std::string message = + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. 
The number " + "of arguments = " + + std::to_string(num_passed_args) + + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " + "mismatched number of arguments.");) + } + } +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +// Class to provide a uniform type +namespace Kokkos { +namespace Impl { +template +struct ViewUniformType; +} +} // namespace Kokkos + +namespace Kokkos { + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +namespace Impl { +struct UnsupportedKokkosArrayLayout; + +template +struct MDSpanViewTraits { + using mdspan_type = UnsupportedKokkosArrayLayout; +}; + +// "Natural" mdspan for a view if the View's ArrayLayout is supported. +template +struct MDSpanViewTraits::type>> { + using index_type = std::size_t; + using extents_type = + typename Impl::ExtentsFromDataType::type; + using mdspan_layout_type = + typename LayoutFromArrayLayout::type; + using accessor_type = + SpaceAwareAccessor>; + using mdspan_type = mdspan; +}; +} // namespace Impl +#endif // KOKKOS_ENABLE_IMPL_MDSPAN + +/** \class ViewTraits + * \brief Traits class for accessing attributes of a View. + * + * This is an implementation detail of View. It is only of interest + * to developers implementing a new specialization of View. 
+ * + * Template argument options: + * - View< DataType > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , ArrayLayout > + * - View< DataType , ArrayLayout , Space > + * - View< DataType , ArrayLayout , MemoryTraits > + * - View< DataType , ArrayLayout , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + */ + +template +struct ViewTraits; + +template <> +struct ViewTraits { + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = void; + using specialize = void; + using hooks_policy = void; +}; + +template +struct ViewTraits { + // Ignore an extraneous 'void' + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = typename ViewTraits::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits< + std::enable_if_t::value>, + HooksPolicy, Prop...> { + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = typename ViewTraits::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = HooksPolicy; +}; + +template +struct ViewTraits::value>, + ArrayLayout, Prop...> { + // Specify layout, keep subsequent space and memory traits arguments + + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = ArrayLayout; + using 
memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits::value>, Space, + Prop...> { + // Specify Space, memory traits should be the only subsequent argument. + + static_assert( + std::is_same_v::execution_space, + void> && + std::is_same_v::memory_space, + void> && + std::is_same_v::HostMirrorSpace, + void> && + std::is_same_v::array_layout, + void>, + "Only one View Execution or Memory Space template argument"); + + using execution_space = typename Space::execution_space; + using memory_space = typename Space::memory_space; + using HostMirrorSpace = + typename Kokkos::Impl::HostMirror::Space::memory_space; + using array_layout = typename execution_space::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits< + std::enable_if_t::value>, + MemoryTraits, Prop...> { + // Specify memory trait, should not be any subsequent arguments + + static_assert( + std::is_same_v::execution_space, + void> && + std::is_same_v::memory_space, + void> && + std::is_same_v::array_layout, + void> && + std::is_same_v::memory_traits, + void> && + std::is_same_v::hooks_policy, + void>, + "MemoryTrait is the final optional template argument for a View"); + + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = MemoryTraits; + using specialize = void; + using hooks_policy = void; +}; + +template +struct ViewTraits { + private: + // Unpack the properties arguments + using prop = ViewTraits; + + using ExecutionSpace = + std::conditional_t, + typename prop::execution_space, + Kokkos::DefaultExecutionSpace>; + + using MemorySpace = + std::conditional_t, + typename prop::memory_space, + typename 
ExecutionSpace::memory_space>; + + using ArrayLayout = + std::conditional_t, + typename prop::array_layout, + typename ExecutionSpace::array_layout>; + + using HostMirrorSpace = std::conditional_t< + !std::is_void_v, + typename prop::HostMirrorSpace, + typename Kokkos::Impl::HostMirror::Space>; + + using MemoryTraits = + std::conditional_t, + typename prop::memory_traits, + typename Kokkos::MemoryManaged>; + + using HooksPolicy = + std::conditional_t, + typename prop::hooks_policy, + Kokkos::Experimental::DefaultViewHooks>; + + // Analyze data type's properties, + // May be specialized based upon the layout and value type + using data_analysis = Kokkos::Impl::ViewDataAnalysis; + + public: + //------------------------------------ + // Data type traits: + + using data_type = typename data_analysis::type; + using const_data_type = typename data_analysis::const_type; + using non_const_data_type = typename data_analysis::non_const_type; + + //------------------------------------ + // Compatible array of trivial type traits: + + using scalar_array_type = typename data_analysis::scalar_array_type; + using const_scalar_array_type = + typename data_analysis::const_scalar_array_type; + using non_const_scalar_array_type = + typename data_analysis::non_const_scalar_array_type; + + //------------------------------------ + // Value type traits: + + using value_type = typename data_analysis::value_type; + using const_value_type = typename data_analysis::const_value_type; + using non_const_value_type = typename data_analysis::non_const_value_type; + + //------------------------------------ + // Mapping traits: + + using array_layout = ArrayLayout; + using dimension = typename data_analysis::dimension; + + using specialize = std::conditional_t< + std::is_void_v, + typename prop::specialize, + typename data_analysis::specialize>; /* mapping specialization tag */ + + static constexpr unsigned rank = dimension::rank; + static constexpr unsigned rank_dynamic = dimension::rank_dynamic; 
+ + //------------------------------------ + // Execution space, memory space, memory access traits, and host mirror space. + + using execution_space = ExecutionSpace; + using memory_space = MemorySpace; + using device_type = Kokkos::Device; + using memory_traits = MemoryTraits; + using host_mirror_space = HostMirrorSpace; + using hooks_policy = HooksPolicy; + + using size_type = typename MemorySpace::size_type; + + enum { is_hostspace = std::is_same_v }; + enum { is_managed = MemoryTraits::is_unmanaged == 0 }; + enum { is_random_access = MemoryTraits::is_random_access == 1 }; + + //------------------------------------ +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Impl { +template +struct TypeListToViewTraits; + +template +struct TypeListToViewTraits> { + using type = ViewTraits; +}; + +// It is not safe to assume that subviews of views with the Aligned memory trait +// are also aligned. Hence, just remove that attribute for subviews. 
+template +struct RemoveAlignedMemoryTrait { + private: + using type_list_in = Kokkos::Impl::type_list; + using memory_traits = typename ViewTraits::memory_traits; + using type_list_in_wo_memory_traits = + typename Kokkos::Impl::type_list_remove_first::type; + using new_memory_traits = + Kokkos::MemoryTraits; + using new_type_list = typename Kokkos::Impl::concat_type_list< + type_list_in_wo_memory_traits, + Kokkos::Impl::type_list>::type; + + public: + using type = typename TypeListToViewTraits::type; +}; +} // namespace Impl + +} /* namespace Kokkos */ + +#endif /* KOKKOS_VIEWTRAITS_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp b/lib/kokkos/core/src/View/Kokkos_ViewUniformType.hpp similarity index 88% rename from lib/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewUniformType.hpp index 7de2869a0d..1e47613285 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewUniformType.hpp @@ -24,11 +24,14 @@ namespace Impl { template struct ViewScalarToDataType { using type = typename ViewScalarToDataType::type *; + using const_type = + typename ViewScalarToDataType::const_type *; }; template struct ViewScalarToDataType { - using type = ScalarType; + using type = ScalarType; + using const_type = const ScalarType; }; template @@ -49,12 +52,13 @@ struct ViewUniformLayout { template struct ViewUniformType { using data_type = typename ViewType::data_type; - using const_data_type = std::add_const_t; + using const_data_type = typename ViewType::const_data_type; using runtime_data_type = typename ViewScalarToDataType::type; - using runtime_const_data_type = typename ViewScalarToDataType< - std::add_const_t, ViewType::rank>::type; + using runtime_const_data_type = + typename ViewScalarToDataType::const_type; using array_layout = typename ViewUniformLayout { } KOKKOS_FUNCTION - constexpr typename offset_policy::data_handle_type offset(data_handle_type p, - 
size_t i) const - noexcept { + constexpr typename offset_policy::data_handle_type offset( + data_handle_type p, size_t i) const noexcept { return nested_acc.offset(p, i); } @@ -214,6 +212,199 @@ struct AtomicAccessorRelaxed { } }; +//===================================================================== +//============= Reference Counted Accessor and DataHandle ============= +//===================================================================== + +template +class ReferenceCountedDataHandle { + public: + using value_type = ElementType; + using pointer = value_type*; + using reference = value_type&; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + ReferenceCountedDataHandle() = default; + + // this only ever works on host + explicit ReferenceCountedDataHandle(SharedAllocationRecord* rec) { + m_tracker.assign_allocated_record_to_uninitialized(rec); + m_handle = static_cast(get_record()->data()); + } + + KOKKOS_FUNCTION + ReferenceCountedDataHandle(const SharedAllocationTracker& tracker, + pointer data_handle) + : m_tracker(tracker), m_handle(data_handle) {} + + // unmanaged ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle(OtherElementType* ptr) + : m_tracker(), m_handle(ptr) {} + + // subview ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other, OtherElementType* ptr) + : m_tracker(other.m_tracker), m_handle(ptr) {} + + // converting ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + template < + class OtherElementType, class OtherSpace, + class = std::enable_if_t< + std::is_convertible_v && + (std::is_same_v || + std::is_same_v)>> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + KOKKOS_FUNCTION + pointer get() const noexcept { return m_handle; } + 
KOKKOS_FUNCTION + explicit operator pointer() const noexcept { return m_handle; } + + bool has_record() const { return m_tracker.has_record(); } + auto* get_record() const { return m_tracker.get_record(); } + int use_count() const noexcept { return m_tracker.use_count(); } + + std::string get_label() const { return m_tracker.get_label(); } + KOKKOS_FUNCTION + const SharedAllocationTracker& tracker() const noexcept { return m_tracker; } + + KOKKOS_FUNCTION + friend bool operator==(const ReferenceCountedDataHandle& lhs, + const value_type* rhs) { + return lhs.m_handle == rhs; + } + + KOKKOS_FUNCTION + friend bool operator==(const value_type* lhs, + const ReferenceCountedDataHandle& rhs) { + return lhs == rhs.m_handle; + } + + private: + template + friend class ReferenceCountedDataHandle; + + template + friend class ReferenceCountedAccessor; + + SharedAllocationTracker m_tracker; + pointer m_handle = nullptr; +}; + +template +class ReferenceCountedAccessor; + +template +struct IsReferenceCountedAccessor : std::false_type {}; + +template +struct IsReferenceCountedAccessor< + ReferenceCountedAccessor> + : std::true_type {}; + +template +class ReferenceCountedAccessor { + public: + using element_type = ElementType; + using data_handle_type = ReferenceCountedDataHandle; + using reference = typename NestedAccessor::reference; + using offset_policy = + ReferenceCountedAccessor; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + constexpr ReferenceCountedAccessor() noexcept = default; + + template < + class OtherElementType, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v && + std::is_constructible_v>> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const ReferenceCountedAccessor&) {} + + template < + class OtherElementType, class OtherSpace, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v && + (std::is_same_v || + std::is_same_v)&&std:: + is_constructible_v>> + KOKKOS_FUNCTION constexpr 
ReferenceCountedAccessor( + const ReferenceCountedAccessor&) {} + + template >> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const default_accessor&) {} + + template ::value && + std::is_convertible_v>> + KOKKOS_FUNCTION operator DstAccessor() const { + return m_nested_acc; + } + + KOKKOS_FUNCTION + constexpr reference access(data_handle_type p, size_t i) const { + return m_nested_acc.access(p.get(), i); + } + + KOKKOS_FUNCTION + constexpr data_handle_type offset(data_handle_type p, size_t i) const { + return data_handle_type(p, m_nested_acc.offset(p.get(), i)); + } + + KOKKOS_FUNCTION + constexpr auto nested_accessor() const { return m_nested_acc; } + + private: +#ifdef _MDSPAN_NO_UNIQUE_ADDRESS + _MDSPAN_NO_UNIQUE_ADDRESS +#else + [[no_unique_address]] +#endif + NestedAccessor m_nested_acc; +}; + +template +using CheckedReferenceCountedAccessor = + SpaceAwareAccessor>>; + +template +using CheckedRelaxedAtomicAccessor = + SpaceAwareAccessor>; + +template +using CheckedReferenceCountedRelaxedAtomicAccessor = SpaceAwareAccessor< + MemorySpace, ReferenceCountedAccessor>>; + } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp index 089628137d..f990d158bf 100644 --- a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp +++ b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp @@ -23,7 +23,11 @@ static_assert(false, #define KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP #include "Kokkos_MDSpan_Extents.hpp" -#include +#include + +// The difference between a legacy Kokkos array layout and an +// mdspan layout is that the array layouts can have state, but don't have the +// nested mapping. This file provides interoperability helpers. namespace Kokkos::Impl { @@ -77,32 +81,7 @@ KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( rank > 7 ? 
mapping.stride(7) : 0, }; } else { - // FIXME: Kokkos Layouts don't store stride (it's in the mapping) - // We could conceivably fix this by adding an extra ViewCtorProp for - // an abritrary padding. For now we will check for this. - if constexpr (rank > 1 && - (std::is_same_v> || - std::is_same_v>)) { - [[maybe_unused]] constexpr size_t strided_index = - std::is_same_v< - typename mapping_type::layout_type, - Kokkos::Experimental::layout_left_padded> - ? 1 - : rank - 2; - [[maybe_unused]] constexpr size_t extent_index = - std::is_same_v< - typename mapping_type::layout_type, - Kokkos::Experimental::layout_left_padded> - ? 0 - : rank - 1; - KOKKOS_ASSERT(mapping.stride(strided_index) == ext.extent(extent_index)); - } - - return ArrayLayout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + ArrayLayout layout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -110,12 +89,98 @@ KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 7 ? 
ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + + if constexpr (rank > 1 && + std::is_same_v>) { + layout.stride = mapping.stride(1); + } + if constexpr (std::is_same_v>) { + if constexpr (rank == 2) { + layout.stride = mapping.stride(0); + } + if constexpr (rank > 2) { + if (mapping.stride(rank - 2) != mapping.extents().extent(rank - 1)) + Kokkos::abort( + "Invalid conversion from layout_right_padded to LayoutRight"); + } + } + return layout; } #ifdef KOKKOS_COMPILER_INTEL __builtin_unreachable(); #endif } +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + ArrayLayout layout, std::index_sequence) { + using index_type = typename MappingType::index_type; + using extents_type = typename MappingType::extents_type; + if constexpr (std::is_same_v || + std::is_same_v) { + return MappingType{ + extents_type{dextents{ + layout.dimension[Idx]...}}}; + } else { + if (layout.stride == KOKKOS_IMPL_CTOR_DEFAULT_ARG || + extents_type::rank() < 2) { + return MappingType{ + extents_type{dextents{ + layout.dimension[Idx]...}}}; + } else { + if constexpr (std::is_same_v && + extents_type::rank() > 2) { + size_t product_of_dimensions = 1; + for (size_t r = 1; r < extents_type::rank(); r++) + product_of_dimensions *= layout.dimension[r]; + if (product_of_dimensions != layout.stride) + Kokkos::abort( + "Invalid conversion from LayoutRight to layout_right_padded"); + } else { + return MappingType{ + extents_type{ + dextents{ + layout.dimension[Idx]...}}, + layout.stride}; + } + } + } +} +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride layout, std::index_sequence) { + static_assert( + std::is_same_v); + using index_type = typename MappingType::index_type; + index_type strides[MappingType::extents_type::rank()] = { + layout.stride[Idx]...}; + return MappingType{ + mdspan_non_standard_tag(), + static_cast( + dextents{ + layout.dimension[Idx]...}), + strides}; +} + +// specialization for rank 0 to avoid empty array +template 
+KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride, std::index_sequence<>) { + return MappingType{}; +} + +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout(ArrayLayout layout) { + return mapping_from_array_layout_impl( + layout, std::make_index_sequence()); +} + template KOKKOS_INLINE_FUNCTION auto mapping_from_view_mapping(const VM &view_mapping) { using mapping_type = typename MDSpanType::mapping_type; diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp index ebdf2c8211..79c137bfdd 100644 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp +++ b/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp @@ -28,7 +28,9 @@ #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #include #include diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp index d13c90825c..3570ed2b6e 100644 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp +++ b/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp @@ -35,6 +35,16 @@ #include #include #include + +namespace Kokkos { +namespace Experimental { +using SYCLDeviceUSMSpace = ::Kokkos::SYCLDeviceUSMSpace; +using SYCLHostUSMSpace = ::Kokkos::SYCLHostUSMSpace; +using SYCLSharedUSMSpace = ::Kokkos::SYCLSharedUSMSpace; +using SYCL = ::Kokkos::SYCL; +} // namespace Experimental +} // namespace Kokkos + #endif #endif diff --git a/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp index 400794f865..399b986041 100644 --- a/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp +++ b/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp @@ -19,7 +19,6 @@ #if defined(KOKKOS_ENABLE_SYCL) namespace Kokkos { -namespace Experimental { class SYCLDeviceUSMSpace; ///< Memory space on SYCL device, not accessible from ///< the host class SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL @@ -27,7 +26,6 @@ class 
SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL class SYCLHostUSMSpace; ///< Memory space accessible from both the SYCL ///< device and the host (host pinned) class SYCL; ///< Execution space for SYCL -} // namespace Experimental } // namespace Kokkos #endif #endif diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index a44ffefa6b..a9db2c4cf4 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -1458,7 +1458,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, void, void> { template struct Tile_Loop_Type<1, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1477,7 +1477,7 @@ struct Tile_Loop_Type<1, IsLeft, IType, Tagged, template struct Tile_Loop_Type<2, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1496,7 +1496,7 @@ struct Tile_Loop_Type<2, IsLeft, IType, Tagged, template struct Tile_Loop_Type<3, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1515,7 +1515,7 @@ struct Tile_Loop_Type<3, IsLeft, IType, Tagged, template struct Tile_Loop_Type<4, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1534,7 +1534,7 @@ struct Tile_Loop_Type<4, IsLeft, IType, Tagged, template struct Tile_Loop_Type<5, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, 
Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1553,7 +1553,7 @@ struct Tile_Loop_Type<5, IsLeft, IType, Tagged, template struct Tile_Loop_Type<6, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1572,7 +1572,7 @@ struct Tile_Loop_Type<6, IsLeft, IType, Tagged, template struct Tile_Loop_Type<7, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1591,7 +1591,7 @@ struct Tile_Loop_Type<7, IsLeft, IType, Tagged, template struct Tile_Loop_Type<8, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1616,7 +1616,7 @@ struct HostIterateTile; // For ParallelFor template struct HostIterateTile::value>> { + std::enable_if_t>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -1635,12 +1635,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2000,30 +1999,28 @@ struct HostIterateTile - std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void_v), void> apply(Args&&... args) const { m_func(args...); } template - std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void_v), void> apply(Args&&... 
args) const { m_func(m_tag, args...); } RP const m_rp; Functor const m_func; - std::conditional_t::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // For ParallelReduce // ValueType - scalar: For reductions template struct HostIterateTile::value && - !std::is_array::value>> { + std::enable_if_t && + !std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2050,12 +2047,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2430,7 +2426,7 @@ struct HostIterateTile::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // For ParallelReduce @@ -2438,8 +2434,8 @@ struct HostIterateTile struct HostIterateTile::value && - std::is_array::value>> { + std::enable_if_t && + std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2463,12 +2459,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? 
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2842,7 +2837,7 @@ struct HostIterateTile::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // ------------------------------------------------------------------ // diff --git a/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp b/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp index e1273ab9e3..e6b2fcbef4 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp @@ -41,13 +41,13 @@ struct EmulateCUDADim3 { template KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t::value> _tag_invoke(Functor const& f, Args&&... args) { - f((Args &&) args...); + f((Args&&)args...); } template KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t::value> _tag_invoke(Functor const& f, Args&&... args) { - f(Tag{}, (Args &&) args...); + f(Tag{}, (Args&&)args...); } template , Args&&... args) { - _tag_invoke(f, vals[Idxs]..., (Args &&) args...); + _tag_invoke(f, vals[Idxs]..., (Args&&)args...); } template @@ -63,7 +63,7 @@ KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array(Functor const& f, T (&vals)[N], Args&&... args) { _tag_invoke_array_helper(f, vals, std::make_index_sequence{}, - (Args &&) args...); + (Args&&)args...); } // ------------------------------------------------------------------ // diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp index d77ec0c753..b483653021 100644 --- a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp @@ -143,7 +143,7 @@ struct AnalyzeExecPolicyUseMatcher, Trait, Traits...> { static constexpr auto trigger_error_message = show_name_of_invalid_execution_policy_trait{}; static_assert( - /* always false: */ std::is_void::value, + /* always false: */ std::is_void_v, "Unknown execution policy trait. 
Search compiler output for " "'show_name_of_invalid_execution_policy_trait' to see the type of the " "invalid trait."); diff --git a/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp b/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp index d8ab77b205..4ea0b8d343 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp @@ -95,12 +95,12 @@ struct non_owning_variable_size_circular_buffer { non_owning_variable_size_circular_buffer( non_owning_variable_size_circular_buffer const&) = delete; non_owning_variable_size_circular_buffer( - non_owning_variable_size_circular_buffer&&) = default; - non_owning_variable_size_circular_buffer& operator =( - non_owning_variable_size_circular_buffer const&) = delete; - non_owning_variable_size_circular_buffer& operator =( non_owning_variable_size_circular_buffer&&) = default; - ~non_owning_variable_size_circular_buffer() = default; + non_owning_variable_size_circular_buffer& operator=( + non_owning_variable_size_circular_buffer const&) = delete; + non_owning_variable_size_circular_buffer& operator=( + non_owning_variable_size_circular_buffer&&) = default; + ~non_owning_variable_size_circular_buffer() = default; KOKKOS_FORCEINLINE_FUNCTION constexpr size_type size() const noexcept { return m_size; } @@ -138,7 +138,7 @@ struct ChaseLevDeque { public: template ::value>> + std::is_default_constructible_v>> ChaseLevDeque() : m_array() {} explicit ChaseLevDeque(CircularBufferT buffer) : m_array(std::move(buffer)) {} @@ -165,7 +165,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; @@ -226,7 +226,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == 
Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; diff --git a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp index 6e3d99ebd6..ee53fd8bc6 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp @@ -27,8 +27,9 @@ // To use OpenCL(TM) built-in intrinsics inside kernels, we have to // forward-declare their prototype, also see // https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md -#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +#if defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) extern SYCL_EXTERNAL unsigned long __attribute__((overloadable)) intel_get_cycle_counter(); #endif @@ -55,8 +56,10 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline uint64_t clock_tic_device() noexcept { // Return value of 64-bit hi-res clock register. return clock64(); -#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +// FIXME_SYCL We can only return something useful for Intel GPUs and with RDC +#elif defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) return intel_get_cycle_counter(); diff --git a/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp index e6dd3c6339..d7319e80c8 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp @@ -93,7 +93,7 @@ struct CombinedReducerValueImpl, std::move(arg_values))... 
{} template - KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { + KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { return this->CombinedReducerValueItemImpl::ref(); } template @@ -181,7 +181,7 @@ struct CombinedReducerImpl, Space, KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( - CombinedReducerImpl&&) = default; + CombinedReducerImpl&&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( @@ -192,8 +192,8 @@ struct CombinedReducerImpl, Space, template KOKKOS_FUNCTION constexpr explicit CombinedReducerImpl( value_type& value, ReducersDeduced&&... reducers) noexcept - : CombinedReducerStorageImpl((ReducersDeduced &&) - reducers)..., + : CombinedReducerStorageImpl( + (ReducersDeduced&&)reducers)..., m_value_view(&value) {} KOKKOS_FUNCTION constexpr void join(value_type& dest, @@ -348,8 +348,8 @@ struct CombinedReductionFunctorWrapperImpl< IndexOrMemberOrTagType1&& arg_first, IndexOrMemberTypesThenValueType&&... args) const { this->template _call_op_impl( - (IndexOrMemberOrTagType1 &&) arg_first, - (IndexOrMemberTypesThenValueType &&) args...); + (IndexOrMemberOrTagType1&&)arg_first, + (IndexOrMemberTypesThenValueType&&)args...); } // end call operator }}}2 @@ -369,19 +369,19 @@ struct CombinedReductionFunctorWrapperImpl< template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_same, value_type>::value> + !std::is_same_v, value_type>> _call_op_impl(IdxOrMemberTypes&&... idxs, IdxOrMemberType1&& idx, IdxOrMemberTypesThenValueType&&... 
args) const { this->template _call_op_impl( - (IdxOrMemberTypes &&) idxs..., (IdxOrMemberType1 &&) idx, - (IdxOrMemberTypesThenValueType &&) args...); + (IdxOrMemberTypes&&)idxs..., (IdxOrMemberType1&&)idx, + (IdxOrMemberTypesThenValueType&&)args...); } // base case template KOKKOS_FORCEINLINE_FUNCTION void _call_op_impl(IdxOrMemberTypes&&... idxs, value_type& out) const { - m_functor((IdxOrMemberTypes &&) idxs..., + m_functor((IdxOrMemberTypes&&)idxs..., out.template get()...); } }; @@ -464,8 +464,8 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer_value( typename _reducer_from_arg_t::value_type...>{ // This helper function is now poorly named after refactoring. - _get_value_from_combined_reducer_ctor_arg((ReferencesOrViewsOrReducers &&) - args)...}; + _get_value_from_combined_reducer_ctor_arg( + (ReferencesOrViewsOrReducers&&)args)...}; //---------------------------------------- } @@ -480,7 +480,7 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer( Space, _reducer_from_arg_t...>; return reducer_type(value, _reducer_from_arg_t{ - (ReferencesOrViewsOrReducers &&) args}...); + (ReferencesOrViewsOrReducers&&)args}...); //---------------------------------------- } diff --git a/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp index ca4edce5c3..9bde2f72a3 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp @@ -110,15 +110,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). 
- const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return state_error ? type(-2, -2) : type(-1, -1); } @@ -132,7 +132,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -194,15 +195,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). - const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return state_error ? 
type(-2, -2) : type(-1, -1); } @@ -216,7 +217,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -262,8 +264,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_and(buffer + (bit >> bits_per_int_lg2) + 1, ~mask); + const uint32_t prev = Kokkos::atomic_fetch_and( + const_cast(buffer) + (bit >> bits_per_int_lg2) + 1, ~mask); if (!(prev & mask)) { return -1; @@ -273,7 +275,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); // Flush the store-release Kokkos::memory_fence(); @@ -299,8 +301,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_or(buffer + (bit >> bits_per_int_lg2) + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + (bit >> bits_per_int_lg2) + 1, mask); if (!(prev & mask)) { return -1; @@ -310,7 +312,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return (count & state_used_mask) - 1; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index 532709aa98..72f33ffaab 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -138,7 +138,7 @@ int get_device_count() { KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); return count; #elif defined(KOKKOS_ENABLE_SYCL) - return 
Kokkos::Experimental::Impl::get_sycl_devices().size(); + return Kokkos::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); @@ -183,7 +183,7 @@ std::vector const& Kokkos::Impl::get_visible_devices() { #elif defined(KOKKOS_ENABLE_OPENMPTARGET) int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - int device = Experimental::Impl::SYCLInternal::m_syclDev; + int device = Impl::SYCLInternal::m_syclDev; #else int device = -1; return device; @@ -271,7 +271,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: local rank " << local_rank << " is outside the bounds of resource groups provided by CTest. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the resource types allocated to this resource group @@ -284,7 +284,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_name << " is not specified. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Look for the device type specified in CTEST_KOKKOS_DEVICE_TYPE @@ -308,7 +308,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: device type '" << ctest_kokkos_device_type << "' not included in " << ctest_resource_group_name << ". Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the device ID @@ -324,7 +324,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_id_name << " is not specified. 
Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } auto const* comma = std::strchr(resource_str, ','); @@ -332,7 +332,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: invalid value of " << ctest_resource_group_id_name << ": '" << resource_str << "'. Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } std::string id(resource_str + 3, comma - resource_str - 3); @@ -613,7 +613,7 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #endif declare_configuration_metadata("architecture", "Default Device", - typeid(Kokkos::DefaultExecutionSpace).name()); + Kokkos::DefaultExecutionSpace::name()); #if defined(KOKKOS_ARCH_A64FX) declare_configuration_metadata("architecture", "CPU architecture", "A64FX"); @@ -666,6 +666,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_RISCV_SG2042) declare_configuration_metadata("architecture", "CPU architecture", "SG2042 (RISC-V)") +#elif defined(KOKKOS_ARCH_RISCV_RVA22V) + declare_configuration_metadata("architecture", "CPU architecture", + "RVA22V (RISC-V)") #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -738,8 +741,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "GPU architecture", "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) - declare_configuration_metadata("architecture", "GPU architecture", - "AMD_GFX906"); + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX906"); #elif defined(KOKKOS_ARCH_AMD_GFX908) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX908"); @@ -976,7 +979,7 @@ void Kokkos::Impl::parse_environment_variables( Tools::Impl::parse_environment_variables(tools_init_arguments); if (init_result.result 
== Tools::Impl::InitializationStatus::environment_argument_mismatch) { - Impl::throw_runtime_exception(init_result.error_message); + Kokkos::abort(init_result.error_message.c_str()); } combine(settings, tools_init_arguments); diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp index c71c21d2ac..cd00fdadeb 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp @@ -36,15 +36,22 @@ struct GraphNodeKernelDefaultImpl { // TODO @graphs decide if this should use vtable or intrusive erasure via // function pointers like in the rest of the graph interface virtual void execute_kernel() = 0; + + GraphNodeKernelDefaultImpl() = default; + + explicit GraphNodeKernelDefaultImpl(ExecutionSpace exec) + : m_execution_space(std::move(exec)) {} + + ExecutionSpace m_execution_space; }; // TODO Indicate that this kernel specialization is only for the Host somehow? template class GraphNodeKernelImpl - : public PatternImplSpecializationFromTag::type, - public GraphNodeKernelDefaultImpl { + : public GraphNodeKernelDefaultImpl, + public PatternImplSpecializationFromTag::type { public: using base_t = typename PatternImplSpecializationFromTag - GraphNodeKernelImpl(std::string const&, ExecutionSpace const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...), - execute_kernel_vtable_base_t() {} + GraphNodeKernelImpl(std::string const &, ExecutionSpace const &, + Functor arg_functor, PolicyDeduced &&arg_policy, + ArgsDeduced &&...args) + : execute_kernel_vtable_base_t(arg_policy.space()), + base_t(std::move(arg_functor), (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) 
{} // FIXME @graph Forward through the instance once that works in the backends template - GraphNodeKernelImpl(ExecutionSpace const& ex, Functor arg_functor, - PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + GraphNodeKernelImpl(ExecutionSpace const &ex, Functor arg_functor, + PolicyDeduced &&arg_policy, ArgsDeduced &&...args) : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) { + // FIXME This constructor seem unused. + } - void execute_kernel() final { this->base_t::execute(); } + void execute_kernel() override final { this->base_t::execute(); } }; // end GraphNodeKernelImpl }}}1 @@ -88,7 +97,7 @@ struct GraphNodeAggregateKernelDefaultImpl using is_graph_kernel = std::true_type; }; using graph_kernel = GraphNodeAggregateKernelDefaultImpl; - void execute_kernel() final {} + void execute_kernel() override final {} }; } // end namespace Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp index 223ae391ab..31d147ea89 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp @@ -69,10 +69,10 @@ struct GraphNodeBackendSpecificDetails { GraphNodeBackendSpecificDetails(GraphNodeBackendSpecificDetails&&) noexcept = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails const&) = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails&&) noexcept = delete; ~GraphNodeBackendSpecificDetails() = default; @@ -92,6 +92,18 @@ struct GraphNodeBackendSpecificDetails { m_is_aggregate = true; } + // A node is awaitable if it can execute a kernel. 
+ // A root node or an aggregate node cannot be waited for, because it does + // not launch anything. + bool awaitable() const { return (!m_is_root) && (!m_is_aggregate); } + + // Retrieve the execution space instance that has been passed to + // the kernel at construction phase. + const ExecutionSpace& get_execution_space() const { + KOKKOS_EXPECTS(m_kernel_ptr != nullptr) + return m_kernel_ptr->m_execution_space; + } + void set_predecessor( std::shared_ptr> arg_pred_impl) { @@ -104,7 +116,7 @@ struct GraphNodeBackendSpecificDetails { m_predecessors.push_back(std::move(arg_pred_impl)); } - void execute_node() { + void execute_node(const ExecutionSpace& exec) { // This node could have already been executed as the predecessor of some // other KOKKOS_EXPECTS(bool(m_kernel_ptr) || m_has_executed) @@ -115,8 +127,18 @@ struct GraphNodeBackendSpecificDetails { // supported semantics, but instinct I have feels like it should be... m_has_executed = true; for (auto const& predecessor : m_predecessors) { - predecessor->execute_node(); + predecessor->execute_node(exec); } + + // Before executing the kernel, be sure to fence the execution space + // instance of predecessors. 
+ for (const auto& predecessor : m_predecessors) { + if (predecessor->awaitable() && + predecessor->get_execution_space() != this->get_execution_space()) + predecessor->get_execution_space().fence( + "Kokkos::DefaultGraphNode::execute_node: sync with predecessors"); + } + m_kernel_ptr->execute_kernel(); } KOKKOS_ENSURES(m_has_executed) diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp index 05d4854919..8dfa19a178 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -58,12 +58,12 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { // Not movable or copyable; it spends its whole live as a shared_ptr in the // Graph object - GraphImpl() = default; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = default; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; - ~GraphImpl() = default; + GraphImpl& operator=(GraphImpl&&) = delete; + ~GraphImpl() = default; explicit GraphImpl(ExecutionSpace arg_space) : execution_space_instance_storage_base_t(std::move(arg_space)) {} @@ -136,17 +136,40 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { return rv; } - void submit() { + void instantiate() { + KOKKOS_EXPECTS(!m_has_been_instantiated); + m_has_been_instantiated = true; + } + + void submit(const ExecutionSpace& exec) { + if (!m_has_been_instantiated) instantiate(); // This reset is gross, but for the purposes of our simple host // implementation... for (auto& sink : m_sinks) { sink->reset_has_executed(); } + + // We don't know where the nodes will execute, so we need to fence the given + // execution space instance before proceeding. This is the simplest way + // of guaranteeing that the kernels in the graph are correctly "enqueued". 
+ exec.fence( + "Kokkos::DefaultGraph::submit: fencing before launching graph nodes"); + for (auto& sink : m_sinks) { - sink->execute_node(); + sink->execute_node(exec); + } + + // Once all sinks have been executed, we need to fence them. + for (const auto& sink : m_sinks) { + if (sink->awaitable() && sink->get_execution_space() != exec) + sink->get_execution_space().fence( + "Kokkos::DefaultGraph::submit: fencing before ending graph submit"); } } + private: + bool m_has_been_instantiated = false; + // end required customizations }}}2 //---------------------------------------------------------------------------- }; diff --git a/lib/kokkos/core/src/impl/Kokkos_EBO.hpp b/lib/kokkos/core/src/impl/Kokkos_EBO.hpp index 8ba94ba4cc..a8a4d6617b 100644 --- a/lib/kokkos/core/src/impl/Kokkos_EBO.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_EBO.hpp @@ -52,16 +52,16 @@ struct EBOBaseImpl; template class CtorNotOnDevice> struct EBOBaseImpl { template ::value && - std::is_constructible::value && + std::enable_if_t && + std::is_constructible_v && !CtorNotOnDevice::value, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&...) noexcept {} template ::value && - std::is_constructible::value && + std::enable_if_t && + std::is_constructible_v && CtorNotOnDevice::value, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {} @@ -110,18 +110,18 @@ struct EBOBaseImpl { T m_ebo_object; template ::value && + std::enable_if_t && !CTorsNotOnDevice::value && - std::is_constructible::value, + std::is_constructible_v, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&... args) noexcept(noexcept(T(std::forward(args)...))) : m_ebo_object(std::forward(args)...) {} template ::value && + std::enable_if_t && CTorsNotOnDevice::value && - std::is_constructible::value, + std::is_constructible_v, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&... 
args) noexcept( noexcept(T(std::forward(args)...))) @@ -167,9 +167,9 @@ struct EBOBaseImpl { template class CtorsNotOnDevice = NoCtorsNotOnDevice> struct StandardLayoutNoUniqueAddressMemberEmulation - : EBOBaseImpl::value, CtorsNotOnDevice> { + : EBOBaseImpl, CtorsNotOnDevice> { private: - using ebo_base_t = EBOBaseImpl::value, CtorsNotOnDevice>; + using ebo_base_t = EBOBaseImpl, CtorsNotOnDevice>; public: using ebo_base_t::ebo_base_t; diff --git a/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp index 04c5e0bd22..58a5de2aa6 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp @@ -41,7 +41,7 @@ void team_policy_check_valid_storage_level_argument(int level) { std::stringstream ss; ss << "TeamPolicy::set_scratch_size(/*level*/ " << level << ", ...) storage level argument must be 0 or 1 to be valid\n"; - Impl::throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } } diff --git a/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp b/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp index 58ed54275a..5805b78ee7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp @@ -123,14 +123,14 @@ template struct ExecSpaceDerived : ExecSpaceBase { static_assert(check_valid_execution_space()); static_assert(check_is_regular()); - void initialize(InitializationSettings const& settings) final { + void initialize(InitializationSettings const& settings) override final { ExecutionSpace::impl_initialize(settings); } - void finalize() final { ExecutionSpace::impl_finalize(); } - void static_fence(std::string const& label) final { + void finalize() override final { ExecutionSpace::impl_finalize(); } + void static_fence(std::string const& label) override final { ExecutionSpace::impl_static_fence(label); } - void print_configuration(std::ostream& os, bool verbose) final { + void 
print_configuration(std::ostream& os, bool verbose) override final { ExecutionSpace().print_configuration(os, verbose); } }; diff --git a/lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp b/lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp deleted file mode 100644 index 4726a87b97..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp +++ /dev/null @@ -1,279 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP -#define KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP - -#include -#include - -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class FixedBlockSizeMemoryPool - : private MemorySpaceInstanceStorage { - public: - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - - private: - using memory_space_storage_base = - MemorySpaceInstanceStorage; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord; - - struct alignas(Align) Block { - union { - char ignore; - char data[Size]; - }; - }; - - static constexpr auto actual_size = sizeof(Block); - - // TODO shared allocation tracker - // TODO @optimization put the index values on different cache lines (CPU) or - // pages (GPU)? 
- - tracker_type m_tracker = {}; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - Kokkos::OwningRawPtr m_first_block = nullptr; - Kokkos::OwningRawPtr m_free_indices = nullptr; - - enum : size_type { IndexInUse = ~size_type(0) }; - - public: - FixedBlockSizeMemoryPool(memory_space const& mem_space, size_type num_blocks) - : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) { - // TODO alignment? - auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block)); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = - record_type::allocate(mem_space, "Kokkos::FixedBlockSizeMemPool_blocks", - num_blocks * sizeof(size_type)); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for (size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool(memory_space const& mem_space, - size_t mempool_capacity, unsigned, unsigned, - unsigned) - : FixedBlockSizeMemoryPool( - mem_space, mempool_capacity / - actual_size) { /* forwarding ctor, must be empty */ - } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - 
FixedBlockSizeMemoryPool const&) = default; - - KOKKOS_INLINE_FUNCTION - void* allocate(size_type alloc_size) const noexcept { - (void)alloc_size; - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - free_idx = Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], - current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if (free_idx == IndexInUse) { - return nullptr; - } else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type /*alloc_size*/) const noexcept { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && - offset / actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } -}; - -#if 0 -template < - class DeviceType, - size_t Size, - size_t Align=1, - class SizeType = typename DeviceType::execution_space::size_type -> -class FixedBlockSizeChaseLevMemoryPool - : private MemorySpaceInstanceStorage -{ -public: - - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - -private: - - using memory_space_storage_base = MemorySpaceInstanceStorage; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord; - - struct alignas(Align) Block { union { char ignore; char data[Size]; }; }; - - 
static constexpr auto actual_size = sizeof(Block); - - tracker_type m_tracker = { }; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - - - enum : size_type { IndexInUse = ~size_type(0) }; - -public: - - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_type num_blocks - ) : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) - { - // TODO alignment? - auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block) - ); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(size_type) - ); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for(size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_t mempool_capacity, - unsigned, unsigned, unsigned - ) : FixedBlockSizeMemoryPool(mem_space, mempool_capacity / actual_size) - { /* forwarding ctor, must be empty */ } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool const&) = default; - - - KOKKOS_INLINE_FUNCTION 
- void* allocate(size_type alloc_size) const noexcept - { - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add((volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - free_idx = - Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if(free_idx == IndexInUse) { - return nullptr; - } - else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type alloc_size) const noexcept - { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && offset/actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add((volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } - -}; -#endif - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp index e844a5295e..29a365e6e4 100644 --- a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp @@ -118,8 +118,8 @@ struct FunctorAnalysis { using functor_has_space = has_execution_space; static_assert(!policy_has_space::value || !functor_has_space::value || - std::is_same::value, + std::is_same_v, "Execution Policy and Functor execution space must match"); //---------------------------------------- @@ -136,9 
+136,8 @@ struct FunctorAnalysis { typename std::is_void::type> { using type = typename F::value_type; - static_assert(!std::is_reference::value && - std::rank::value <= 1 && - std::extent::value == 0, + static_assert(!std::is_reference_v && std::rank_v <= 1 && + std::extent_v == 0, "Kokkos Functor::value_type is T or T[]"); }; @@ -149,7 +148,7 @@ struct FunctorAnalysis { template ::type, - bool T = std::is_void::value> + bool T = std::is_void_v> struct deduce_value_type { using type = V; }; @@ -290,8 +289,8 @@ struct FunctorAnalysis { using candidate_type = typename deduce_value_type::type; enum { - candidate_is_void = std::is_void::value, - candidate_is_array = std::rank::value == 1 + candidate_is_void = std::is_void_v, + candidate_is_array = std::rank_v == 1 }; //---------------------------------------- @@ -306,7 +305,7 @@ struct FunctorAnalysis { using value_type = std::remove_extent_t; - static_assert(!std::is_const::value, + static_assert(!std::is_const_v, "Kokkos functor operator reduce argument cannot be const"); private: @@ -614,21 +613,20 @@ struct FunctorAnalysis { }; template - struct DeduceJoinNoTag::value || - (!is_reducer::value && - std::is_void::value)) && - detected_join_no_tag::value>> + struct DeduceJoinNoTag< + F, std::enable_if_t<(is_reducer::value || + (!is_reducer::value && std::is_void_v)) && + detected_join_no_tag::value>> : public has_join_no_tag_function { enum : bool { value = true }; }; template struct DeduceJoinNoTag< - F, - std::enable_if_t<(is_reducer::value || - (!is_reducer::value && std::is_void::value)) && - (!detected_join_no_tag::value && - detected_volatile_join_no_tag::value)>> + F, std::enable_if_t<(is_reducer::value || + (!is_reducer::value && std::is_void_v)) && + (!detected_join_no_tag::value && + detected_volatile_join_no_tag::value)>> : public has_volatile_join_no_tag_function { enum : bool { value = true }; static_assert(Impl::dependent_false_v, @@ -735,8 +733,8 @@ struct FunctorAnalysis { template struct 
DeduceInitNoTag< - F, std::enable_if_t::value || (!is_reducer::value && - std::is_void::value), + F, std::enable_if_t::value || + (!is_reducer::value && std::is_void_v), decltype(has_init_no_tag_function::enable_if( &F::init))>> : public has_init_no_tag_function { @@ -835,8 +833,8 @@ struct FunctorAnalysis { template struct DeduceFinalNoTag< - F, std::enable_if_t::value || (!is_reducer::value && - std::is_void::value), + F, std::enable_if_t::value || + (!is_reducer::value && std::is_void_v), decltype(has_final_no_tag_function::enable_if( &F::final))>> : public has_final_no_tag_function { @@ -906,14 +904,14 @@ struct FunctorAnalysis { Functor m_functor; template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() + const noexcept { return m_functor.value_count; } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() + const noexcept { return candidate_is_void ? 
0 : 1; } @@ -973,8 +971,8 @@ struct FunctorAnalysis { DeduceJoin<>::join(&m_functor, dst, src); } - KOKKOS_INLINE_FUNCTION reference_type init(ValueType* const dst) const - noexcept { + KOKKOS_INLINE_FUNCTION reference_type + init(ValueType* const dst) const noexcept { DeduceInit<>::init(&m_functor, dst); return reference(dst); } @@ -987,11 +985,11 @@ struct FunctorAnalysis { KOKKOS_INLINE_FUNCTION const Functor& get_functor() const { return m_functor; } - Reducer(Reducer const&) = default; - Reducer(Reducer&&) = default; + Reducer(Reducer const&) = default; + Reducer(Reducer&&) = default; Reducer& operator=(Reducer const&) = delete; - Reducer& operator=(Reducer&&) = delete; - ~Reducer() = default; + Reducer& operator=(Reducer&&) = delete; + ~Reducer() = default; KOKKOS_INLINE_FUNCTION explicit constexpr Reducer( Functor const& arg_functor) noexcept diff --git a/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp b/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp index 56f95c814d..6d3ebf64be 100644 --- a/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp @@ -56,7 +56,7 @@ struct GraphAccess { static_assert( Kokkos::Impl::is_specialization_of::value, "Kokkos Internal Error in graph interface"); - return std::make_shared((Args &&) args...); + return std::make_shared((Args&&)args...); } template ::value, "Kokkos Internal Implementation error (bad argument to " "`GraphAccess::get_node_ptr()`)"); - return ((NodeRef &&) node_ref).get_node_ptr(); + return ((NodeRef&&)node_ref).get_node_ptr(); } template @@ -93,7 +93,7 @@ struct GraphAccess { Kokkos::Experimental::GraphNodeRef>::value, "Kokkos Internal Implementation error (bad argument to " "`GraphAccess::get_graph_weak_ptr()`)"); - return ((NodeRef &&) node_ref).get_graph_weak_ptr(); + return ((NodeRef&&)node_ref).get_graph_weak_ptr(); } // end accessors for private members of public interface }}}2 diff --git a/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp 
b/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp index 2ab05cb8e4..b02a265472 100644 --- a/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp @@ -54,9 +54,9 @@ template