diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 0000000000..827306c9aa --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,47 @@ +# GitHub action to run static code analysis on C++ and Python code +name: "CodeQL Code Analysis" + +on: + push: + branches: [master] + +jobs: + analyze: + name: Analyze + if: ${{ github.repository == 'lammps/lammps' }} + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + language: ['cpp', 'python'] + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + fetch-depth: 2 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Initialize CodeQL + uses: github/codeql-action/init@v1 + with: + languages: ${{ matrix.language }} + + - name: Create Build Environment + run: cmake -E make_directory ${{github.workspace}}/build + + - name: Building LAMMPS via CMake + if: ${{ matrix.language == 'cpp' }} + shell: bash + working-directory: ${{github.workspace}}/build + run: | + cmake -C $GITHUB_WORKSPACE/cmake/presets/most.cmake $GITHUB_WORKSPACE/cmake + cmake --build . 
--parallel 2 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v1 diff --git a/.github/workflows/unittest-macos.yml b/.github/workflows/unittest-macos.yml new file mode 100644 index 0000000000..a65db7636b --- /dev/null +++ b/.github/workflows/unittest-macos.yml @@ -0,0 +1,34 @@ +# GitHub action to build LAMMPS on MacOS and run unit tests +name: "Unittest for MacOS" + +on: + push: + branches: [master] + +jobs: + build: + name: MacOS Unit Test + if: ${{ github.repository == 'lammps/lammps' }} + runs-on: macos-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + fetch-depth: 2 + + - name: Create Build Environment + run: cmake -E make_directory ${{github.workspace}}/build + + - name: Building LAMMPS via CMake + shell: bash + working-directory: ${{github.workspace}}/build + run: | + cmake -C $GITHUB_WORKSPACE/cmake/presets/most.cmake $GITHUB_WORKSPACE/cmake \ + -DENABLE_TESTING=ON -DBUILD_SHARED_LIBS=ON -DLAMMPS_EXCEPTIONS=ON + cmake --build . 
--parallel 2 + + - name: Run Tests + working-directory: ${{github.workspace}}/build + shell: bash + run: ctest -V diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index c29fba5957..a4736740cf 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -107,13 +107,15 @@ option(CMAKE_VERBOSE_MAKEFILE "Generate verbose Makefiles" OFF) set(STANDARD_PACKAGES ASPHERE BODY CLASS2 COLLOID COMPRESS DIPOLE GRANULAR KSPACE LATTE MANYBODY MC MESSAGE MISC MLIAP MOLECULE PERI POEMS QEQ REPLICA RIGID SHOCK SPIN SNAP SRD KIM PYTHON MSCG MPIIO VORONOI - USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-MESODPD USER-CGSDK USER-COLVARS - USER-DIFFRACTION USER-DPD USER-DRUDE USER-EFF USER-FEP USER-H5MD USER-LB - USER-MANIFOLD USER-MEAMC USER-MESONT USER-MGPT USER-MISC USER-MOFFF USER-MOLFILE - USER-NETCDF USER-PHONON USER-PLUMED USER-PTM USER-QTB USER-REACTION - USER-REAXC USER-SCAFACOS USER-SDPD USER-SMD USER-SMTBQ USER-SPH USER-TALLY - USER-UEF USER-VTK USER-QUIP USER-QMMM USER-YAFF USER-ADIOS) -set(SUFFIX_PACKAGES CORESHELL USER-OMP KOKKOS OPT USER-INTEL GPU) + USER-ADIOS USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-MESODPD USER-CGSDK + USER-COLVARS USER-DIFFRACTION USER-DPD USER-DRUDE USER-EFF USER-FEP USER-H5MD + USER-LB USER-MANIFOLD USER-MEAMC USER-MESONT USER-MGPT USER-MISC USER-MOFFF + USER-MOLFILE USER-NETCDF USER-PHONON USER-PLUMED USER-PTM USER-QTB + USER-REACTION USER-REAXC USER-SCAFACOS USER-SDPD USER-SMD USER-SMTBQ USER-SPH + USER-TALLY USER-UEF USER-VTK USER-QUIP USER-QMMM USER-YAFF) + +set(SUFFIX_PACKAGES CORESHELL GPU KOKKOS OPT USER-INTEL USER-OMP) + foreach(PKG ${STANDARD_PACKAGES} ${SUFFIX_PACKAGES}) option(PKG_${PKG} "Build ${PKG} Package" OFF) endforeach() diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 19cc401c3d..63895a5a9c 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -1,4 +1,7 @@ ######################################################################## +# As 
of version 3.3.0 Kokkos requires C++14 +set(CMAKE_CXX_STANDARD 14) +######################################################################## # consistency checks and Kokkos options/settings required by LAMMPS if(Kokkos_ENABLE_CUDA) message(STATUS "KOKKOS: Enabling CUDA LAMBDA function support") @@ -35,8 +38,8 @@ if(DOWNLOAD_KOKKOS) list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") include(ExternalProject) ExternalProject_Add(kokkos_build - URL https://github.com/kokkos/kokkos/archive/3.2.01.tar.gz - URL_MD5 ba72440e285ccde05b403694ea0c92e5 + URL https://github.com/kokkos/kokkos/archive/3.3.01.tar.gz + URL_MD5 08201d1c7cf5bc458ce0f5b44a629d5a CMAKE_ARGS ${KOKKOS_LIB_BUILD_ARGS} BUILD_BYPRODUCTS /lib/libkokkoscore.a ) @@ -50,7 +53,7 @@ if(DOWNLOAD_KOKKOS) target_link_libraries(lammps PRIVATE LAMMPS::KOKKOS) add_dependencies(LAMMPS::KOKKOS kokkos_build) elseif(EXTERNAL_KOKKOS) - find_package(Kokkos 3.2.01 REQUIRED CONFIG) + find_package(Kokkos 3.3.01 REQUIRED CONFIG) target_link_libraries(lammps PRIVATE Kokkos::kokkos) else() set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos) diff --git a/doc/Makefile b/doc/Makefile index 041c7a372a..6032aff45f 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -94,7 +94,7 @@ $(SPHINXCONFIG)/conf.py: $(SPHINXCONFIG)/conf.py.in -e 's,@LAMMPS_PYTHON_DIR@,$(BUILDDIR)/../python,g' \ -e 's,@LAMMPS_DOC_DIR@,$(BUILDDIR),g' $< > $@ -html: xmlgen $(SPHINXCONFIG)/conf.py $(ANCHORCHECK) $(MATHJAX) +html: xmlgen $(VENV) $(SPHINXCONFIG)/conf.py $(ANCHORCHECK) $(MATHJAX) @if [ "$(HAS_BASH)" == "NO" ] ; then echo "bash was not found at $(OSHELL)! Please use: $(MAKE) SHELL=/path/to/bash" 1>&2; exit 1; fi @$(MAKE) $(MFLAGS) -C graphviz all @(\ @@ -118,7 +118,7 @@ html: xmlgen $(SPHINXCONFIG)/conf.py $(ANCHORCHECK) $(MATHJAX) @rm -rf html/PDF/.[sg]* @echo "Build finished. The HTML pages are in doc/html." 
-spelling: xmlgen $(VENV) $(SPHINXCONFIG)/false_positives.txt +spelling: xmlgen $(SPHINXCONFIG)/conf.py $(VENV) $(SPHINXCONFIG)/false_positives.txt @if [ "$(HAS_BASH)" == "NO" ] ; then echo "bash was not found at $(OSHELL)! Please use: $(MAKE) SHELL=/path/to/bash" 1>&2; exit 1; fi @(\ . $(VENV)/bin/activate ; env PYTHONWARNINGS= \ diff --git a/doc/doxygen/Doxyfile.in b/doc/doxygen/Doxyfile.in index f8a5bc6cdb..49a271355f 100644 --- a/doc/doxygen/Doxyfile.in +++ b/doc/doxygen/Doxyfile.in @@ -424,6 +424,8 @@ INPUT = @LAMMPS_SOURCE_DIR@/utils.cpp \ @LAMMPS_SOURCE_DIR@/input.h \ @LAMMPS_SOURCE_DIR@/tokenizer.cpp \ @LAMMPS_SOURCE_DIR@/tokenizer.h \ + @LAMMPS_SOURCE_DIR@/arg_info.cpp \ + @LAMMPS_SOURCE_DIR@/arg_info.h \ @LAMMPS_SOURCE_DIR@/text_file_reader.cpp \ @LAMMPS_SOURCE_DIR@/text_file_reader.h \ @LAMMPS_SOURCE_DIR@/potential_file_reader.cpp \ diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index f82c6c8d04..0c048c53ff 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -521,11 +521,14 @@ They must be specified in uppercase. * - VEGA906 - GPU - AMD GPU MI50/MI60 GFX906 + * - VEGA908 + - GPU + - AMD GPU GFX908 * - INTEL_GEN - GPU - Intel GPUs Gen9+ -This list was last updated for version 3.2 of the Kokkos library. +This list was last updated for version 3.3 of the Kokkos library. .. tabs:: diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index 2ca0e88729..f5b1ef9b38 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -263,6 +263,7 @@ OPT. * :doc:`ufm (got) ` * :doc:`vashishta (gko) ` * :doc:`vashishta/table (o) ` + * :doc:`wf/cut ` * :doc:`yukawa (gko) ` * :doc:`yukawa/colloid (go) ` * :doc:`zbl (gko) ` diff --git a/doc/src/Commands_parse.rst b/doc/src/Commands_parse.rst index 37283823d7..64d5100715 100644 --- a/doc/src/Commands_parse.rst +++ b/doc/src/Commands_parse.rst @@ -162,3 +162,26 @@ LAMMPS: triple quotes can be nested in the usual manner. 
See the doc pages for those commands for examples. Only one of level of nesting is allowed, but that should be sufficient for most use cases. + +.. admonition:: ASCII versus UTF-8 + :class: note + + LAMMPS expects and processes 7-bit ASCII format text internally. + Many modern environments use UTF-8 encoding, which is a superset + of the 7-bit ASCII character table and thus mostly compatible. + However, there are several non-ASCII characters that can look + very similar to their ASCII equivalents or are invisible (so they + look like a blank), but are encoded differently. Web browsers, + PDF viewers, document editors are known to sometimes replace one + with the other for a better looking output. However, that can + lead to problems, for instance, when using cut-n-paste of input + file examples from web pages, or when using a document editor + (not a dedicated plain text editor) for writing LAMMPS inputs. + LAMMPS will try to detect this and substitute the non-ASCII + characters with their ASCII equivalents where known. There also + is going to be a warning printed, if this occurs. It is + recommended to avoid such characters altogether in LAMMPS input, + data and potential files. The replacement tables are likely + incomplete and dependent on users reporting problems processing + correctly looking input containing UTF-8 encoded non-ASCII + characters. diff --git a/doc/src/Developer_org.rst b/doc/src/Developer_org.rst index c234cd11cc..6ecccf084d 100644 --- a/doc/src/Developer_org.rst +++ b/doc/src/Developer_org.rst @@ -1,68 +1,75 @@ Source files ------------ -The source files of the LAMMPS code are found in two -directories of the distribution: ``src`` and ``lib``. -Most of the code is C++ but there are small numbers of files -in several other languages. +The source files of the LAMMPS code are found in two directories of the +distribution: ``src`` and ``lib``. 
Most of the code is written in C++ +but there are a small number of files in several other languages like C, +Fortran, Shell script, or Python. -The core of the code is located in the -``src`` folder and its sub-directories. -A sizable number of these files are in the ``src`` directory -itself, but there are plenty of :doc:`packages `, which can be -included or excluded when LAMMPS is built. See the :doc:`Include -packages in build ` section of the manual for more -information about that part of the build process. LAMMPS currently -supports building with :doc:`conventional makefiles ` and -through :doc:`CMake ` which differ in how packages are -enabled or disabled for a LAMMPS binary. The source files for each +The core of the code is located in the ``src`` folder and its +sub-directories. A sizable number of these files are in the ``src`` +directory itself, but there are plenty of :doc:`packages `, +which can be included or excluded when LAMMPS is built. See the +:doc:`Include packages in build ` section of the manual +for more information about that part of the build process. LAMMPS +currently supports building with :doc:`conventional makefiles +` and through :doc:`CMake `. Those procedures +differ in how packages are enabled or disabled for inclusion into a +LAMMPS binary so they cannot be mixed. The source files for each package are in all-uppercase sub-directories of the ``src`` folder, for example ``src/MOLECULE`` or ``src/USER-MISC``. The ``src/STUBS`` sub-directory is not a package but contains a dummy MPI library, that is used when building a serial version of the code. The ``src/MAKE`` -directory contains makefiles with settings and flags for a variety of -configuration and machines for the build process with traditional -makefiles. +directory and its sub-directories contain makefiles with settings and +flags for a variety of configurations and machines for the build process +with traditional makefiles. 
The ``lib`` directory contains the source code for several supporting libraries or files with configuration settings to use globally installed -libraries, that are required by some of the optional packages. -Each sub-directory, like ``lib/poems`` or ``lib/gpu``, contains the -source files, some of which are in different languages such as Fortran -or CUDA. These libraries are linked to during a LAMMPS build, if the -corresponding package is installed. +libraries, that are required by some of the optional packages. They may +include python scripts that can transparently download additional source +code on request. Each sub-directory, like ``lib/poems`` or ``lib/gpu``, +contains the source files, some of which are in different languages such +as Fortran or CUDA. These libraries are included in the LAMMPS build, +if the corresponding package is installed. LAMMPS C++ source files almost always come in pairs, such as ``src/run.cpp`` (implementation file) and ``src/run.h`` (header file). -Each pair of files defines a C++ -class, for example the :cpp:class:`LAMMPS_NS::Run` class which contains -the code invoked by the :doc:`run ` command in a LAMMPS input script. -As this example illustrates, source file and class names often have a -one-to-one correspondence with a command used in a LAMMPS input script. -Some source files and classes do not have a corresponding input script +Each pair of files defines a C++ class, for example the +:cpp:class:`LAMMPS_NS::Run` class which contains the code invoked by the +:doc:`run ` command in a LAMMPS input script. As this example +illustrates, source file and class names often have a one-to-one +correspondence with a command used in a LAMMPS input script. Some +source files and classes do not have a corresponding input script command, e.g. ``src/force.cpp`` and the :cpp:class:`LAMMPS_NS::Force` class. They are discussed in the next section. -A small number of C++ classes and utility functions are implemented with -only a ``.h`` file. 
Examples are the Pointer class or the MathVec functions. +The names of all source files are in lower case and may use the +underscore character '_' to separate words. Outside of bundled libraries +which may have different conventions, all C and C++ header files have a +``.h`` extension, all C++ files have a ``.cpp`` extension, and C files a +``.c`` extension. A small number of C++ classes and utility functions +are implemented with only a ``.h`` file. Examples are the Pointer class +or the MathVec functions. Class topology -------------- Though LAMMPS has a lot of source files and classes, its class topology -is relative flat, as outlined in the :ref:`class-topology` figure. Each -name refers to a class and has a pair of associated source files in the -``src`` folder, for example the class :cpp:class:`LAMMPS_NS::Memory` -corresponds to the files ``memory.cpp`` and ``memory.h``, or the class -:cpp:class:`LAMMPS_NS::AtomVec` corresponds to the files -``atom_vec.cpp`` and ``atom_vec.h``. Full lines in the figure represent -compositing: that is the class to the left holds a pointer to an -instance of the class to the right. Dashed lines instead represent -inheritance: the class to the right is derived from the class on the -left. Classes with a red boundary are not instantiated directly, but -they represent the base classes for "styles". Those "styles" make up -the bulk of the LAMMPS code and only a few typical examples are included -in the figure for demonstration purposes. +is not very deep, which can be seen from the :ref:`class-topology` +figure. In that figure, each name refers to a class and has a pair of +associated source files in the ``src`` folder, for example the class +:cpp:class:`LAMMPS_NS::Memory` corresponds to the files ``memory.cpp`` +and ``memory.h``, or the class :cpp:class:`LAMMPS_NS::AtomVec` +corresponds to the files ``atom_vec.cpp`` and ``atom_vec.h``. 
Full +lines in the figure represent compositing: that is the class at the base +of the arrow holds a pointer to an instance of the class at the tip. +Dashed lines instead represent inheritance: the class to the tip of the +arrow is derived from the class at the base. Classes with a red boundary +are not instantiated directly, but they represent the base classes for +"styles". Those "styles" make up the bulk of the LAMMPS code and only +a few representative examples are included in the figure so it remains +readable. .. _class-topology: .. figure:: JPG/lammps-classes.png @@ -82,69 +89,76 @@ in the figure for demonstration purposes. derived classes, which may also hold instances of other classes. The :cpp:class:`LAMMPS_NS::LAMMPS` class is the topmost class and -represents what is referred to an "instance" of LAMMPS. It is a -composite holding references to instances of other core classes +represents what is generally referred to as an "instance" of LAMMPS. It is +a composite holding pointers to instances of other core classes providing the core functionality of the MD engine in LAMMPS and through them abstractions of the required operations. The constructor of the LAMMPS class will instantiate those instances, process the command line flags, initialize MPI (if not already done) and set up file pointers for -input and output. The destructor will shut everything down and free all +input and output. The destructor will shut everything down and free all associated memory. Thus code for the standalone LAMMPS executable in ``main.cpp`` simply initializes MPI, instantiates a single instance of -LAMMPS, and passes it the command line flags and input script. It +LAMMPS while passing it the command line flags and input script. It deletes the LAMMPS instance after the method reading the input returns and shuts down the MPI environment before it exits the executable. 
The :cpp:class:`LAMMPS_NS::Pointers` is not shown in the -:ref:`class-topology` figure, it holds references to members of the -`LAMMPS_NS::LAMMPS`, so that all classes derived from -:cpp:class:`LAMMPS_NS::Pointers` have direct access to those reference. -From the class topology all classes with blue boundary are referenced in -this class and all classes in the second and third columns, that are not -listed as derived classes are instead derived from -:cpp:class:`LAMMPS_NS::Pointers`. +:ref:`class-topology` figure for clarity. It holds references to many +of the members of the `LAMMPS_NS::LAMMPS`, so that all classes derived +from :cpp:class:`LAMMPS_NS::Pointers` have direct access to those +references. From the class topology all classes with blue boundary are +referenced in the Pointers class and all classes in the second and third +columns, that are not listed as derived classes are instead derived from +:cpp:class:`LAMMPS_NS::Pointers`. To initialize the pointer references +in Pointers, a pointer to the LAMMPS class instance needs to be passed +to the constructor and thus all constructors for classes derived from it +must do so and pass this pointer to the constructor for Pointers. -Since all storage is encapsulated, the LAMMPS class can also be -instantiated multiple times by a calling code, and that can be either -simultaneously or consecutively. When running in parallel with MPI, -care has to be taken, that suitable communicators are used to not -create conflicts between different instances. +Since all storage is supposed to be encapsulated (there are a few +exceptions), the LAMMPS class can also be instantiated multiple times by +a calling code. Outside of the aforementioned exceptions, those LAMMPS +instances can be used alternately. As of the time of this writing +(early 2021) LAMMPS is not yet sufficiently thread-safe for concurrent +execution. 
When running in parallel with MPI, care has to be taken, +that suitable copies of communicators are used to not create conflicts +between different instances. -The LAMMPS class currently holds instances of 19 classes representing -different core functionalities There are a handful of virtual parent -classes in LAMMPS that define what LAMMPS calls ``styles``. They are -shaded red in the :ref:`class-topology` figure. Each of these are +The LAMMPS class currently (early 2021) holds instances of 19 classes +representing the core functionality. There are a handful of virtual +parent classes in LAMMPS that define what LAMMPS calls ``styles``. They +are shaded red in the :ref:`class-topology` figure. Each of these are parents of a number of child classes that implement the interface defined by the parent class. There are two main categories of these ``styles``: some may only have one instance active at a time (e.g. atom, pair, bond, angle, dihedral, improper, kspace, comm) and there is a -dedicated pointer variable in the composite class that manages them. +dedicated pointer variable for each of them in the composite class. Setups that require a mix of different such styles have to use a -*hybrid* class that manages and forwards calls to the corresponding -sub-styles for the designated subset of atoms or data. or the composite -class may have lists of class instances, e.g. Modify handles lists of -compute and fix styles, while Output handles dumps class instances. +*hybrid* class that takes the place of the one allowed instance and then +manages and forwards calls to the corresponding sub-styles for the +designated subset of atoms or data. The composite class may also have +lists of class instances, e.g. Modify handles lists of compute and fix +styles, while Output handles a list of dump class instances. -The exception to this scheme are the ``command`` style classes. 
These -implement specific commands that can be invoked before, after, or between -runs or are commands which launch a simulation. For these an instance -of the class is created, its command() method called and then, after -completion, the class instance deleted. Examples for this are the -create_box, create_atoms, minimize, run, or velocity command styles. +The exception to this scheme are the ``command`` style classes. These +implement specific commands that can be invoked before, after, or in +between runs. For these an instance of the class is created, its +command() method called and then, after completion, the class instance +deleted. Examples for this are the create_box, create_atoms, minimize, +run, or velocity command styles. For all those ``styles`` certain naming conventions are employed: for -the fix nve command the class is called FixNVE and the files are +the fix nve command the class is called FixNVE and the source files are ``fix_nve.h`` and ``fix_nve.cpp``. Similarly for fix ave/time we have -FixAveTime and ``fix_ave_time.h`` and ``fix_ave_time.cpp``. Style names +FixAveTime and ``fix_ave_time.h`` and ``fix_ave_time.cpp``. Style names are lower case and without spaces or special characters. A suffix or -multiple appended with a forward slash '/' denotes a variant of the -corresponding class without the suffix. To connect the style name and -the class name, LAMMPS uses macros like the following ATOM\_CLASS, -PAIR\_CLASS, BOND\_CLASS, REGION\_CLASS, FIX\_CLASS, COMPUTE\_CLASS, -or DUMP\_CLASS in the corresponding header file. During compilation -files with the pattern ``style_name.h`` are created that contain include -statements including all headers of all styles of a given type that -are currently active (or "installed). +words are appended with a forward slash '/' which denotes a variant of +the corresponding class without the suffix. 
To connect the style name +and the class name, LAMMPS uses macros like: ``AtomStyle()``, +``PairStyle()``, ``BondStyle()``, ``RegionStyle()``, and so on in the +corresponding header file. During configuration or compilation files +with the pattern ``style_.h`` are created that consist of a list +of include statements including all headers of all styles of a given +type that are currently active (or "installed"). More details on individual classes in the :ref:`class-topology` are as @@ -152,11 +166,11 @@ follows: - The Memory class handles allocation of all large vectors and arrays. -- The Error class prints all error and warning messages. +- The Error class prints all (terminal) error and warning messages. -- The Universe class sets up partitions of processors so that multiple - simulations can be run, each on a subset of the processors allocated - for a run, e.g. by the mpirun command. +- The Universe class sets up one or more partitions of processors so + that one or multiple simulations can be run, on the processors + allocated for a run, e.g. by the mpirun command. - The Input class reads and processes input input strings and files, stores variables, and invokes :doc:`commands `. @@ -241,7 +255,8 @@ follows: .. TODO section on "Spatial decomposition and parallel operations" .. diagram of 3d processor grid, brick vs. tiled. local vs. ghost .. atoms, 6-way communication with pack/unpack functions, -.. PBC as part of the communication +.. PBC as part of the communication, forward and reverse communication +.. rendezvous communication, ring communication. .. TODO section on "Fixes, Computes, and Variables" .. how and when data is computed and provided and how it is diff --git a/doc/src/Developer_utils.rst b/doc/src/Developer_utils.rst index bf1cca7413..2945420b5a 100644 --- a/doc/src/Developer_utils.rst +++ b/doc/src/Developer_utils.rst @@ -71,12 +71,21 @@ and parsing files or arguments. ---------- +.. doxygenfunction:: strdup + :project: progguide + .. 
doxygenfunction:: trim :project: progguide .. doxygenfunction:: trim_comment :project: progguide +.. doxygenfunction:: has_utf8 + :project: progguide + +.. doxygenfunction:: utf8_subst + :project: progguide + .. doxygenfunction:: count_words(const char *text) :project: progguide @@ -286,6 +295,50 @@ This code example should produce the following output: ---------- + +Argument parsing classes +--------------------------- + +The purpose of argument parsing classes it to simplify and unify how +arguments of commands in LAMMPS are parsed and to make abstractions of +repetitive tasks. + +The :cpp:class:`LAMMPS_NS::ArgInfo` class provides an abstraction +for parsing references to compute or fix styles or variables. These +would start with a "c\_", "f\_", "v\_" followed by the ID or name of +than instance and may be postfixed with one or two array indices +"[]" with numbers > 0. + +A typical code segment would look like this: + +.. code-block:: C++ + :caption: Usage example for ArgInfo class + + int nvalues = 0; + for (iarg = 0; iarg < nargnew; iarg++) { + ArgInfo argi(arg[iarg]); + + which[nvalues] = argi.get_type(); + argindex[nvalues] = argi.get_index1(); + ids[nvalues] = argi.copy_name(); + + if ((which[nvalues] == ArgInfo::UNKNOWN) + || (which[nvalues] == ArgInfo::NONE) + || (argi.get_dim() > 1)) + error->all(FLERR,"Illegal compute XXX command"); + + nvalues++; + } + +---------- + +.. doxygenclass:: LAMMPS_NS::ArgInfo + :project: progguide + :members: + + +---------- + File reader classes ------------------- diff --git a/doc/src/JPG/WF_LJ.jpg b/doc/src/JPG/WF_LJ.jpg new file mode 100644 index 0000000000..fdacb8e80a Binary files /dev/null and b/doc/src/JPG/WF_LJ.jpg differ diff --git a/doc/src/Python_install.rst b/doc/src/Python_install.rst index c12644bf4a..134d3e22d2 100644 --- a/doc/src/Python_install.rst +++ b/doc/src/Python_install.rst @@ -69,7 +69,7 @@ this. 
cd build # configure LAMMPS compilation - cmake -C cmake/presets/minimal.cmake -D BUILD_SHARED_LIBS=on \ + cmake -C ../cmake/presets/minimal.cmake -D BUILD_SHARED_LIBS=on \ -D LAMMPS_EXCEPTIONS=on -D PKG_PYTHON=on ../cmake # compile LAMMPS @@ -97,10 +97,12 @@ this. For a system-wide installation you need to set ``CMAKE_INSTALL_PREFIX`` to a system folder like ``/usr`` (or - ``/usr/local``). The installation step (**not** the + ``/usr/local``); the default is ``${HOME}/.local``. The + installation step for a system folder installation (**not** the configuration/compilation) needs to be done with superuser privilege, e.g. by using ``sudo cmake --install .``. The - installation folders will then by changed to: + installation folders will then be changed to (assuming ``/usr`` as + prefix): +------------------------+---------------------------------------------------------+-------------------------------------------------------------+ | File | Location | Notes | diff --git a/doc/src/Speed_kokkos.rst b/doc/src/Speed_kokkos.rst index e7724eb229..708678e537 100644 --- a/doc/src/Speed_kokkos.rst +++ b/doc/src/Speed_kokkos.rst @@ -26,6 +26,15 @@ task). These are Serial (MPI-only for CPUs and Intel Phi), OpenMP GPUs) and HIP (for AMD GPUs). You choose the mode at build time to produce an executable compatible with a specific hardware. +.. admonition:: C++14 support + :class: note + + Kokkos requires using a compiler that supports the c++14 standard. For + some compilers, it may be necessary to add a flag to enable c++14 support. + For example, the GNU compiler uses the -std=c++14 flag. For a list of + compilers that have been tested with the Kokkos library, see the Kokkos + `README `_. + .. 
admonition:: NVIDIA CUDA support :class: note diff --git a/doc/src/fix_bond_react.rst b/doc/src/fix_bond_react.rst index 97717f59fc..b995239d08 100644 --- a/doc/src/fix_bond_react.rst +++ b/doc/src/fix_bond_react.rst @@ -41,7 +41,7 @@ Syntax * template-ID(post-reacted) = ID of a molecule template containing post-reaction topology * map_file = name of file specifying corresponding atom-IDs in the pre- and post-reacted templates * zero or more individual keyword/value pairs may be appended to each react argument -* individual_keyword = *prob* or *max_rxn* or *stabilize_steps* or *custom_charges* +* individual_keyword = *prob* or *max_rxn* or *stabilize_steps* or *custom_charges* or *molecule* or *modify_create* .. parsed-literal:: @@ -59,6 +59,12 @@ Syntax off = allow both inter- and intramolecular reactions (default) inter = search for reactions between molecules with different IDs intra = search for reactions within the same molecule + *modify_create* keyword values + *fit* value = *all* or *fragmentID* + all = use all eligible atoms for create-atoms fit (default) + fragmentID = ID of molecule fragment used for create-atoms fit + *overlap* value = R + R = only insert atom/molecule if further than R from existing particles (distance units) Examples """""""" @@ -89,7 +95,9 @@ documentation. Topology changes are defined in pre- and post-reaction molecule templates and can include creation and deletion of bonds, angles, dihedrals, impropers, bond types, angle types, dihedral types, atom types, or atomic charges. In addition, reaction by-products or -other molecules can be identified and deleted. +other molecules can be identified and deleted. Finally, atoms can be +created and inserted at specific positions relative to the reaction +site. Fix bond/react does not use quantum mechanical (eg. fix qmmm) or pairwise bond-order potential (eg. Tersoff or AIREBO) methods to @@ -262,14 +270,14 @@ command page. 
The post-reacted molecule template contains a sample of the reaction site and its surrounding topology after the reaction has occurred. It -must contain the same number of atoms as the pre-reacted template. A -one-to-one correspondence between the atom IDs in the pre- and -post-reacted templates is specified in the map file as described -below. Note that during a reaction, an atom, bond, etc. type may -change to one that was previously not present in the simulation. These -new types must also be defined during the setup of a given simulation. -A discussion of correctly handling this is also provided on the -:doc:`molecule ` command page. +must contain the same number of atoms as the pre-reacted template +(unless there are created atoms). A one-to-one correspondence between +the atom IDs in the pre- and post-reacted templates is specified in +the map file as described below. Note that during a reaction, an atom, +bond, etc. type may change to one that was previously not present in +the simulation. These new types must also be defined during the setup +of a given simulation. A discussion of correctly handling this is also +provided on the :doc:`molecule ` command page. .. note:: @@ -283,7 +291,7 @@ A discussion of correctly handling this is also provided on the The map file is a text document with the following format: A map file has a header and a body. The header of map file the -contains one mandatory keyword and four optional keywords. The +contains one mandatory keyword and five optional keywords. The mandatory keyword is 'equivalences': .. parsed-literal:: @@ -296,11 +304,12 @@ The optional keywords are 'edgeIDs', 'deleteIDs', 'chiralIDs' and .. 
parsed-literal:: N *edgeIDs* = # of edge atoms N in the pre-reacted molecule template - N *deleteIDs* = # of atoms N that are specified for deletion - N *chiralIDs* = # of specified chiral centers N - N *constraints* = # of specified reaction constraints N + N *deleteIDs* = # of atoms N that are deleted + N *createIDs* = # of atoms N that are created + N *chiralIDs* = # of chiral centers N + N *constraints* = # of reaction constraints N -The body of the map file contains two mandatory sections and four +The body of the map file contains two mandatory sections and five optional sections. The first mandatory section begins with the keyword 'InitiatorIDs' and lists the two atom IDs of the initiator atom pair in the pre-reacted molecule template. The second mandatory section @@ -313,8 +322,10 @@ the keyword 'EdgeIDs' and lists the atom IDs of edge atoms in the pre-reacted molecule template. The second optional section begins with the keyword 'DeleteIDs' and lists the atom IDs of pre-reaction template atoms to delete. The third optional section begins with the +keyword 'CreateIDs' and lists the atom IDs of the post-reaction +template atoms to create. The fourth optional section begins with the keyword 'ChiralIDs' lists the atom IDs of chiral atoms whose -handedness should be enforced. The fourth optional section begins with +handedness should be enforced. The fifth optional section begins with the keyword 'Constraints' and lists additional criteria that must be satisfied in order for the reaction to occur. Currently, there are five types of constraints available, as discussed below: 'distance', @@ -353,6 +364,38 @@ A sample map file is given below: ---------- +A user-specified set of atoms can be deleted by listing their +pre-reaction template IDs in the DeleteIDs section. A deleted atom +must still be included in the post-reaction molecule template, in +which it cannot be bonded to an atom that is not deleted. 
In addition +to deleting unwanted reaction by-products, this feature can be used to +remove specific topologies, such as small rings, that may be otherwise +indistinguishable. + +Atoms can be created by listing their post-reaction template IDs in +the CreateIDs section. A created atom should not be included in the +pre-reaction template. The inserted positions of created atoms are +determined by the coordinates of the post-reaction template, after +optimal translation and rotation of the post-reaction template to the +reaction site (using a fit with atoms that are neither created nor +deleted). The *modify_create* keyword can be used to modify the +default behavior when creating atoms. The *modify_create* keyword has +two sub-keywords, *fit* and *overlap*. One or more of the sub-keywords +may be used after the *modify_create* keyword. The *fit* sub-keyword +can be used to specify which post-reaction atoms are used for the +optimal translation and rotation of the post-reaction template. The +*fragmentID* value of the *fit* sub-keyword must be the name of a +molecule fragment defined in the post-reaction :doc:`molecule +` template, and only atoms in this fragment are used for the +fit. Atoms are created only if no current atom in the simulation is +within a distance R of any created atom, including the effect of +periodic boundary conditions if applicable. R is defined by the +*overlap* sub-keyword. Note that the default value for R is 0.0, which +will allow atoms to strongly overlap if you are inserting where other +atoms are present. The velocity of each created atom is initialized in +a random direction with a magnitude calculated from the instantaneous +temperature of the reaction site. + The handedness of atoms that are chiral centers can be enforced by listing their IDs in the ChiralIDs section. A chiral atom must be bonded to four atoms with mutually different atom types. This feature @@ -528,15 +571,6 @@ the same molecule ID are considered for the reaction. 
A few other considerations: -Many reactions result in one or more atoms that are considered -unwanted by-products. Therefore, bond/react provides the option to -delete a user-specified set of atoms. These pre-reaction atoms are -identified in the map file. A deleted atom must still be included in -the post-reaction molecule template, in which it cannot be bonded to -an atom that is not deleted. In addition to deleting unwanted reaction -by-products, this feature can be used to remove specific topologies, -such as small rings, that may be otherwise indistinguishable. - Optionally, you can enforce additional behaviors on reacting atoms. For example, it may be beneficial to force reacting atoms to remain at a certain temperature. For this, you can use the internally-created @@ -610,7 +644,7 @@ Default """"""" The option defaults are stabilization = no, prob = 1.0, stabilize_steps = 60, -reset_mol_ids = yes, custom_charges = no, molecule = off +reset_mol_ids = yes, custom_charges = no, molecule = off, modify_create = no ---------- diff --git a/doc/src/fix_ti_spring.rst b/doc/src/fix_ti_spring.rst index d569f707af..7e598ec471 100644 --- a/doc/src/fix_ti_spring.rst +++ b/doc/src/fix_ti_spring.rst @@ -139,11 +139,14 @@ output `. The default setting for this fix is This fix computes a global scalar and a global vector quantities which can be accessed by various :doc:`output commands `. The -scalar is the sum of the spring energy for each atom, where the -per-atom energy is 0.5 \* k \* r\^2. The vector has 2 positions, the -first one is the coupling parameter lambda and the second one is the -time derivative of lambda. The scalar and vector values calculated by -this fix are "extensive". +scalar is an energy which is the sum of the spring energy for each +atom, where the per-atom energy is 0.5 \* k \* r\^2. The vector stores +2 values. The first value is the coupling parameter lambda. 
The +second value is the derivative of lambda with respect to the integer +timestep *s*, i.e. d lambda / ds. In order to obtain d lambda / dt, +where t is simulation time, this 2nd value needs to be divided by the +timestep size (e.g. 0.5 fs). The scalar and vector values calculated +by this fix are "extensive". No parameter of this fix can be used with the *start/stop* keywords of the :doc:`run ` command. diff --git a/doc/src/pair_style.rst b/doc/src/pair_style.rst index a5fbb824f5..524581d2c4 100644 --- a/doc/src/pair_style.rst +++ b/doc/src/pair_style.rst @@ -327,6 +327,7 @@ accelerated styles exist. * :doc:`ufm ` - * :doc:`vashishta ` - Vashishta 2-body and 3-body potential * :doc:`vashishta/table ` - +* :doc:`wf/cut ` - Wang-Frenkel Potential for short-ranged interactions * :doc:`yukawa ` - Yukawa potential * :doc:`yukawa/colloid ` - screened Yukawa potential for finite-size particles * :doc:`zbl ` - Ziegler-Biersack-Littmark potential diff --git a/doc/src/pair_wf_cut.rst b/doc/src/pair_wf_cut.rst new file mode 100644 index 0000000000..e69b982d8a --- /dev/null +++ b/doc/src/pair_wf_cut.rst @@ -0,0 +1,117 @@ +.. index:: pair_style wf/cut + +pair_style wf/cut command +=========================== + +Syntax +"""""" + + +.. code-block:: LAMMPS + + pair_style wf/cut cutoff + +* cutoff = cutoff for wf interactions (distance units) + +Examples +"""""""" + + +.. code-block:: LAMMPS + + pair_style wf/cut 2.0 + pair_coeff 1 1 1.0 1.0 1 1 2.0 + +Description +""""""""""" + +The *wf/cut* (Wang-Frenkel) style computes LJ-like potentials as +described in :ref:`Wang2020 `. This potential is by +construction finite ranged and it vanishes quadratically at the cutoff +distance, avoiding truncation, shifting, interpolation and other typical +procedures with the LJ potential. The *wf/cut* can be used when a +typical short-ranged potential with attraction is required. The +potential is given by: + +.. 
math:: + \phi(r)= \epsilon \alpha \left(\left[{\sigma\over r}\right]^{2\mu} -1 \right)\left(\left[{r_c\over r}\right]^{2\mu}-1\right)^{2\nu} + +with + +.. math:: + \alpha=2\nu\left(\frac{r_c}{\sigma}\right)^{2\mu}\left[\frac{1+2\nu}{2\nu\left[(r_c/\sigma)^{2\mu}-1\right]}\right]^{2\nu+1} + +and + +.. math:: + r_{min}=r_c\left[\frac{1+2\nu}{1+2\nu(r_c/\sigma)^{2\nu}}\right]^{1/{2\nu}} + +:math:`r_c` is the cutoff. + +Comparison of the non-truncated Lennard-Jones 12-6 potential (red curve), +and the WF potentials with :math:`\mu=1` and :math:`\nu=1` are shown in +the figure below. The blue curve has :math:`r_c =2.0` and the green +curve has :math:`r_c =1.2` and can be used to describe colloidal +interactions. + +.. image:: JPG/WF_LJ.jpg + :align: center + :scale: 33% + + +The following coefficients must be defined for each pair of atoms +types via the :doc:`pair_coeff ` command as in the example +above, or in the data file or restart files read by the +:doc:`read_data ` or :doc:`read_restart ` +commands: + +* :math:`\epsilon` (energy units) +* :math:`\sigma` (distance units) +* :math:`\nu` +* :math:`\mu` +* :math:`r_c` (distance units) + +The last coefficient is optional. If not specified, the global cutoff +given in the pair_style command is used. The exponents :math:`\nu` and +:math:`\mu` are positive integers, usually set to 1. There is usually +little to be gained by choosing other values of :math:`\nu` and +:math:`\mu` (See discussion in :ref:`Wang2020 `) + +---------- + +**Mixing, shift, table, tail correction, restart, rRESPA info**\ : + +This pair style does not support the :doc:`pair_modify ` +mixing and table options. + +The :doc:`pair_modify ` tail option is not relevant +for this pair style as it goes to zero at the cut-off radius. + +This pair style writes its information to :doc:`binary restart files +`, so pair_style and pair_coeff commands do not need to be +specified in an input script that reads a restart file. 
+ +This pair style does not support the use of the *inner*\ , *middle*\ , +and *outer* keywords of the :doc:`run_style respa ` command. + +---------- + +Restrictions +"""""""""""" +This pair style can only be used if LAMMPS was built with the +USER-MISC package. See the :doc:`Build package ` doc +page for more info. + +Related commands +"""""""""""""""" + +:doc:`pair_coeff ` + +**Default:** none + + +---------- + +.. _Wang2020: + +**(Wang2020)** X. Wang, S. Ramirez-Hinestrosa, J. Dobnikar, and D. Frenkel, Phys. Chem. Chem. Phys. 22, 10624 (2020). diff --git a/doc/src/thermo_modify.rst b/doc/src/thermo_modify.rst index f525aef79a..4439f7732c 100644 --- a/doc/src/thermo_modify.rst +++ b/doc/src/thermo_modify.rst @@ -96,9 +96,11 @@ always include a divide by the number of atoms in the variable formula if this is not the case. The *flush* keyword invokes a flush operation after thermodynamic info -is written to the log file. This insures the output in that file is -current (no buffering by the OS), even if LAMMPS halts before the -simulation completes. +is written to the screen and log file. This ensures the output is +updated and not buffered (by the application) even if LAMMPS halts +before the simulation completes. Please note that this does not +affect buffering by the OS or devices, so you may still lose data +in case the simulation stops due to a hardware failure. 
The *line* keyword determines whether thermodynamics will be output as a series of numeric values on one line or in a multi-line format with diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index 7e55fe3638..e9ba170ac5 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -692,6 +692,7 @@ dmi dnf DNi Dobson +Dobnikar Dodds docenv dodgerblue @@ -1232,6 +1233,7 @@ Hibbs Higdon Hijazi Hilger +Hinestrosa histo histogrammed histogramming @@ -2476,6 +2478,9 @@ Poresag pos Poschel posix +postfix +postfixed +postfixes Postma Potapkin potin @@ -3406,6 +3411,7 @@ WeinanE Wennberg Westmere Westview +wf wget Whelan whitesmoke diff --git a/examples/USER/misc/agni/log.20Jan21.adatom.g++.1 b/examples/USER/misc/agni/log.20Jan21.adatom.g++.1 new file mode 100644 index 0000000000..edd0402824 --- /dev/null +++ b/examples/USER/misc/agni/log.20Jan21.adatom.g++.1 @@ -0,0 +1,92 @@ +LAMMPS (24 Dec 2020) + using 1 OpenMP thread(s) per MPI task + +processors * * 1 +units metal +boundary p p f +read_data adatom.data +Reading data file ... + orthogonal box = (0.0000000 0.0000000 0.0000000) to (17.121441 14.827603 39.319732) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 181 atoms + read_data CPU = 0.002 seconds + +pair_style agni +pair_coeff * * Al_jpc.agni Al +Reading agni potential file Al_jpc.agni with DATE: 2017-02-24 +WARNING: Ignoring unknown tag 'Rs' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) +WARNING: Ignoring unknown tag 'neighbors' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) +WARNING: Ignoring unknown tag 'lambda' in AGNI potential file. 
(src/USER-MISC/pair_agni.cpp:440) + +neighbor 0.3 bin +neigh_modify delay 2 check yes + +timestep 0.0005 +velocity all create 500 12345 +fix 1 all nvt temp 250 250 0.2 +fix 5 all momentum 1 linear 1 1 1 + +thermo 100 +thermo_style custom step ke temp + +# dump MyDump all custom 250 dump.atoms id type x y z vx vy vz fx fy fz + +run 1000 +Neighbor list info ... + update every 1 steps, delay 2 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 8.3 + ghost atom cutoff = 8.3 + binsize = 4.15, bins = 5 4 10 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair agni, perpetual + attributes: full, newton on + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 3.072 | 3.072 | 3.072 Mbytes +Step KinEng Temp + 0 11.633413 500 + 100 4.6059941 197.96401 + 200 7.3700156 316.76068 + 300 6.0443915 259.78582 + 400 6.163119 264.88869 + 500 6.2647284 269.25582 + 600 5.2732533 226.64257 + 700 5.651448 242.89725 + 800 6.5572404 281.82788 + 900 6.0576743 260.35671 + 1000 6.5622234 282.04205 +Loop time of 16.4158 on 1 procs for 1000 steps with 181 atoms + +Performance: 2.632 ns/day, 9.120 hours/ns, 60.917 timesteps/s +97.6% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 16.395 | 16.395 | 16.395 | 0.0 | 99.88 +Neigh | 0.013275 | 0.013275 | 0.013275 | 0.0 | 0.08 +Comm | 0.0023484 | 0.0023484 | 0.0023484 | 0.0 | 0.01 +Output | 0.00014842 | 0.00014842 | 0.00014842 | 0.0 | 0.00 +Modify | 0.0035522 | 0.0035522 | 0.0035522 | 0.0 | 0.02 +Other | | 0.001173 | | | 0.01 + +Nlocal: 181.000 ave 181 max 181 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 563.000 ave 563 max 563 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 19484.0 ave 19484 max 19484 min 
+Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 19484 +Ave neighs/atom = 107.64641 +Neighbor list builds = 33 +Dangerous builds = 0 + +Please see the log.cite file for references relevant to this simulation + +Total wall time: 0:00:16 diff --git a/examples/USER/misc/agni/log.20Jan21.adatom.g++.4 b/examples/USER/misc/agni/log.20Jan21.adatom.g++.4 new file mode 100644 index 0000000000..496f0580fd --- /dev/null +++ b/examples/USER/misc/agni/log.20Jan21.adatom.g++.4 @@ -0,0 +1,92 @@ +LAMMPS (24 Dec 2020) + using 1 OpenMP thread(s) per MPI task + +processors * * 1 +units metal +boundary p p f +read_data adatom.data +Reading data file ... + orthogonal box = (0.0000000 0.0000000 0.0000000) to (17.121441 14.827603 39.319732) + 2 by 2 by 1 MPI processor grid + reading atoms ... + 181 atoms + read_data CPU = 0.001 seconds + +pair_style agni +pair_coeff * * Al_jpc.agni Al +Reading agni potential file Al_jpc.agni with DATE: 2017-02-24 +WARNING: Ignoring unknown tag 'Rs' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) +WARNING: Ignoring unknown tag 'neighbors' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) +WARNING: Ignoring unknown tag 'lambda' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) + +neighbor 0.3 bin +neigh_modify delay 2 check yes + +timestep 0.0005 +velocity all create 500 12345 +fix 1 all nvt temp 250 250 0.2 +fix 5 all momentum 1 linear 1 1 1 + +thermo 100 +thermo_style custom step ke temp + +# dump MyDump all custom 250 dump.atoms id type x y z vx vy vz fx fy fz + +run 1000 +Neighbor list info ... 
+ update every 1 steps, delay 2 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 8.3 + ghost atom cutoff = 8.3 + binsize = 4.15, bins = 5 4 10 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair agni, perpetual + attributes: full, newton on + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 3.063 | 3.063 | 3.063 Mbytes +Step KinEng Temp + 0 11.633413 500 + 100 4.6059939 197.964 + 200 7.3700154 316.76067 + 300 6.0443914 259.78582 + 400 6.1631193 264.8887 + 500 6.2647281 269.25581 + 600 5.273254 226.6426 + 700 5.6514484 242.89726 + 800 6.5572409 281.82791 + 900 6.0576737 260.35668 + 1000 6.5622233 282.04205 +Loop time of 4.67437 on 4 procs for 1000 steps with 181 atoms + +Performance: 9.242 ns/day, 2.597 hours/ns, 213.933 timesteps/s +98.5% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 4.0668 | 4.2058 | 4.4078 | 7.1 | 89.98 +Neigh | 0.0033048 | 0.0033794 | 0.0034381 | 0.1 | 0.07 +Comm | 0.2547 | 0.45656 | 0.59576 | 21.5 | 9.77 +Output | 9.8817e-05 | 0.00035464 | 0.001121 | 0.0 | 0.01 +Modify | 0.0059429 | 0.0060754 | 0.0061966 | 0.2 | 0.13 +Other | | 0.002172 | | | 0.05 + +Nlocal: 45.2500 ave 52 max 40 min +Histogram: 1 0 0 1 1 0 0 0 0 1 +Nghost: 376.500 ave 382 max 366 min +Histogram: 1 0 0 0 0 0 0 0 2 1 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +FullNghs: 4871.00 ave 5578 max 4374 min +Histogram: 1 0 1 1 0 0 0 0 0 1 + +Total # of neighbors = 19484 +Ave neighs/atom = 107.64641 +Neighbor list builds = 33 +Dangerous builds = 0 + +Please see the log.cite file for references relevant to this simulation + +Total wall time: 0:00:04 diff --git a/examples/USER/misc/agni/log.20Jan21.vacancy.g++.1 b/examples/USER/misc/agni/log.20Jan21.vacancy.g++.1 new file mode 100644 
index 0000000000..d2f97b8b4c --- /dev/null +++ b/examples/USER/misc/agni/log.20Jan21.vacancy.g++.1 @@ -0,0 +1,92 @@ +LAMMPS (24 Dec 2020) + using 1 OpenMP thread(s) per MPI task + +units metal +boundary p p p +read_data vacancy.data +Reading data file ... + orthogonal box = (0.0000000 0.0000000 0.0000000) to (8.0711250 8.0711250 8.0711250) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 31 atoms + read_data CPU = 0.002 seconds + +pair_style agni +pair_coeff * * Al_jpc.agni Al +Reading agni potential file Al_jpc.agni with DATE: 2017-02-24 +WARNING: Ignoring unknown tag 'Rs' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) +WARNING: Ignoring unknown tag 'neighbors' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) +WARNING: Ignoring unknown tag 'lambda' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) +neighbor 0.3 bin +neigh_modify delay 2 check yes + + +timestep 0.0005 +velocity all create 1000 12345 +fix 1 all nvt temp 900 900 200 +fix 5 all momentum 1 linear 1 1 1 + + +thermo 100 +thermo_style custom step ke etotal temp + +dump MyDump all custom 250 dump.atoms id type x y z vx vy vz fx fy fz + +run 1000 +Neighbor list info ... 
+ update every 1 steps, delay 2 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 8.3 + ghost atom cutoff = 8.3 + binsize = 4.15, bins = 2 2 2 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair agni, perpetual + attributes: full, newton on + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 4.241 | 4.241 | 4.241 Mbytes +Step KinEng TotEng Temp + 0 3.8778043 3.8778043 1000 + 100 2.9986261 2.9986261 773.27936 + 200 3.6860313 3.6860313 950.54598 + 300 3.8133153 3.8133153 983.3697 + 400 3.7330285 3.7330285 962.6655 + 500 3.5875467 3.5875467 925.14897 + 600 3.533152 3.533152 911.12178 + 700 2.6509457 2.6509457 683.62028 + 800 3.376349 3.376349 870.68576 + 900 3.9036736 3.9036736 1006.6711 + 1000 3.0884833 3.0884833 796.45156 +Loop time of 2.92678 on 1 procs for 1000 steps with 31 atoms + +Performance: 14.760 ns/day, 1.626 hours/ns, 341.673 timesteps/s +97.5% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 2.9144 | 2.9144 | 2.9144 | 0.0 | 99.58 +Neigh | 0.0068263 | 0.0068263 | 0.0068263 | 0.0 | 0.23 +Comm | 0.0029868 | 0.0029868 | 0.0029868 | 0.0 | 0.10 +Output | 0.00050202 | 0.00050202 | 0.00050202 | 0.0 | 0.02 +Modify | 0.0013382 | 0.0013382 | 0.0013382 | 0.0 | 0.05 +Other | | 0.0007672 | | | 0.03 + +Nlocal: 31.0000 ave 31 max 31 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 869.000 ave 869 max 869 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 4360.00 ave 4360 max 4360 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 4360 +Ave neighs/atom = 140.64516 +Neighbor list builds = 53 +Dangerous builds = 0 + +Please see the log.cite file for references relevant to this simulation + +Total wall time: 0:00:02 diff --git 
a/examples/USER/misc/agni/log.20Jan21.vacancy.g++.4 b/examples/USER/misc/agni/log.20Jan21.vacancy.g++.4 new file mode 100644 index 0000000000..79fa2ace57 --- /dev/null +++ b/examples/USER/misc/agni/log.20Jan21.vacancy.g++.4 @@ -0,0 +1,92 @@ +LAMMPS (24 Dec 2020) + using 1 OpenMP thread(s) per MPI task + +units metal +boundary p p p +read_data vacancy.data +Reading data file ... + orthogonal box = (0.0000000 0.0000000 0.0000000) to (8.0711250 8.0711250 8.0711250) + 1 by 2 by 2 MPI processor grid + reading atoms ... + 31 atoms + read_data CPU = 0.001 seconds + +pair_style agni +pair_coeff * * Al_jpc.agni Al +Reading agni potential file Al_jpc.agni with DATE: 2017-02-24 +WARNING: Ignoring unknown tag 'Rs' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) +WARNING: Ignoring unknown tag 'neighbors' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) +WARNING: Ignoring unknown tag 'lambda' in AGNI potential file. (src/USER-MISC/pair_agni.cpp:440) +neighbor 0.3 bin +neigh_modify delay 2 check yes + + +timestep 0.0005 +velocity all create 1000 12345 +fix 1 all nvt temp 900 900 200 +fix 5 all momentum 1 linear 1 1 1 + + +thermo 100 +thermo_style custom step ke etotal temp + +dump MyDump all custom 250 dump.atoms id type x y z vx vy vz fx fy fz + +run 1000 +Neighbor list info ... 
+ update every 1 steps, delay 2 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 8.3 + ghost atom cutoff = 8.3 + binsize = 4.15, bins = 2 2 2 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair agni, perpetual + attributes: full, newton on + pair build: full/bin/atomonly + stencil: full/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 4.227 | 4.227 | 4.227 Mbytes +Step KinEng TotEng Temp + 0 3.8778043 3.8778043 1000 + 100 2.9986264 2.9986264 773.27944 + 200 3.6860316 3.6860316 950.54606 + 300 3.8133152 3.8133152 983.36966 + 400 3.7330288 3.7330288 962.66559 + 500 3.5875468 3.5875468 925.149 + 600 3.5331519 3.5331519 911.12176 + 700 2.6509452 2.6509452 683.62015 + 800 3.3763492 3.3763492 870.68579 + 900 3.9036736 3.9036736 1006.6711 + 1000 3.0884821 3.0884821 796.45125 +Loop time of 0.91769 on 4 procs for 1000 steps with 31 atoms + +Performance: 47.075 ns/day, 0.510 hours/ns, 1089.693 timesteps/s +95.2% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.67405 | 0.76047 | 0.81748 | 6.1 | 82.87 +Neigh | 0.0015411 | 0.001691 | 0.001774 | 0.2 | 0.18 +Comm | 0.091364 | 0.14959 | 0.23513 | 13.8 | 16.30 +Output | 0.00027996 | 0.00040391 | 0.00075917 | 0.0 | 0.04 +Modify | 0.0028397 | 0.0039247 | 0.0050072 | 1.7 | 0.43 +Other | | 0.001611 | | | 0.18 + +Nlocal: 7.75000 ave 8 max 7 min +Histogram: 1 0 0 0 0 0 0 0 0 3 +Nghost: 617.250 ave 621 max 612 min +Histogram: 1 0 0 0 0 0 2 0 0 1 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +FullNghs: 1090.00 ave 1131 max 993 min +Histogram: 1 0 0 0 0 0 0 0 1 2 + +Total # of neighbors = 4360 +Ave neighs/atom = 140.64516 +Neighbor list builds = 53 +Dangerous builds = 0 + +Please see the log.cite file for references relevant to this simulation + +Total wall time: 0:00:00 diff --git 
a/examples/USER/misc/agni/log.21Feb17.adatom.g++.1 b/examples/USER/misc/agni/log.21Feb17.adatom.g++.1 deleted file mode 100644 index 5243b83ec9..0000000000 --- a/examples/USER/misc/agni/log.21Feb17.adatom.g++.1 +++ /dev/null @@ -1,87 +0,0 @@ -LAMMPS (21 Feb 2017) - using 1 OpenMP thread(s) per MPI task - -processors * * 1 -units metal -boundary p p f -read_data adatom.data - orthogonal box = (0 0 0) to (17.1214 14.8276 39.3197) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 181 atoms - -pair_style agni -pair_coeff * * Al_jpc.agni Al -Reading potential file Al_jpc.agni with DATE: 2017-02-24 - -neighbor 0.3 bin -neigh_modify delay 2 check yes - -timestep 0.0005 -velocity all create 500 12345 -fix 1 all nvt temp 250 250 0.2 -fix 5 all momentum 1 linear 1 1 1 - -thermo 100 -thermo_style custom step ke temp - -# dump MyDump all custom 250 dump.atoms id type x y z vx vy vz fx fy fz - -run 1000 -Neighbor list info ... - update every 1 steps, delay 2 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 8.3 - ghost atom cutoff = 8.3 - binsize = 4.15, bins = 5 4 10 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair agni, perpetual - attributes: full, newton on - pair build: full/bin/atomonly - stencil: full/bin/3d - bin: standard -Memory usage per processor = 2.69795 Mbytes -Step KinEng Temp - 0 11.633413 500 - 100 4.6059939 197.964 - 200 7.3700149 316.76065 - 300 6.0443913 259.78581 - 400 6.1631189 264.88868 - 500 6.2647272 269.25577 - 600 5.2732539 226.6426 - 700 5.6514471 242.89721 - 800 6.5572407 281.8279 - 900 6.0576738 260.35669 - 1000 6.5622233 282.04205 -Loop time of 51.9308 on 1 procs for 1000 steps with 181 atoms - -Performance: 0.832 ns/day, 28.850 hours/ns, 19.256 timesteps/s -99.4% CPU use with 1 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 51.89 | 51.89 
| 51.89 | 0.0 | 99.92 -Neigh | 0.023158 | 0.023158 | 0.023158 | 0.0 | 0.04 -Comm | 0.0049036 | 0.0049036 | 0.0049036 | 0.0 | 0.01 -Output | 0.0002594 | 0.0002594 | 0.0002594 | 0.0 | 0.00 -Modify | 0.010244 | 0.010244 | 0.010244 | 0.0 | 0.02 -Other | | 0.002483 | | | 0.00 - -Nlocal: 181 ave 181 max 181 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 563 ave 563 max 563 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 0 ave 0 max 0 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -FullNghs: 19484 ave 19484 max 19484 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 19484 -Ave neighs/atom = 107.646 -Neighbor list builds = 33 -Dangerous builds = 0 - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:00:52 diff --git a/examples/USER/misc/agni/log.21Feb17.adatom.g++.4 b/examples/USER/misc/agni/log.21Feb17.adatom.g++.4 deleted file mode 100644 index 9419888e98..0000000000 --- a/examples/USER/misc/agni/log.21Feb17.adatom.g++.4 +++ /dev/null @@ -1,87 +0,0 @@ -LAMMPS (21 Feb 2017) - using 1 OpenMP thread(s) per MPI task - -processors * * 1 -units metal -boundary p p f -read_data adatom.data - orthogonal box = (0 0 0) to (17.1214 14.8276 39.3197) - 2 by 2 by 1 MPI processor grid - reading atoms ... - 181 atoms - -pair_style agni -pair_coeff * * Al_jpc.agni Al -Reading potential file Al_jpc.agni with DATE: 2017-02-24 - -neighbor 0.3 bin -neigh_modify delay 2 check yes - -timestep 0.0005 -velocity all create 500 12345 -fix 1 all nvt temp 250 250 0.2 -fix 5 all momentum 1 linear 1 1 1 - -thermo 100 -thermo_style custom step ke temp - -# dump MyDump all custom 250 dump.atoms id type x y z vx vy vz fx fy fz - -run 1000 -Neighbor list info ... 
- update every 1 steps, delay 2 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 8.3 - ghost atom cutoff = 8.3 - binsize = 4.15, bins = 5 4 10 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair agni, perpetual - attributes: full, newton on - pair build: full/bin/atomonly - stencil: full/bin/3d - bin: standard -Memory usage per processor = 3.06041 Mbytes -Step KinEng Temp - 0 11.633413 500 - 100 4.6059941 197.96401 - 200 7.3700154 316.76067 - 300 6.0443913 259.78581 - 400 6.1631193 264.8887 - 500 6.2647281 269.25581 - 600 5.2732537 226.64259 - 700 5.651448 242.89725 - 800 6.5572405 281.82789 - 900 6.0576741 260.3567 - 1000 6.562224 282.04208 -Loop time of 14.5263 on 4 procs for 1000 steps with 181 atoms - -Performance: 2.974 ns/day, 8.070 hours/ns, 68.841 timesteps/s -99.3% CPU use with 4 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 12.858 | 13.227 | 13.574 | 8.0 | 91.06 -Neigh | 0.0056965 | 0.0058173 | 0.0060787 | 0.2 | 0.04 -Comm | 0.92934 | 1.276 | 1.6455 | 25.7 | 8.78 -Output | 0.00013971 | 0.00017625 | 0.00025463 | 0.0 | 0.00 -Modify | 0.012693 | 0.012756 | 0.012911 | 0.1 | 0.09 -Other | | 0.004066 | | | 0.03 - -Nlocal: 45.25 ave 52 max 40 min -Histogram: 1 0 0 1 1 0 0 0 0 1 -Nghost: 376.5 ave 382 max 366 min -Histogram: 1 0 0 0 0 0 0 0 2 1 -Neighs: 0 ave 0 max 0 min -Histogram: 4 0 0 0 0 0 0 0 0 0 -FullNghs: 4871 ave 5578 max 4374 min -Histogram: 1 0 1 1 0 0 0 0 0 1 - -Total # of neighbors = 19484 -Ave neighs/atom = 107.646 -Neighbor list builds = 33 -Dangerous builds = 0 - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:00:14 diff --git a/examples/USER/misc/agni/log.21Feb17.vacancy.g++.1 b/examples/USER/misc/agni/log.21Feb17.vacancy.g++.1 deleted file mode 100644 index 9194d78419..0000000000 --- 
a/examples/USER/misc/agni/log.21Feb17.vacancy.g++.1 +++ /dev/null @@ -1,87 +0,0 @@ -LAMMPS (21 Feb 2017) - using 1 OpenMP thread(s) per MPI task - -units metal -boundary p p p -read_data vacancy.data - orthogonal box = (0 0 0) to (8.07113 8.07113 8.07113) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 31 atoms - -pair_style agni -pair_coeff * * Al_jpc.agni Al -Reading potential file Al_jpc.agni with DATE: 2017-02-24 -neighbor 0.3 bin -neigh_modify delay 2 check yes - - -timestep 0.0005 -velocity all create 1000 12345 -fix 1 all nvt temp 900 900 200 -fix 5 all momentum 1 linear 1 1 1 - - -thermo 100 -thermo_style custom step ke etotal temp - -# dump MyDump all custom 250 dump.atoms id type x y z vx vy vz fx fy fz - -run 1000 -Neighbor list info ... - update every 1 steps, delay 2 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 8.3 - ghost atom cutoff = 8.3 - binsize = 4.15, bins = 2 2 2 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair agni, perpetual - attributes: full, newton on - pair build: full/bin/atomonly - stencil: full/bin/3d - bin: standard -Memory usage per processor = 2.73416 Mbytes -Step KinEng TotEng Temp - 0 3.8778043 3.8778043 1000 - 100 2.9986261 2.9986261 773.27937 - 200 3.6860314 3.6860314 950.54599 - 300 3.813315 3.813315 983.36961 - 400 3.7330285 3.7330285 962.6655 - 500 3.5875467 3.5875467 925.14896 - 600 3.5331529 3.5331529 911.12202 - 700 2.6509449 2.6509449 683.62008 - 800 3.3763492 3.3763492 870.68582 - 900 3.903673 3.903673 1006.6709 - 1000 3.0884824 3.0884824 796.45133 -Loop time of 9.02712 on 1 procs for 1000 steps with 31 atoms - -Performance: 4.786 ns/day, 5.015 hours/ns, 110.777 timesteps/s -99.4% CPU use with 1 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 9.0039 | 9.0039 | 9.0039 | 0.0 | 99.74 -Neigh | 0.011892 | 
0.011892 | 0.011892 | 0.0 | 0.13 -Comm | 0.0061693 | 0.0061693 | 0.0061693 | 0.0 | 0.07 -Output | 0.00014615 | 0.00014615 | 0.00014615 | 0.0 | 0.00 -Modify | 0.0035009 | 0.0035009 | 0.0035009 | 0.0 | 0.04 -Other | | 0.001521 | | | 0.02 - -Nlocal: 31 ave 31 max 31 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 869 ave 869 max 869 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 0 ave 0 max 0 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -FullNghs: 4360 ave 4360 max 4360 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 4360 -Ave neighs/atom = 140.645 -Neighbor list builds = 53 -Dangerous builds = 0 - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:00:09 diff --git a/examples/USER/misc/agni/log.21Feb17.vacancy.g++.4 b/examples/USER/misc/agni/log.21Feb17.vacancy.g++.4 deleted file mode 100644 index d5d8e3a481..0000000000 --- a/examples/USER/misc/agni/log.21Feb17.vacancy.g++.4 +++ /dev/null @@ -1,87 +0,0 @@ -LAMMPS (21 Feb 2017) - using 1 OpenMP thread(s) per MPI task - -units metal -boundary p p p -read_data vacancy.data - orthogonal box = (0 0 0) to (8.07113 8.07113 8.07113) - 1 by 2 by 2 MPI processor grid - reading atoms ... - 31 atoms - -pair_style agni -pair_coeff * * Al_jpc.agni Al -Reading potential file Al_jpc.agni with DATE: 2017-02-24 -neighbor 0.3 bin -neigh_modify delay 2 check yes - - -timestep 0.0005 -velocity all create 1000 12345 -fix 1 all nvt temp 900 900 200 -fix 5 all momentum 1 linear 1 1 1 - - -thermo 100 -thermo_style custom step ke etotal temp - -# dump MyDump all custom 250 dump.atoms id type x y z vx vy vz fx fy fz - -run 1000 -Neighbor list info ... 
- update every 1 steps, delay 2 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 8.3 - ghost atom cutoff = 8.3 - binsize = 4.15, bins = 2 2 2 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair agni, perpetual - attributes: full, newton on - pair build: full/bin/atomonly - stencil: full/bin/3d - bin: standard -Memory usage per processor = 2.72175 Mbytes -Step KinEng TotEng Temp - 0 3.8778044 3.8778044 1000 - 100 2.9986263 2.9986263 773.27942 - 200 3.6860315 3.6860315 950.54602 - 300 3.8133145 3.8133145 983.3695 - 400 3.7330282 3.7330282 962.66543 - 500 3.5875466 3.5875466 925.14895 - 600 3.5331523 3.5331523 911.12186 - 700 2.6509448 2.6509448 683.62005 - 800 3.3763493 3.3763493 870.68584 - 900 3.9036733 3.9036733 1006.671 - 1000 3.0884818 3.0884818 796.45116 -Loop time of 2.46785 on 4 procs for 1000 steps with 31 atoms - -Performance: 17.505 ns/day, 1.371 hours/ns, 405.212 timesteps/s -99.2% CPU use with 4 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 2.0737 | 2.299 | 2.3902 | 8.7 | 93.16 -Neigh | 0.0025222 | 0.0027327 | 0.0028174 | 0.2 | 0.11 -Comm | 0.059817 | 0.15141 | 0.37684 | 33.8 | 6.14 -Output | 0.0001502 | 0.00016767 | 0.00021219 | 0.0 | 0.01 -Modify | 0.0098755 | 0.010248 | 0.010664 | 0.3 | 0.42 -Other | | 0.004321 | | | 0.18 - -Nlocal: 7.75 ave 8 max 7 min -Histogram: 1 0 0 0 0 0 0 0 0 3 -Nghost: 617.25 ave 621 max 612 min -Histogram: 1 0 0 0 0 0 2 0 0 1 -Neighs: 0 ave 0 max 0 min -Histogram: 4 0 0 0 0 0 0 0 0 0 -FullNghs: 1090 ave 1131 max 993 min -Histogram: 1 0 0 0 0 0 0 0 1 2 - -Total # of neighbors = 4360 -Ave neighs/atom = 140.645 -Neighbor list builds = 53 -Dangerous builds = 0 - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:00:02 diff --git 
a/examples/USER/reaction/create_atoms_polystyrene/grow_styrene.map b/examples/USER/reaction/create_atoms_polystyrene/grow_styrene.map new file mode 100644 index 0000000000..88d282690c --- /dev/null +++ b/examples/USER/reaction/create_atoms_polystyrene/grow_styrene.map @@ -0,0 +1,66 @@ +map file: styrene growth + +1 edgeIDs +30 equivalences +16 createIDs + +InitiatorIDs + +4 +13 + +EdgeIDs + +30 + +CreateIDs + +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 + +Equivalences + +1 45 +2 46 +3 44 +4 43 +5 42 +6 41 +7 40 +8 39 +9 38 +10 37 +11 36 +12 35 +13 34 +14 33 +15 32 +16 31 +17 17 +18 18 +19 19 +20 20 +21 21 +22 22 +23 23 +24 24 +25 25 +26 26 +27 27 +28 28 +29 29 +30 30 diff --git a/examples/USER/reaction/create_atoms_polystyrene/grow_styrene_post.data_template b/examples/USER/reaction/create_atoms_polystyrene/grow_styrene_post.data_template new file mode 100755 index 0000000000..de0c2383bb --- /dev/null +++ b/examples/USER/reaction/create_atoms_polystyrene/grow_styrene_post.data_template @@ -0,0 +1,456 @@ +molecule template: end of chain plus polymerized styrene + +46 atoms +48 bonds +81 angles +121 dihedrals +35 impropers +1 fragments + +Fragments + +create_fit 34 44 + +Types + +1 1 +2 2 +3 1 +4 5 +5 1 +6 2 +7 1 +8 2 +9 1 +10 2 +11 1 +12 2 +13 2 +14 6 +15 2 +16 2 +17 1 +18 2 +19 1 +20 5 +21 1 +22 2 +23 1 +24 2 +25 1 +26 2 +27 1 +28 2 +29 2 +30 6 +31 1 +32 2 +33 1 +34 5 +35 1 +36 2 +37 1 +38 2 +39 1 +40 2 +41 1 +42 2 +43 2 +44 6 +45 2 +46 2 + +Charges + +1 -0.129000 +2 0.123700 +3 0.026600 +4 -0.018200 +5 -0.129000 +6 0.123700 +7 -0.173400 +8 0.140300 +9 -0.113400 +10 0.128800 +11 -0.173400 +12 0.140300 +13 0.051600 +14 -0.069600 +15 0.035400 +16 0.035400 +17 -0.129000 +18 0.123700 +19 0.026600 +20 -0.018200 +21 -0.129000 +22 0.123700 +23 -0.173400 +24 0.140300 +25 -0.113400 +26 0.128800 +27 -0.173400 +28 0.140300 +29 0.051600 +30 -0.069600 +31 -0.129000 +32 0.123700 +33 0.026600 +34 -0.018200 +35 -0.129000 +36 0.123700 +37 -0.173400 +38 0.140300 +39 
-0.113400 +40 0.128800 +41 -0.173400 +42 0.140300 +43 0.051600 +44 -0.069600 +45 0.035400 +46 0.035400 + +Coords + +1 24.130699 1.043900 -1.309300 +2 25.062700 1.582900 -1.309300 +3 22.900700 1.753900 -1.309300 +4 22.900700 3.253900 -1.309300 +5 21.670700 1.043900 -1.309300 +6 20.738701 1.582900 -1.309300 +7 21.670700 -0.376100 -1.309300 +8 20.738701 -0.915100 -1.309300 +9 22.900700 -1.086100 -1.309300 +10 22.900700 -2.163100 -1.309300 +11 24.130699 -0.376100 -1.309300 +12 25.062700 -0.915100 -1.309300 +13 23.766701 3.658900 -0.952300 +14 21.622700 3.802900 -1.871300 +15 21.672701 4.544900 -1.970300 +16 20.979700 2.979900 -2.165300 +17 13.465800 0.682500 -1.658900 +18 14.397800 1.221500 -1.658900 +19 12.235800 1.392500 -1.658900 +20 12.235800 2.892500 -1.658900 +21 11.005800 0.682500 -1.658900 +22 10.073800 1.221500 -1.658900 +23 11.005800 -0.737500 -1.658900 +24 10.073800 -1.276500 -1.658900 +25 12.235800 -1.447500 -1.658900 +26 12.235800 -2.524500 -1.658900 +27 13.465800 -0.737500 -1.658900 +28 14.397800 -1.276500 -1.658900 +29 13.101800 3.297500 -1.301900 +30 10.957800 3.441500 -2.220900 +31 18.663500 0.855500 -1.372100 +32 19.595501 1.394500 -1.372100 +33 17.433500 1.565500 -1.372100 +34 17.433500 3.065500 -1.372100 +35 16.203501 0.855500 -1.372100 +36 15.271500 1.394500 -1.372100 +37 16.203501 -0.564500 -1.372100 +38 15.271500 -1.103500 -1.372100 +39 17.433500 -1.274500 -1.372100 +40 17.433500 -2.351500 -1.372100 +41 18.663500 -0.564500 -1.372100 +42 19.595501 -1.103500 -1.372100 +43 18.299500 3.470500 -1.015100 +44 16.155500 3.614500 -1.934100 +45 16.205500 4.356500 -2.033100 +46 15.512500 2.791500 -2.228100 + +Bonds + +1 1 1 2 +2 2 1 3 +3 2 1 11 +4 11 3 4 +5 2 3 5 +6 12 13 4 +7 13 4 14 +8 1 5 6 +9 2 5 7 +10 1 7 8 +11 2 7 9 +12 1 9 10 +13 2 9 11 +14 1 11 12 +15 10 15 14 +16 10 16 14 +17 9 14 34 +18 1 17 18 +19 2 17 19 +20 2 17 27 +21 7 19 20 +22 2 19 21 +23 8 29 20 +24 9 30 20 +25 9 44 20 +26 1 21 22 +27 2 21 23 +28 1 23 24 +29 2 23 25 +30 1 25 26 +31 2 25 27 
+32 1 27 28 +33 1 31 32 +34 2 31 33 +35 2 31 41 +36 7 33 34 +37 2 33 35 +38 8 43 34 +39 9 44 34 +40 1 35 36 +41 2 35 37 +42 1 37 38 +43 2 37 39 +44 1 39 40 +45 2 39 41 +46 1 41 42 +47 10 45 44 +48 10 46 44 + +Angles + +1 1 3 1 2 +2 1 11 1 2 +3 2 3 1 11 +4 17 1 3 4 +5 2 1 3 5 +6 17 5 3 4 +7 18 3 4 13 +8 19 3 4 14 +9 20 13 4 14 +10 1 3 5 6 +11 2 3 5 7 +12 1 7 5 6 +13 1 5 7 8 +14 2 5 7 9 +15 1 9 7 8 +16 1 7 9 10 +17 2 7 9 11 +18 1 11 9 10 +19 2 1 11 9 +20 1 1 11 12 +21 1 9 11 12 +22 21 15 14 4 +23 21 16 14 4 +24 22 4 14 34 +25 15 15 14 16 +26 14 15 14 34 +27 14 16 14 34 +28 1 19 17 18 +29 1 27 17 18 +30 2 19 17 27 +31 9 17 19 20 +32 2 17 19 21 +33 9 21 19 20 +34 10 19 20 29 +35 11 19 20 30 +36 11 19 20 44 +37 12 29 20 30 +38 12 29 20 44 +39 13 30 20 44 +40 1 19 21 22 +41 2 19 21 23 +42 1 23 21 22 +43 1 21 23 24 +44 2 21 23 25 +45 1 25 23 24 +46 1 23 25 26 +47 2 23 25 27 +48 1 27 25 26 +49 2 17 27 25 +50 1 17 27 28 +51 1 25 27 28 +52 1 33 31 32 +53 1 41 31 32 +54 2 33 31 41 +55 9 31 33 34 +56 2 31 33 35 +57 9 35 33 34 +58 11 33 34 14 +59 12 43 34 14 +60 13 14 34 44 +61 10 33 34 43 +62 11 33 34 44 +63 12 43 34 44 +64 1 33 35 36 +65 2 33 35 37 +66 1 37 35 36 +67 1 35 37 38 +68 2 35 37 39 +69 1 39 37 38 +70 1 37 39 40 +71 2 37 39 41 +72 1 41 39 40 +73 2 31 41 39 +74 1 31 41 42 +75 1 39 41 42 +76 16 20 44 34 +77 14 45 44 20 +78 14 46 44 20 +79 14 45 44 34 +80 14 46 44 34 +81 15 45 44 46 + +Dihedrals + +1 20 2 1 3 4 +2 2 5 3 1 2 +3 21 11 1 3 4 +4 4 11 1 3 5 +5 2 9 11 1 2 +6 5 2 1 11 12 +7 4 3 1 11 9 +8 2 3 1 11 12 +9 22 1 3 4 13 +10 23 1 3 4 14 +11 22 5 3 4 13 +12 23 5 3 4 14 +13 2 1 3 5 6 +14 4 1 3 5 7 +15 20 6 5 3 4 +16 21 7 5 3 4 +17 24 3 4 14 15 +18 24 3 4 14 16 +19 25 3 4 14 34 +20 26 13 4 14 15 +21 26 13 4 14 16 +22 27 13 4 14 34 +23 2 3 5 7 8 +24 4 3 5 7 9 +25 5 6 5 7 8 +26 2 9 7 5 6 +27 2 5 7 9 10 +28 4 5 7 9 11 +29 5 8 7 9 10 +30 2 11 9 7 8 +31 4 7 9 11 1 +32 2 7 9 11 12 +33 2 1 11 9 10 +34 5 10 9 11 12 +35 28 4 14 34 33 +36 29 4 14 34 43 +37 30 4 14 34 44 +38 31 
15 14 34 33 +39 32 15 14 34 43 +40 33 15 14 34 44 +41 31 16 14 34 33 +42 32 16 14 34 43 +43 33 16 14 34 44 +44 10 18 17 19 20 +45 2 21 19 17 18 +46 11 27 17 19 20 +47 4 27 17 19 21 +48 2 25 27 17 18 +49 5 18 17 27 28 +50 4 19 17 27 25 +51 2 19 17 27 28 +52 12 17 19 20 29 +53 13 17 19 20 30 +54 13 17 19 20 44 +55 12 21 19 20 29 +56 13 21 19 20 30 +57 13 21 19 20 44 +58 2 17 19 21 22 +59 4 17 19 21 23 +60 10 22 21 19 20 +61 11 23 21 19 20 +62 34 34 44 20 19 +63 31 45 44 20 19 +64 31 46 44 20 19 +65 35 34 44 20 29 +66 32 45 44 20 29 +67 32 46 44 20 29 +68 36 34 44 20 30 +69 33 45 44 20 30 +70 33 46 44 20 30 +71 2 19 21 23 24 +72 4 19 21 23 25 +73 5 22 21 23 24 +74 2 25 23 21 22 +75 2 21 23 25 26 +76 4 21 23 25 27 +77 5 24 23 25 26 +78 2 27 25 23 24 +79 4 23 25 27 17 +80 2 23 25 27 28 +81 2 17 27 25 26 +82 5 26 25 27 28 +83 10 32 31 33 34 +84 2 35 33 31 32 +85 11 41 31 33 34 +86 4 41 31 33 35 +87 2 39 41 31 32 +88 5 32 31 41 42 +89 4 33 31 41 39 +90 2 33 31 41 42 +91 13 31 33 34 14 +92 12 31 33 34 43 +93 13 31 33 34 44 +94 13 35 33 34 14 +95 12 35 33 34 43 +96 13 35 33 34 44 +97 2 31 33 35 36 +98 4 31 33 35 37 +99 10 36 35 33 34 +100 11 37 35 33 34 +101 36 20 44 34 14 +102 33 45 44 34 14 +103 33 46 44 34 14 +104 34 20 44 34 33 +105 31 45 44 34 33 +106 31 46 44 34 33 +107 35 20 44 34 43 +108 32 45 44 34 43 +109 32 46 44 34 43 +110 2 33 35 37 38 +111 4 33 35 37 39 +112 5 36 35 37 38 +113 2 39 37 35 36 +114 2 35 37 39 40 +115 4 35 37 39 41 +116 5 38 37 39 40 +117 2 41 39 37 38 +118 4 37 39 41 31 +119 2 37 39 41 42 +120 2 31 41 39 40 +121 5 40 39 41 42 + +Impropers + +1 1 3 1 11 2 +2 8 1 3 5 4 +3 9 3 4 13 14 +4 1 3 5 7 6 +5 1 5 7 9 8 +6 1 7 9 11 10 +7 1 1 11 9 12 +8 1 19 17 27 18 +9 5 17 19 21 20 +10 1 19 21 23 22 +11 1 21 23 25 24 +12 1 23 25 27 26 +13 1 17 27 25 28 +14 1 33 31 41 32 +15 5 31 33 35 34 +16 1 33 35 37 36 +17 1 35 37 39 38 +18 1 37 39 41 40 +19 1 31 41 39 42 +20 1 15 14 16 4 +21 1 15 14 4 34 +22 1 16 14 4 34 +23 1 15 14 16 34 +24 1 19 20 29 30 +25 1 19 20 29 
44 +26 1 19 20 30 44 +27 1 29 20 30 44 +28 1 33 34 43 14 +29 1 33 34 14 44 +30 1 43 34 14 44 +31 1 33 34 43 44 +32 1 45 44 34 20 +33 1 46 44 34 20 +34 1 45 44 46 20 +35 1 45 44 46 34 diff --git a/examples/USER/reaction/create_atoms_polystyrene/grow_styrene_pre.data_template b/examples/USER/reaction/create_atoms_polystyrene/grow_styrene_pre.data_template new file mode 100644 index 0000000000..d04fefccf5 --- /dev/null +++ b/examples/USER/reaction/create_atoms_polystyrene/grow_styrene_pre.data_template @@ -0,0 +1,294 @@ +molecule template: end of styrene chain + +30 atoms +31 bonds +51 angles +73 dihedrals +21 impropers + +Types + +1 2 +2 2 +3 6 +4 2 +5 2 +6 1 +7 2 +8 1 +9 2 +10 1 +11 2 +12 1 +13 5 +14 1 +15 2 +16 1 +17 1 +18 2 +19 1 +20 5 +21 1 +22 2 +23 1 +24 2 +25 1 +26 2 +27 1 +28 2 +29 2 +30 6 + +Coords + +1 59.89981112372972 62.733697275315585 59.09884284578856 +2 61.41970248324232 63.42116581894993 59.52874545893742 +3 60.864754970096406 62.91724243011892 59.559720865992695 +4 62.139819000186826 61.41011937002877 60.81065044071466 +5 60.036455711425084 57.160029629288026 60.31958663310848 +6 59.734195751174056 58.18706337912225 60.20562410798949 +7 57.64574781117771 57.712432799329 59.860109977091554 +8 58.37408644866664 58.50134169314242 59.94422053768215 +9 56.94300092269842 60.093170109004795 59.5955638127831 +10 57.974275786582744 59.85577775892068 59.793714995577716 +11 58.63231375134033 61.922969938852454 59.79065033121885 +12 58.934573711591355 60.89593618901822 59.904612856337835 +13 61.30908151524225 61.68041745837013 60.28316188676589 +14 60.29468229868386 60.58165855333751 60.16601625920239 +15 61.725768540066994 58.98982945913568 60.51467315154424 +16 60.69449367618267 59.2272218092198 60.31652196874961 +17 56.90935800040509 62.609851248143706 59.150831390216375 +18 57.940632148874506 62.37245957639904 59.3489824055682 +19 56.509546622906285 63.96428799226142 59.00032568066915 +20 57.52394583946467 65.06304689729403 59.11747130823266 +21 
55.14943732039887 64.27856630628159 58.738922110361806 +22 54.84717807556275 65.30559937777636 58.62495975268562 +23 54.18913939539026 63.23840787618404 58.62802424960169 +24 53.15786524692084 63.4757995479287 58.42987323424986 +25 54.58895077288906 61.88397113206633 58.77852995914891 +26 53.86061213540014 61.09506223825291 58.69441939855832 +27 55.94906007539648 61.56969281804616 59.039933529456256 +28 56.2513193202326 60.54265974655139 59.15389588713244 +29 58.35468332440925 64.79274880895268 59.64495986218142 +30 57.07961929431883 66.29987186904283 58.394030287459465 + +Charges + +1 0.0354 +2 0.0354 +3 -0.0696 +4 0.0516 +5 0.1403 +6 -0.1734 +7 0.1288 +8 -0.1134 +9 0.1403 +10 -0.1734 +11 0.1237 +12 -0.129 +13 -0.0182 +14 0.0266 +15 0.1237 +16 -0.129 +17 -0.129 +18 0.1237 +19 0.0266 +20 -0.0182 +21 -0.129 +22 0.1237 +23 -0.1734 +24 0.1403 +25 -0.1134 +26 0.1288 +27 -0.1734 +28 0.1403 +29 0.0516 +30 -0.0696 + +Bonds + +1 10 1 3 +2 10 2 3 +3 8 4 13 +4 1 6 5 +5 1 8 7 +6 2 8 6 +7 1 10 9 +8 2 10 8 +9 1 12 11 +10 2 12 10 +11 9 13 3 +12 7 14 13 +13 2 14 12 +14 1 16 15 +15 2 16 14 +16 2 16 6 +17 1 17 18 +18 2 17 19 +19 2 17 27 +20 7 19 20 +21 2 19 21 +22 9 20 30 +23 9 20 3 +24 1 21 22 +25 2 21 23 +26 1 23 24 +27 2 23 25 +28 1 25 26 +29 2 25 27 +30 1 27 28 +31 8 29 20 + +Angles + +1 16 20 3 13 +2 14 2 3 20 +3 14 1 3 20 +4 14 2 3 13 +5 14 1 3 13 +6 15 2 3 1 +7 2 16 6 8 +8 1 16 6 5 +9 1 8 6 5 +10 1 10 8 7 +11 2 10 8 6 +12 1 6 8 7 +13 1 12 10 9 +14 2 12 10 8 +15 1 8 10 9 +16 1 14 12 11 +17 2 14 12 10 +18 1 10 12 11 +19 10 14 13 4 +20 11 14 13 3 +21 12 4 13 3 +22 9 16 14 13 +23 2 16 14 12 +24 9 12 14 13 +25 1 14 16 15 +26 1 6 16 15 +27 2 14 16 6 +28 1 19 17 18 +29 1 27 17 18 +30 2 19 17 27 +31 9 17 19 20 +32 2 17 19 21 +33 9 21 19 20 +34 10 19 20 29 +35 11 19 20 30 +36 11 19 20 3 +37 12 29 20 30 +38 12 29 20 3 +39 13 30 20 3 +40 1 19 21 22 +41 2 19 21 23 +42 1 23 21 22 +43 1 21 23 24 +44 2 21 23 25 +45 1 25 23 24 +46 1 23 25 26 +47 2 23 25 27 +48 1 27 25 26 +49 2 17 27 25 +50 
1 17 27 28 +51 1 25 27 28 + +Dihedrals + +1 2 8 6 16 15 +2 2 16 6 8 7 +3 2 6 8 10 9 +4 4 10 8 6 16 +5 2 10 8 6 5 +6 5 7 8 6 5 +7 2 8 10 12 11 +8 2 12 10 8 7 +9 4 12 10 8 6 +10 5 9 10 8 7 +11 10 11 12 14 13 +12 11 10 12 14 13 +13 2 14 12 10 9 +14 4 14 12 10 8 +15 5 11 12 10 9 +16 17 14 13 3 20 +17 14 14 13 3 2 +18 14 14 13 3 1 +19 18 4 13 3 20 +20 15 4 13 3 2 +21 15 4 13 3 1 +22 2 12 14 16 15 +23 12 16 14 13 4 +24 13 16 14 13 3 +25 12 12 14 13 4 +26 13 12 14 13 3 +27 2 16 14 12 11 +28 4 16 14 12 10 +29 10 15 16 14 13 +30 11 6 16 14 13 +31 4 6 16 14 12 +32 5 15 16 6 5 +33 4 14 16 6 8 +34 2 14 16 6 5 +35 10 18 17 19 20 +36 11 27 17 19 20 +37 4 27 17 19 21 +38 5 18 17 27 28 +39 4 19 17 27 25 +40 2 19 17 27 28 +41 2 21 19 17 18 +42 12 17 19 20 29 +43 13 17 19 20 30 +44 13 17 19 20 3 +45 12 21 19 20 29 +46 13 21 19 20 30 +47 13 21 19 20 3 +48 2 17 19 21 22 +49 4 17 19 21 23 +50 17 19 20 3 13 +51 14 19 20 3 2 +52 14 19 20 3 1 +53 18 29 20 3 13 +54 15 29 20 3 2 +55 15 29 20 3 1 +56 19 30 20 3 13 +57 16 30 20 3 2 +58 16 30 20 3 1 +59 10 22 21 19 20 +60 11 23 21 19 20 +61 2 19 21 23 24 +62 4 19 21 23 25 +63 5 22 21 23 24 +64 2 25 23 21 22 +65 2 21 23 25 26 +66 4 21 23 25 27 +67 5 24 23 25 26 +68 2 27 25 23 24 +69 4 23 25 27 17 +70 2 23 25 27 28 +71 5 26 25 27 28 +72 2 25 27 17 18 +73 2 17 27 25 26 + +Impropers + +1 1 2 3 13 20 +2 1 1 3 13 20 +3 1 2 3 1 20 +4 1 2 3 1 13 +5 1 16 6 8 5 +6 1 10 8 6 7 +7 1 12 10 8 9 +8 1 14 12 10 11 +9 7 14 13 4 3 +10 5 16 14 12 13 +11 1 14 16 6 15 +12 1 19 17 27 18 +13 5 17 19 21 20 +14 1 19 20 29 30 +15 1 19 20 29 3 +16 1 19 20 30 3 +17 1 29 20 30 3 +18 1 19 21 23 22 +19 1 21 23 25 24 +20 1 23 25 27 26 +21 1 17 27 25 28 diff --git a/examples/USER/reaction/create_atoms_polystyrene/in.grow_styrene b/examples/USER/reaction/create_atoms_polystyrene/in.grow_styrene new file mode 100755 index 0000000000..8950cec614 --- /dev/null +++ b/examples/USER/reaction/create_atoms_polystyrene/in.grow_styrene @@ -0,0 +1,48 @@ +# use bond/react 'create atoms' 
feature to add 30 new styrene monomers to chain + +units real + +boundary p p p + +atom_style full + +kspace_style pppm 1.0e-4 + +pair_style lj/class2/coul/long 8.5 + +angle_style class2 + +bond_style class2 + +dihedral_style class2 + +improper_style class2 + +variable T equal 530 + +read_data trimer.data & + extra/bond/per/atom 5 & + extra/angle/per/atom 15 & + extra/dihedral/per/atom 15 & + extra/improper/per/atom 25 & + extra/special/per/atom 25 + +molecule mol1 grow_styrene_pre.data_template +molecule mol2 grow_styrene_post.data_template + +fix myrxns all bond/react stabilization yes statted_grp .03 & + react rxn1 all 1 0 3.0 mol1 mol2 grow_styrene.map & + modify_create fit create_fit overlap 2.0 & + stabilize_steps 100 max_rxn 30 + +fix 1 statted_grp_REACT nvt temp $T $T 100 + +fix 4 bond_react_MASTER_group temp/rescale 1 $T $T 1 1 + +thermo_style custom step temp press density f_myrxns[1] + +thermo 100 + +run 8000 + +# write_data final.data nofix diff --git a/examples/USER/reaction/create_atoms_polystyrene/log.24Dec20.grow_styrene.g++.1 b/examples/USER/reaction/create_atoms_polystyrene/log.24Dec20.grow_styrene.g++.1 new file mode 100644 index 0000000000..5f1f2c6698 --- /dev/null +++ b/examples/USER/reaction/create_atoms_polystyrene/log.24Dec20.grow_styrene.g++.1 @@ -0,0 +1,196 @@ +LAMMPS (24 Dec 2020) +Reading data file ... + orthogonal box = (50.000000 50.000000 50.000000) to (250.00000 250.00000 250.00000) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 48 atoms + reading velocities ... + 48 velocities + scanning bonds ... + 8 = max bonds/atom + scanning angles ... + 21 = max angles/atom + scanning dihedrals ... + 33 = max dihedrals/atom + scanning impropers ... + 29 = max impropers/atom + reading bonds ... + 50 bonds + reading angles ... + 84 angles + reading dihedrals ... + 127 dihedrals + reading impropers ... + 36 impropers +Finding 1-2 1-3 1-4 neighbors ... 
+ special bond factors lj: 0 0 0 + special bond factors coul: 0 0 0 + 4 = max # of 1-2 neighbors + 8 = max # of 1-3 neighbors + 17 = max # of 1-4 neighbors + 46 = max # of special neighbors + special bonds CPU = 0.000 seconds + read_data CPU = 0.077 seconds +Read molecule template mol1: + 1 molecules + 30 atoms with max type 6 + 31 bonds with max type 10 + 51 angles with max type 16 + 73 dihedrals with max type 19 + 21 impropers with max type 7 +Read molecule template mol2: + 1 molecules + 46 atoms with max type 6 + 48 bonds with max type 13 + 81 angles with max type 22 + 121 dihedrals with max type 36 + 35 impropers with max type 9 +dynamic group bond_react_MASTER_group defined +dynamic group statted_grp_REACT defined +PPPM initialization ... +WARNING: System is not charge neutral, net charge = -0.00060000000 (../kspace.cpp:324) + using 12-bit tables for long-range coulomb (../kspace.cpp:339) + G vector (1/distance) = 0.20144813 + grid = 45 45 45 + stencil order = 5 + estimated absolute RMS force accuracy = 0.00053712952 + estimated relative force accuracy = 1.6175496e-06 + using double precision KISS FFT + 3d grid and FFT values/proc = 125000 91125 +Neighbor list info ... + update every 1 steps, delay 10 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 10.5 + ghost atom cutoff = 10.5 + binsize = 5.25, bins = 39 39 39 + 2 neighbor lists, perpetual/occasional/extra = 1 1 0 + (1) pair lj/class2/coul/long, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/3d/newton + bin: standard + (2) fix bond/react, occasional, copy from (1) + attributes: half, newton on + pair build: copy + stencil: none + bin: none +Setting up Verlet run ... 
+ Unit style : real + Current step : 0 + Time step : 1 +Per MPI rank memory allocation (min/avg/max) = 48.02 | 48.02 | 48.02 Mbytes +Step Temp Press Density f_myrxns[1] + 0 496.23742 0.9983211 6.4856516e-05 0 + 100 534.05394 -0.76952227 6.4856516e-05 0 + 200 552.2225 -0.55375493 6.4856516e-05 0 + 300 857.52834 -0.4272061 8.6475354e-05 1 + 400 714.10681 1.5004615 8.6475354e-05 1 + 500 678.19171 0.21965471 8.6475354e-05 1 + 600 572.3234 0.87879933 8.6475354e-05 1 + 700 996.17398 -0.24269717 0.00010809419 2 + 800 904.50395 1.3662054 0.00010809419 2 + 900 1097.1568 -2.2909907 0.00012971303 3 + 1000 954.08892 1.7705672 0.00012971303 3 + 1100 1102.0377 -1.7018446 0.00015133187 4 + 1200 1239.785 -0.30442903 0.00015133187 4 + 1300 1388.4127 1.3301175 0.00017295071 5 + 1400 1559.3853 1.6709729 0.00017295071 5 + 1500 1471.8623 0.8268427 0.00017295071 5 + 1600 1543.6793 2.1987908 0.00019456955 6 + 1700 1694.5595 0.48852817 0.00019456955 6 + 1800 1632.7737 -1.4617692 0.00021618839 7 + 1900 1922.6502 1.1664257 0.00021618839 7 + 2000 2223.503 -0.95799878 0.00023780722 8 + 2100 2142.6035 0.88444463 0.00025942606 9 + 2200 2298.8636 3.4239313 0.00025942606 9 + 2300 2252.4355 0.82167302 0.00025942606 9 + 2400 2321.0788 1.7499714 0.00025942606 9 + 2500 2095.6715 0.55288444 0.00025942606 9 + 2600 2136.0316 -3.833114 0.00025942606 9 + 2700 2466.3134 -2.2519511 0.00025942606 9 + 2800 2294.3454 1.0637304 0.00025942606 9 + 2900 2340.3891 1.3997049 0.0002810449 10 + 3000 2272.0013 -0.27591886 0.0002810449 10 + 3100 2333.9696 -0.11772138 0.0002810449 10 + 3200 2409.0946 -1.025473 0.0002810449 10 + 3300 2148.023 1.6752329 0.0002810449 10 + 3400 2267.636 -0.45297583 0.0002810449 10 + 3500 2457.622 0.35627297 0.0002810449 10 + 3600 2288.008 -15.516626 0.00030266374 11 + 3700 2458.2681 1.4571773 0.00030266374 11 + 3800 2566.7623 -29.140553 0.00032428258 12 + 3900 2839.4062 0.64583638 0.00032428258 12 + 4000 2893.9852 -52.954497 0.00034590142 13 + 4100 3021.3611 -65.03731 0.00036752025 14 + 4200 
3002.7136 1.5750081 0.00036752025 14 + 4300 3218.6248 -120.74039 0.00038913909 15 + 4400 3345.1482 -0.96545269 0.00038913909 15 + 4500 3603.2429 1.2438833 0.00038913909 15 + 4600 3129.8814 -249.91806 0.00041075793 16 + 4700 3769.052 -289.24351 0.00043237677 17 + 4800 3560.4714 -3.1655406 0.00043237677 17 + 4900 3452.2717 -2.1270765 0.00043237677 17 + 5000 3594.3247 -523.48506 0.00045399561 18 + 5100 3578.4199 1.0009097 0.00045399561 18 + 5200 3822.1566 1.0526914 0.00047561445 19 + 5300 3901.8883 -0.14607602 0.00047561445 19 + 5400 4059.3644 -1.7789927 0.00049723329 20 + 5500 4163.6847 1.0240127 0.00049723329 20 + 5600 4109.1649 0.80199787 0.00049723329 20 + 5700 4391.2091 2.8730036 0.00049723329 20 + 5800 4279.6579 -0.36499822 0.00051885212 21 + 5900 4296.2695 -1.3064528 0.00051885212 21 + 6000 4065.3758 -2.0483224 0.00051885212 21 + 6100 4772.5362 -2.6814694 0.00054047096 22 + 6200 4627.029 2.999215 0.0005620898 23 + 6300 5120.7881 0.65372968 0.00058370864 24 + 6400 4588.9559 3.7570705 0.00058370864 24 + 6500 5008.7814 2.3595833 0.00060532748 25 + 6600 5195.0053 1.4641612 0.00060532748 25 + 6700 5622.293 -0.33396047 0.00062694632 26 + 6800 5515.1957 -4.234874 0.00062694632 26 + 6900 5156.7455 0.40171954 0.00064856516 27 + 7000 5120.1639 -1.6065245 0.00064856516 27 + 7100 5650.0327 0.94436323 0.00067018399 28 + 7200 5985.1115 -3.8940347 0.00069180283 29 + 7300 5983.197 0.5293568 0.00069180283 29 + 7400 6001.1559 -0.13712834 0.00071342167 30 + 7500 5889.2134 0.17230892 0.00071342167 30 + 7600 5797.31 2.0920058 0.00071342167 30 + 7700 5865.2783 -0.18556395 0.00071342167 30 + 7800 6207.0659 -5.6237083 0.00071342167 30 + 7900 5627.5108 -2.3718942 0.00071342167 30 + 8000 5823.9502 -0.85418578 0.00071342167 30 +Loop time of 184.87 on 1 procs for 8000 steps with 528 atoms + +Performance: 3.739 ns/day, 6.419 hours/ns, 43.274 timesteps/s +99.9% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| 
%total +--------------------------------------------------------------- +Pair | 3.3043 | 3.3043 | 3.3043 | 0.0 | 1.79 +Bond | 8.0003 | 8.0003 | 8.0003 | 0.0 | 4.33 +Kspace | 168.33 | 168.33 | 168.33 | 0.0 | 91.05 +Neigh | 4.6322 | 4.6322 | 4.6322 | 0.0 | 2.51 +Comm | 0.077927 | 0.077927 | 0.077927 | 0.0 | 0.04 +Output | 0.0020548 | 0.0020548 | 0.0020548 | 0.0 | 0.00 +Modify | 0.5005 | 0.5005 | 0.5005 | 0.0 | 0.27 +Other | | 0.02483 | | | 0.01 + +Nlocal: 528.000 ave 528 max 528 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 341.000 ave 341 max 341 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 35111.0 ave 35111 max 35111 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 35111 +Ave neighs/atom = 66.498106 +Ave special neighs/atom = 11.409091 +Neighbor list builds = 8000 +Dangerous builds = 0 + +Please see the log.cite file for references relevant to this simulation + +Total wall time: 0:03:05 diff --git a/examples/USER/reaction/create_atoms_polystyrene/log.24Dec20.grow_styrene.g++.4 b/examples/USER/reaction/create_atoms_polystyrene/log.24Dec20.grow_styrene.g++.4 new file mode 100644 index 0000000000..8daa6d8161 --- /dev/null +++ b/examples/USER/reaction/create_atoms_polystyrene/log.24Dec20.grow_styrene.g++.4 @@ -0,0 +1,196 @@ +LAMMPS (24 Dec 2020) +Reading data file ... + orthogonal box = (50.000000 50.000000 50.000000) to (250.00000 250.00000 250.00000) + 1 by 2 by 2 MPI processor grid + reading atoms ... + 48 atoms + reading velocities ... + 48 velocities + scanning bonds ... + 8 = max bonds/atom + scanning angles ... + 21 = max angles/atom + scanning dihedrals ... + 33 = max dihedrals/atom + scanning impropers ... + 29 = max impropers/atom + reading bonds ... + 50 bonds + reading angles ... + 84 angles + reading dihedrals ... + 127 dihedrals + reading impropers ... + 36 impropers +Finding 1-2 1-3 1-4 neighbors ... 
+ special bond factors lj: 0 0 0 + special bond factors coul: 0 0 0 + 4 = max # of 1-2 neighbors + 8 = max # of 1-3 neighbors + 17 = max # of 1-4 neighbors + 46 = max # of special neighbors + special bonds CPU = 0.000 seconds + read_data CPU = 0.007 seconds +Read molecule template mol1: + 1 molecules + 30 atoms with max type 6 + 31 bonds with max type 10 + 51 angles with max type 16 + 73 dihedrals with max type 19 + 21 impropers with max type 7 +Read molecule template mol2: + 1 molecules + 46 atoms with max type 6 + 48 bonds with max type 13 + 81 angles with max type 22 + 121 dihedrals with max type 36 + 35 impropers with max type 9 +dynamic group bond_react_MASTER_group defined +dynamic group statted_grp_REACT defined +PPPM initialization ... +WARNING: System is not charge neutral, net charge = -0.00060000000 (../kspace.cpp:324) + using 12-bit tables for long-range coulomb (../kspace.cpp:339) + G vector (1/distance) = 0.20144813 + grid = 45 45 45 + stencil order = 5 + estimated absolute RMS force accuracy = 0.00053712952 + estimated relative force accuracy = 1.6175496e-06 + using double precision KISS FFT + 3d grid and FFT values/proc = 39200 24300 +Neighbor list info ... + update every 1 steps, delay 10 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 10.5 + ghost atom cutoff = 10.5 + binsize = 5.25, bins = 39 39 39 + 2 neighbor lists, perpetual/occasional/extra = 1 1 0 + (1) pair lj/class2/coul/long, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/3d/newton + bin: standard + (2) fix bond/react, occasional, copy from (1) + attributes: half, newton on + pair build: copy + stencil: none + bin: none +Setting up Verlet run ... 
+ Unit style : real + Current step : 0 + Time step : 1 +Per MPI rank memory allocation (min/avg/max) = 38.70 | 38.92 | 39.43 Mbytes +Step Temp Press Density f_myrxns[1] + 0 496.23742 0.9983211 6.4856516e-05 0 + 100 534.05394 -0.76952227 6.4856516e-05 0 + 200 552.2225 -0.55375493 6.4856516e-05 0 + 300 857.52834 -0.4272061 8.6475354e-05 1 + 400 714.10681 1.5004615 8.6475354e-05 1 + 500 678.19171 0.21965471 8.6475354e-05 1 + 600 572.3234 0.87879933 8.6475354e-05 1 + 700 996.17398 -0.24269717 0.00010809419 2 + 800 904.50395 1.3662054 0.00010809419 2 + 900 1097.1568 -2.2909907 0.00012971303 3 + 1000 954.08892 1.7705672 0.00012971303 3 + 1100 1102.0377 -1.7018446 0.00015133187 4 + 1200 1239.785 -0.30442903 0.00015133187 4 + 1300 1388.4127 1.3301175 0.00017295071 5 + 1400 1559.3853 1.6709729 0.00017295071 5 + 1500 1471.8623 0.8268427 0.00017295071 5 + 1600 1543.6793 2.1987908 0.00019456955 6 + 1700 1694.5595 0.48852817 0.00019456955 6 + 1800 1632.7737 -1.4617692 0.00021618839 7 + 1900 1922.6502 1.1664257 0.00021618839 7 + 2000 2223.503 -0.95799878 0.00023780722 8 + 2100 2142.6035 0.88444463 0.00025942606 9 + 2200 2298.8636 3.4239313 0.00025942606 9 + 2300 2252.4355 0.82167302 0.00025942606 9 + 2400 2321.0788 1.7499714 0.00025942606 9 + 2500 2095.6715 0.55288444 0.00025942606 9 + 2600 2136.0316 -3.833114 0.00025942606 9 + 2700 2466.3134 -2.2519511 0.00025942606 9 + 2800 2294.3454 1.0637304 0.00025942606 9 + 2900 2340.3891 1.3997049 0.0002810449 10 + 3000 2272.0013 -0.27591886 0.0002810449 10 + 3100 2333.9696 -0.11772138 0.0002810449 10 + 3200 2409.0946 -1.025473 0.0002810449 10 + 3300 2148.023 1.6752329 0.0002810449 10 + 3400 2267.636 -0.45297583 0.0002810449 10 + 3500 2457.622 0.35627297 0.0002810449 10 + 3600 2288.008 -15.516626 0.00030266374 11 + 3700 2458.2681 1.4571773 0.00030266374 11 + 3800 2566.7623 -29.140553 0.00032428258 12 + 3900 2839.4062 0.64583638 0.00032428258 12 + 4000 2893.2204 -53.187892 0.00034590142 13 + 4100 3024.6375 -65.068146 0.00036752025 14 + 
4200 3004.6784 1.4155214 0.00036752025 14 + 4300 3033.1895 1.8572273 0.00036752025 14 + 4400 3157.2542 -0.92462977 0.00036752025 14 + 4500 3557.7137 -194.46498 0.00038913909 15 + 4600 3096.485 -1.830492 0.00038913909 15 + 4700 3488.088 -286.81055 0.00041075793 16 + 4800 3390.5493 -372.77818 0.00043237677 17 + 4900 3773.7226 -446.58574 0.00045399561 18 + 5000 3703.0159 -0.81188551 0.00045399561 18 + 5100 4051.3067 1.2567439 0.00045399561 18 + 5200 3813.3682 0.92945737 0.00047561445 19 + 5300 4036.0078 -2.5336258 0.00049723329 20 + 5400 4219.803 -0.96928261 0.00049723329 20 + 5500 4433.7447 -0.026762463 0.00051885212 21 + 5600 4477.4505 -1.417316 0.00054047096 22 + 5700 4500.0306 -1.0551443 0.00054047096 22 + 5800 4600.3507 -4.9580056 0.00054047096 22 + 5900 4765.4978 -2.2546941 0.0005620898 23 + 6000 5442.6193 0.91161284 0.00058370864 24 + 6100 5086.8047 -0.9875332 0.00060532748 25 + 6200 5485.3437 -2.8296626 0.00062694632 26 + 6300 4988.0396 -0.15179023 0.00064856516 27 + 6400 5597.3703 4.2941885 0.00067018399 28 + 6500 5677.0263 -2.8611595 0.00069180283 29 + 6600 6058.0009 1.4111778 0.00071342167 30 + 6700 5859.0817 -2.5782466 0.00071342167 30 + 6800 5879.3941 -4.5681807 0.00071342167 30 + 6900 6398.288 2.5259412 0.00071342167 30 + 7000 6250.1096 -2.6049627 0.00071342167 30 + 7100 5849.651 -0.44062578 0.00071342167 30 + 7200 5778.6532 -0.27299118 0.00071342167 30 + 7300 5977.6661 4.2483639 0.00071342167 30 + 7400 5862.4231 1.0289519 0.00071342167 30 + 7500 6482.376 7.5412373 0.00071342167 30 + 7600 5810.4325 1.0343075 0.00071342167 30 + 7700 5916.7304 2.304302 0.00071342167 30 + 7800 5869.9504 -0.5946555 0.00071342167 30 + 7900 5804.0522 -4.1207689 0.00071342167 30 + 8000 6077.1704 0.52211243 0.00071342167 30 +Loop time of 60.5603 on 4 procs for 8000 steps with 528 atoms + +Performance: 11.413 ns/day, 2.103 hours/ns, 132.100 timesteps/s +99.9% CPU use with 4 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time 
|%varavg| %total +--------------------------------------------------------------- +Pair | 0.0041695 | 0.90113 | 2.3423 | 102.8 | 1.49 +Bond | 0.011606 | 2.1188 | 5.8107 | 163.9 | 3.50 +Kspace | 47.987 | 52.817 | 55.679 | 43.7 | 87.21 +Neigh | 3.5961 | 3.6262 | 3.6496 | 1.2 | 5.99 +Comm | 0.11097 | 0.16569 | 0.26369 | 15.3 | 0.27 +Output | 0.0020366 | 0.0023427 | 0.0032469 | 1.1 | 0.00 +Modify | 0.62302 | 0.91659 | 1.1227 | 21.5 | 1.51 +Other | | 0.0126 | | | 0.02 + +Nlocal: 132.000 ave 295 max 0 min +Histogram: 2 0 0 0 0 0 0 1 0 1 +Nghost: 133.000 ave 349 max 0 min +Histogram: 2 0 0 0 0 1 0 0 0 1 +Neighs: 8383.50 ave 20143 max 0 min +Histogram: 2 0 0 0 0 0 1 0 0 1 + +Total # of neighbors = 33534 +Ave neighs/atom = 63.511364 +Ave special neighs/atom = 11.409091 +Neighbor list builds = 8000 +Dangerous builds = 0 + +Please see the log.cite file for references relevant to this simulation + +Total wall time: 0:01:00 diff --git a/examples/USER/reaction/create_atoms_polystyrene/trimer.data b/examples/USER/reaction/create_atoms_polystyrene/trimer.data new file mode 100644 index 0000000000..b3ad132f03 --- /dev/null +++ b/examples/USER/reaction/create_atoms_polystyrene/trimer.data @@ -0,0 +1,796 @@ +polystyrene trimer + +48 atoms +7 atom types +50 bonds +13 bond types +84 angles +22 angle types +127 dihedrals +36 dihedral types +36 impropers +9 improper types + +50 250 xlo xhi +50 250 ylo yhi +50 250 zlo zhi + +Masses + +1 12.0112 +2 1.00797 +3 12.0112 +4 12.0112 +5 12.0112 +6 12.0112 +7 12.0112 + +Pair Coeffs # lj/class2/coul/long + +1 0.064 4.01 +2 0.02 2.7 +3 0.064 4.01 +4 0.064 3.9 +5 0.054 4.01 +6 0.054 4.01 +7 0.054 4.01 + +Bond Coeffs # class2 + +1 1.0982 372.825 -803.453 894.317 +2 1.417 470.836 -627.618 1327.63 +3 1.501 321.902 -521.821 572.163 +4 1.0883 365.768 -725.54 781.662 +5 1.34 543.99 -1238.2 1644.03 +6 1.0883 365.768 -725.54 781.662 +7 1.501 321.902 -521.821 572.163 +8 1.101 345 -691.89 844.6 +9 1.53 299.67 -501.77 679.81 +10 1.101 345 -691.89 844.6 +11 
1.501 321.902 -521.821 572.163 +12 1.101 345 -691.89 844.6 +13 1.53 299.67 -501.77 679.81 + +Angle Coeffs # class2 + +1 117.94 35.1558 -12.4682 0 +2 118.9 61.0226 -34.9931 0 +3 120.05 44.7148 -22.7352 0 +4 111 44.3234 -9.4454 0 +5 108.4 43.9594 -8.3924 -9.3379 +6 124.88 35.2766 -17.774 -1.6215 +7 124.88 35.2766 -17.774 -1.6215 +8 115.49 29.6363 -12.4853 -6.2218 +9 120.05 44.7148 -22.7352 0 +10 111 44.3234 -9.4454 0 +11 108.4 43.9594 -8.3924 -9.3379 +12 110.77 41.453 -10.604 5.129 +13 112.67 39.516 -7.443 -9.5583 +14 110.77 41.453 -10.604 5.129 +15 107.66 39.641 -12.921 -2.4318 +16 112.67 39.516 -7.443 -9.5583 +17 120.05 44.7148 -22.7352 0 +18 111 44.3234 -9.4454 0 +19 108.4 43.9594 -8.3924 -9.3379 +20 110.77 41.453 -10.604 5.129 +21 110.77 41.453 -10.604 5.129 +22 112.67 39.516 -7.443 -9.5583 + +BondBond Coeffs + +1 1.0795 1.417 1.0982 +2 68.2856 1.417 1.417 +3 12.0676 1.417 1.501 +4 2.9168 1.501 1.0883 +5 0 1.501 1.34 +6 10.1047 1.0883 1.34 +7 10.1047 1.0883 1.34 +8 4.8506 1.0883 1.0883 +9 12.0676 1.417 1.501 +10 2.9168 1.501 1.101 +11 0 1.501 1.53 +12 3.3872 1.101 1.53 +13 0 1.53 1.53 +14 3.3872 1.101 1.53 +15 5.3316 1.101 1.101 +16 0 1.53 1.53 +17 12.0676 1.417 1.501 +18 2.9168 1.501 1.101 +19 0 1.501 1.53 +20 3.3872 1.101 1.53 +21 3.3872 1.101 1.53 +22 0 1.53 1.53 + +BondAngle Coeffs + +1 20.0033 24.2183 1.417 1.0982 +2 28.8708 28.8708 1.417 1.417 +3 31.0771 47.0579 1.417 1.501 +4 26.4608 11.7717 1.501 1.0883 +5 0 0 1.501 1.34 +6 19.0592 23.3588 1.0883 1.34 +7 19.0592 23.3588 1.0883 1.34 +8 17.9795 17.9795 1.0883 1.0883 +9 31.0771 47.0579 1.417 1.501 +10 26.4608 11.7717 1.501 1.101 +11 0 0 1.501 1.53 +12 11.421 20.754 1.101 1.53 +13 8.016 8.016 1.53 1.53 +14 11.421 20.754 1.101 1.53 +15 18.103 18.103 1.101 1.101 +16 8.016 8.016 1.53 1.53 +17 31.0771 47.0579 1.417 1.501 +18 26.4608 11.7717 1.501 1.101 +19 0 0 1.501 1.53 +20 11.421 20.754 1.101 1.53 +21 11.421 20.754 1.101 1.53 +22 8.016 8.016 1.53 1.53 + +Dihedral Coeffs # class2 + +1 0 0 1.559 0 0 0 +2 0 0 
3.9661 0 0 0 +3 0 0 4.4072 0 0 0 +4 8.3667 0 1.1932 0 0 0 +5 0 0 1.8769 0 0 0 +6 0 0 0 0 0 0 +7 0 0 0 0 0 0 +8 0 0 0 0 0 0 +9 0 0 4.8974 0 0 0 +10 0 0 1.559 0 0 0 +11 0 0 4.4072 0 0 0 +12 -0.2801 0 -0.0678 0 -0.0122 0 +13 -0.2802 0 -0.0678 0 -0.0122 0 +14 -0.0228 0 0.028 0 -0.1863 0 +15 -0.1432 0 0.0617 0 -0.1083 0 +16 0 0 0.0316 0 -0.1681 0 +17 0 0 0 0 0 0 +18 0 0 0.0316 0 -0.1681 0 +19 0 0 0.0514 0 -0.143 0 +20 0 0 1.559 0 0 0 +21 0 0 4.4072 0 0 0 +22 -0.2801 0 -0.0678 0 -0.0122 0 +23 -0.2802 0 -0.0678 0 -0.0122 0 +24 -0.0228 0 0.028 0 -0.1863 0 +25 0 0 0 0 0 0 +26 -0.1432 0 0.0617 0 -0.1083 0 +27 0 0 0.0316 0 -0.1681 0 +28 0 0 0 0 0 0 +29 0 0 0.0316 0 -0.1681 0 +30 0 0 0.0514 0 -0.143 0 +31 -0.0228 0 0.028 0 -0.1863 0 +32 -0.1432 0 0.0617 0 -0.1083 0 +33 0 0 0.0316 0 -0.1681 0 +34 0 0 0 0 0 0 +35 0 0 0.0316 0 -0.1681 0 +36 0 0 0.0514 0 -0.143 0 + +AngleAngleTorsion Coeffs + +1 4.4444 117.94 120.05 +2 -4.8141 118.9 117.94 +3 -14.4097 118.9 120.05 +4 0 118.9 118.9 +5 0.3598 117.94 117.94 +6 0 120.05 111 +7 0 120.05 108.4 +8 0 108.4 124.88 +9 -7.0058 124.88 124.88 +10 4.4444 117.94 120.05 +11 -14.4097 118.9 120.05 +12 -5.8888 120.05 111 +13 0 120.05 108.4 +14 0 108.4 110.77 +15 -12.564 110.77 110.77 +16 -16.164 112.67 110.77 +17 0 108.4 112.67 +18 -16.164 110.77 112.67 +19 -22.045 112.67 112.67 +20 4.4444 117.94 120.05 +21 -14.4097 118.9 120.05 +22 -5.8888 120.05 111 +23 0 120.05 108.4 +24 0 108.4 110.77 +25 0 108.4 112.67 +26 -12.564 110.77 110.77 +27 -16.164 110.77 112.67 +28 0 112.67 108.4 +29 -16.164 112.67 110.77 +30 -22.045 112.67 112.67 +31 0 110.77 108.4 +32 -12.564 110.77 110.77 +33 -16.164 110.77 112.67 +34 0 112.67 108.4 +35 -16.164 112.67 110.77 +36 -22.045 112.67 112.67 + +EndBondTorsion Coeffs + +1 0 -0.4879 0 0 -1.797 0 1.0982 1.501 +2 0 -6.8958 0 0 -0.4669 0 1.417 1.0982 +3 0 -0.6918 0 0 0.2421 0 1.417 1.501 +4 -0.1185 6.3204 0 -0.1185 6.3204 0 1.417 1.417 +5 0 -0.689 0 0 -0.689 0 1.0982 1.0982 +6 0 0 0 0 0 0 1.417 1.0883 +7 0 0 0 0 0 0 1.417 1.34 
+8 0 0 0 0 0 0 1.501 1.0883 +9 0.7129 0.5161 0 0.7129 0.5161 0 1.0883 1.0883 +10 0 -0.4879 0 0 -1.797 0 1.0982 1.501 +11 0 -0.6918 0 0 0.2421 0 1.417 1.501 +12 -0.5835 1.122 0.3978 1.3997 0.7756 0 1.417 1.101 +13 0 0 0 0 0 0 1.417 1.53 +14 0 0 0 0 0 0 1.501 1.101 +15 0.213 0.312 0.0777 0.213 0.312 0.0777 1.101 1.101 +16 0.2486 0.2422 -0.0925 0.0814 0.0591 0.2219 1.53 1.101 +17 0 0 0 0 0 0 1.501 1.53 +18 0.0814 0.0591 0.2219 0.2486 0.2422 -0.0925 1.101 1.53 +19 -0.0732 0 0 -0.0732 0 0 1.53 1.53 +20 0 -0.4879 0 0 -1.797 0 1.0982 1.501 +21 0 -0.6918 0 0 0.2421 0 1.417 1.501 +22 -0.5835 1.122 0.3978 1.3997 0.7756 0 1.417 1.101 +23 0 0 0 0 0 0 1.417 1.53 +24 0 0 0 0 0 0 1.501 1.101 +25 0 0 0 0 0 0 1.501 1.53 +26 0.213 0.312 0.0777 0.213 0.312 0.0777 1.101 1.101 +27 0.0814 0.0591 0.2219 0.2486 0.2422 -0.0925 1.101 1.53 +28 0 0 0 0 0 0 1.53 1.501 +29 0.2486 0.2422 -0.0925 0.0814 0.0591 0.2219 1.53 1.101 +30 -0.0732 0 0 -0.0732 0 0 1.53 1.53 +31 0 0 0 0 0 0 1.101 1.501 +32 0.213 0.312 0.0777 0.213 0.312 0.0777 1.101 1.101 +33 0.0814 0.0591 0.2219 0.2486 0.2422 -0.0925 1.101 1.53 +34 0 0 0 0 0 0 1.53 1.501 +35 0.2486 0.2422 -0.0925 0.0814 0.0591 0.2219 1.53 1.101 +36 -0.0732 0 0 -0.0732 0 0 1.53 1.53 + +MiddleBondTorsion Coeffs + +1 0 3.9421 0 1.417 +2 0 -1.1521 0 1.417 +3 0 9.1792 0 1.417 +4 27.5989 -2.312 0 1.417 +5 0 4.8228 0 1.417 +6 0 0 0 1.501 +7 0 0 0 1.501 +8 0 0 0 1.34 +9 0.8558 6.3911 0 1.34 +10 0 3.9421 0 1.417 +11 0 9.1792 0 1.417 +12 -5.5679 1.4083 0.301 1.501 +13 0 0 0 1.501 +14 0 0 0 1.53 +15 -14.261 -0.5322 -0.4864 1.53 +16 -14.879 -3.6581 -0.3138 1.53 +17 0 0 0 1.53 +18 -14.879 -3.6581 -0.3138 1.53 +19 -17.787 -7.1877 0 1.53 +20 0 3.9421 0 1.417 +21 0 9.1792 0 1.417 +22 -5.5679 1.4083 0.301 1.501 +23 0 0 0 1.501 +24 0 0 0 1.53 +25 0 0 0 1.53 +26 -14.261 -0.5322 -0.4864 1.53 +27 -14.879 -3.6581 -0.3138 1.53 +28 0 0 0 1.53 +29 -14.879 -3.6581 -0.3138 1.53 +30 -17.787 -7.1877 0 1.53 +31 0 0 0 1.53 +32 -14.261 -0.5322 -0.4864 1.53 +33 -14.879 -3.6581 -0.3138 
1.53 +34 0 0 0 1.53 +35 -14.879 -3.6581 -0.3138 1.53 +36 -17.787 -7.1877 0 1.53 + +BondBond13 Coeffs + +1 0.8743 1.0982 1.501 +2 -6.2741 1.417 1.0982 +3 2.5085 1.417 1.501 +4 53 1.417 1.417 +5 -1.7077 1.0982 1.0982 +6 0 1.417 1.0883 +7 0 1.417 1.34 +8 0 1.501 1.0883 +9 0 1.0883 1.0883 +10 0.8743 1.0982 1.501 +11 2.5085 1.417 1.501 +12 -3.4826 1.417 1.101 +13 0 1.417 1.53 +14 0 1.501 1.101 +15 0 1.101 1.101 +16 0 1.53 1.101 +17 0 1.501 1.53 +18 0 1.101 1.53 +19 0 1.53 1.53 +20 0.8743 1.0982 1.501 +21 2.5085 1.417 1.501 +22 -3.4826 1.417 1.101 +23 0 1.417 1.53 +24 0 1.501 1.101 +25 0 1.501 1.53 +26 0 1.101 1.101 +27 0 1.101 1.53 +28 0 1.53 1.501 +29 0 1.53 1.101 +30 0 1.53 1.53 +31 0 1.101 1.501 +32 0 1.101 1.101 +33 0 1.101 1.53 +34 0 1.53 1.501 +35 0 1.53 1.101 +36 0 1.53 1.53 + +AngleTorsion Coeffs + +1 0 3.4601 0 0 -0.1242 0 117.94 120.05 +2 0 2.5014 0 0 2.7147 0 118.9 117.94 +3 0 3.8987 0 0 -4.4683 0 118.9 120.05 +4 1.9767 1.0239 0 1.9767 1.0239 0 118.9 118.9 +5 0 2.4501 0 0 2.4501 0 117.94 117.94 +6 0 0 0 0 0 0 120.05 111 +7 0 0 0 0 0 0 120.05 108.4 +8 0 0 0 0 0 0 108.4 124.88 +9 -1.8911 3.254 0 -1.8911 3.254 0 124.88 124.88 +10 0 3.4601 0 0 -0.1242 0 117.94 120.05 +11 0 3.8987 0 0 -4.4683 0 118.9 120.05 +12 0.2251 0.6548 0.1237 4.6266 0.1632 0.0461 120.05 111 +13 0 0 0 0 0 0 120.05 108.4 +14 0 0 0 0 0 0 108.4 110.77 +15 -0.8085 0.5569 -0.2466 -0.8085 0.5569 -0.2466 110.77 110.77 +16 -0.2454 0 -0.1136 0.3113 0.4516 -0.1988 112.67 110.77 +17 0 0 0 0 0 0 108.4 112.67 +18 0.3113 0.4516 -0.1988 -0.2454 0 -0.1136 110.77 112.67 +19 0.3886 -0.3139 0.1389 0.3886 -0.3139 0.1389 112.67 112.67 +20 0 3.4601 0 0 -0.1242 0 117.94 120.05 +21 0 3.8987 0 0 -4.4683 0 118.9 120.05 +22 0.2251 0.6548 0.1237 4.6266 0.1632 0.0461 120.05 111 +23 0 0 0 0 0 0 120.05 108.4 +24 0 0 0 0 0 0 108.4 110.77 +25 0 0 0 0 0 0 108.4 112.67 +26 -0.8085 0.5569 -0.2466 -0.8085 0.5569 -0.2466 110.77 110.77 +27 0.3113 0.4516 -0.1988 -0.2454 0 -0.1136 110.77 112.67 +28 0 0 0 0 0 0 112.67 108.4 +29 
-0.2454 0 -0.1136 0.3113 0.4516 -0.1988 112.67 110.77 +30 0.3886 -0.3139 0.1389 0.3886 -0.3139 0.1389 112.67 112.67 +31 0 0 0 0 0 0 110.77 108.4 +32 -0.8085 0.5569 -0.2466 -0.8085 0.5569 -0.2466 110.77 110.77 +33 0.3113 0.4516 -0.1988 -0.2454 0 -0.1136 110.77 112.67 +34 0 0 0 0 0 0 112.67 108.4 +35 -0.2454 0 -0.1136 0.3113 0.4516 -0.1988 112.67 110.77 +36 0.3886 -0.3139 0.1389 0.3886 -0.3139 0.1389 112.67 112.67 + +Improper Coeffs # class2 + +1 4.8912 0 +2 7.8153 0 +3 0 0 +4 2.8561 0 +5 7.8153 0 +6 0 0 +7 0 0 +8 7.8153 0 +9 0 0 + +AngleAngle Coeffs + +1 0 0 0 118.9 117.94 117.94 +2 0 0 0 118.9 120.05 120.05 +3 0 0 0 111 124.88 108.4 +4 0 0 0 115.49 124.88 124.88 +5 0 0 0 118.9 120.05 120.05 +6 0 0 0 107.66 110.77 110.77 +7 0 0 0 111 110.77 108.4 +8 0 0 0 118.9 120.05 120.05 +9 0 0 0 111 110.77 108.4 + +Atoms # full + +44 1 2 3.5400000000000001e-02 6.1476397222913839e+01 8.2376490601205234e+01 6.0906939115836181e+01 +45 1276 2 3.5400000000000001e-02 5.8398688202244472e+01 8.0172948526664996e+01 6.2115536813582672e+01 +46 1276 6 -6.9599999999999995e-02 5.9489073989392523e+01 8.0264057167571636e+01 6.1984002598976552e+01 +48 1276 2 3.5400000000000001e-02 5.9675170230342431e+01 8.0048052449390738e+01 6.0920159395372401e+01 +47 1276 2 1.2370000000000000e-01 5.9297455513100488e+01 8.3187777608476154e+01 5.9645157256520122e+01 +18 1 5 -1.8200000000000001e-02 6.2426251430535707e+01 8.2055473568260709e+01 6.2971661388612958e+01 +19 1 6 -6.9599999999999995e-02 6.1399255844467369e+01 8.1794665295860213e+01 6.1821819828185660e+01 +21 1 1 -1.2900000000000000e-01 6.4032918371445831e+01 8.0190179089286701e+01 6.3021564712316334e+01 +22 1 1 2.6599999999999999e-02 6.3672975135915053e+01 8.1418558650051665e+01 6.2448012627881994e+01 +23 1 2 3.5400000000000001e-02 6.1545198223694939e+01 8.0836309422842305e+01 6.1349823957467130e+01 +27 1276 2 5.1600000000000000e-02 5.9809503696580933e+01 8.1831265916389881e+01 6.3253745193271065e+01 +28 1276 5 -1.8200000000000001e-02 
5.9900307947967441e+01 8.1677453781363639e+01 6.2190757403657820e+01 +31 1276 2 1.2370000000000000e-01 5.8050043823867973e+01 8.2698312265456622e+01 6.3667111329534436e+01 +38 1 2 1.2370000000000000e-01 6.3754126973935612e+01 7.9931147303963002e+01 6.4022259163067275e+01 +20 1 2 1.2370000000000000e-01 6.4070158368422781e+01 8.2950071388392274e+01 6.1042631212883315e+01 +24 1 1 -1.2900000000000000e-01 6.4337973861569580e+01 8.1916618276489871e+01 6.1387866780102470e+01 +37 1 2 1.4030000000000001e-01 6.5360115866618415e+01 7.8586112104863830e+01 6.3004997314380716e+01 +39 1 1 -1.7340000000000000e-01 6.5018338085325610e+01 7.9478260591306125e+01 6.2440745569712817e+01 +40 1 1 -1.1340000000000000e-01 6.5628759887796605e+01 7.9941156332165264e+01 6.1248476296558067e+01 +41 1 1 -1.7340000000000000e-01 6.5247995680260402e+01 8.1172439250598345e+01 6.0753045571239831e+01 +42 1 2 1.2880000000000000e-01 6.6569600059599281e+01 7.9514748976494360e+01 6.0810611807135601e+01 +43 1 2 1.4030000000000001e-01 6.5780165393063371e+01 8.1570974991007958e+01 5.9850915261812396e+01 +9 1276 2 1.2880000000000000e-01 5.5651795605743445e+01 8.5074472139235127e+01 6.1094480497979262e+01 +30 1276 2 1.4030000000000001e-01 5.6082982679196888e+01 8.3912863624076010e+01 6.3351889697403472e+01 +33 1276 1 -1.7340000000000000e-01 5.6718133911388506e+01 8.3758479063002000e+01 6.2493293749545209e+01 +34 1276 1 -1.1340000000000000e-01 5.6498352105218459e+01 8.4426576393179090e+01 6.1290147608586011e+01 +6 3822 1 -1.7340000000000000e-01 6.3308103537340351e+01 8.7713509787622499e+01 6.4643082313868433e+01 +7 3822 1 -1.2900000000000000e-01 6.3010291684764312e+01 8.6423650045069493e+01 6.4252844241495922e+01 +8 3822 2 1.2370000000000000e-01 6.2089199187020355e+01 8.6309198636296912e+01 6.3711263099850854e+01 +10 1276 2 1.4030000000000001e-01 5.7266131308654970e+01 8.4599328362003035e+01 5.9281511478144402e+01 +11 3822 2 3.5400000000000001e-02 6.1694306618059791e+01 8.3823470438280594e+01 
6.3778953909925114e+01 +12 3822 5 -1.8200000000000001e-02 6.3814926998838651e+01 8.3900077798460728e+01 6.4108991789590448e+01 +13 3822 6 -6.9599999999999995e-02 6.2604540882379787e+01 8.3491998603381077e+01 6.3249610918984622e+01 +14 3822 2 1.2370000000000000e-01 6.5739253131027880e+01 8.4813736128157771e+01 6.5351692111169555e+01 +15 3822 1 -1.2900000000000000e-01 6.5071144269009466e+01 8.5646783550482454e+01 6.5086813218945636e+01 +16 3822 1 2.6599999999999999e-02 6.3957099792282079e+01 8.5375816595044753e+01 6.4385073943729708e+01 +17 1 2 5.1600000000000000e-02 6.2256484483973310e+01 8.1576962161157596e+01 6.3963984654065122e+01 +26 3822 2 5.1600000000000000e-02 6.4196825763126355e+01 8.3291442832977836e+01 6.4907094488854057e+01 +29 1276 1 2.6599999999999999e-02 5.8784742332505303e+01 8.2766055380197670e+01 6.1667239692876961e+01 +32 1276 1 -1.2900000000000000e-01 5.7836199787435064e+01 8.3005060229118428e+01 6.2669788306756018e+01 +35 1276 1 -1.2900000000000000e-01 5.8572661840325132e+01 8.3404075689965083e+01 6.0443288532625175e+01 +36 1276 1 -1.7340000000000000e-01 5.7380616699226330e+01 8.4134680429976896e+01 6.0248710539932475e+01 +25 3822 2 3.5400000000000001e-02 6.2750675036816460e+01 8.3891633300878468e+01 6.2249429178485677e+01 +5 3822 2 1.4030000000000001e-01 6.2626160082050376e+01 8.8416565740835182e+01 6.4093918967496805e+01 +1 3822 2 1.2880000000000000e-01 6.4863557606529355e+01 8.9096029197548390e+01 6.5342927535537825e+01 +2 3822 1 -1.1340000000000000e-01 6.4627442641031166e+01 8.8047381925321190e+01 6.5138073202291650e+01 +3 3822 2 1.4030000000000001e-01 6.6470254992065406e+01 8.6991893750821745e+01 6.5857474890608984e+01 +4 3822 1 -1.7340000000000000e-01 6.5416488888088338e+01 8.6963894801200169e+01 6.5357641085394278e+01 + +Velocities + +44 -1.1274099342391698e-02 2.8614364731871914e-02 7.8116535486555949e-03 +45 2.3164382404151666e-03 3.9815732957733160e-03 -2.9971878581527899e-02 +46 -7.1653099619954563e-03 4.5491360587300133e-04 
4.9898614093692017e-03 +48 9.8069086061434527e-03 4.0008139512159270e-03 6.2934259772882122e-03 +47 2.2646445306743783e-03 1.3029071608409702e-03 4.2232440120174040e-02 +18 7.0040064100195757e-03 3.2877451206009701e-03 -3.5376010407568422e-04 +19 -1.3998188760009689e-02 7.2238210565990146e-03 7.7956220633332383e-03 +21 3.1954292320462373e-03 -2.9717583309420764e-03 -3.1753395094325522e-03 +22 5.2997643939121201e-03 -2.9646963088534335e-03 -4.1351926198204894e-03 +23 7.6443400078766528e-03 4.0358953976530103e-02 -2.6684706183248367e-02 +27 1.9261652416455359e-03 -1.1632914130150688e-02 1.0061732021630769e-02 +28 -8.2251676802878315e-03 -1.5111873066969876e-04 1.3808893565582731e-02 +31 5.2475840572179860e-03 1.8266996572138715e-02 2.3453280610166885e-03 +38 -2.0343905130199073e-02 3.2815536859276281e-02 3.6511922534330152e-03 +20 2.2914549087537126e-02 1.4424503744223915e-02 2.1708279654645496e-03 +24 -2.4717233344142471e-03 1.2966123098719246e-02 8.1261459853411936e-03 +37 -2.4547379584186218e-02 -3.0213966592845171e-02 -3.1437442951939183e-02 +39 2.5476117829076835e-03 1.2743160680987653e-03 1.8775880208113892e-03 +40 -6.9216508143939990e-03 1.0986173624795060e-02 8.4543093049661480e-03 +41 -6.9641432145561661e-03 3.4497795547843439e-03 -6.5914679936187716e-03 +42 -1.6682931637687005e-02 -7.9952140358728052e-03 -5.4993265930488526e-02 +43 -1.2747392921213267e-03 -8.9033092043203244e-03 -1.4285400545629027e-02 +9 -4.6235166357676289e-03 -1.3071850427027999e-02 -1.4097407987100977e-02 +30 -1.0949617396609294e-02 2.8255703113196974e-03 1.7171748232322353e-02 +33 -6.1375812469323665e-03 -2.4748644899411924e-03 -9.4761978149296138e-03 +34 1.3676079846441525e-03 5.6076140293943458e-03 4.3217204641336267e-03 +6 -1.0264635053701928e-02 6.5278337056107680e-03 7.0056151148588212e-04 +7 -8.7519451205145676e-03 -4.6476440106580945e-03 2.5970484253527112e-03 +8 2.1377395557690311e-02 -3.3261274153819453e-03 -1.0112266596677577e-02 +10 -3.5793767912309253e-02 
-4.7139872292323019e-02 -1.6709528481405608e-02 +11 8.5071485795589590e-03 9.9402848610678270e-03 -3.8088596341056854e-03 +12 -7.1678159384257103e-04 -6.9164463557228907e-04 -6.4073519808107186e-03 +13 -4.8443902657902991e-03 -1.1919190682985097e-03 6.3946846087726637e-03 +14 1.4810157483257907e-02 1.9829623839419017e-03 -2.7393844990063056e-02 +15 2.4171850935506777e-03 8.5003135180758520e-03 -1.4373227798951704e-03 +16 2.7567342910947553e-03 4.7168484476890456e-03 -5.5131873288712992e-03 +17 -3.8456662730386774e-02 2.0220106671151108e-02 -1.3822049134399602e-02 +26 2.7415414728694614e-02 1.4392155257037418e-03 -6.7281635499082748e-03 +29 2.8284983560440745e-03 2.8809942505517976e-03 -9.0489583066552114e-04 +32 -3.8543634697614316e-03 4.6751647301899795e-03 4.2171867397204537e-03 +35 -8.6957974827209118e-03 -4.4615282666186267e-04 -2.6571026120482824e-03 +36 9.4881057996863086e-04 -7.5665878069688429e-03 2.0333670960646154e-03 +25 1.8105924111310519e-02 -8.6933495274689535e-03 -1.9695291360338044e-04 +5 -5.0447438383189585e-03 -4.5665146331657552e-02 1.0653751333175230e-02 +1 -1.7372868398038824e-02 -2.3625357536259349e-03 1.2220266128368908e-02 +2 3.7050246021929395e-03 -1.0236943515935205e-03 7.2206774682170580e-03 +3 2.3669435799326944e-02 2.7891427939155597e-02 -6.7091036888174346e-03 +4 3.4910623999263577e-03 2.6370880132825258e-03 -6.4694788112864129e-03 + +Bonds + +1 10 44 19 +2 10 45 46 +3 10 48 46 +4 9 19 18 +5 1 21 38 +6 2 21 22 +7 2 21 39 +8 7 22 18 +9 2 22 24 +10 10 23 19 +11 8 27 28 +12 9 28 46 +13 9 28 19 +14 1 24 20 +15 2 24 41 +16 1 39 37 +17 1 40 42 +18 2 40 39 +19 1 41 43 +20 2 41 40 +21 1 33 30 +22 1 34 9 +23 2 34 33 +24 1 6 5 +25 2 6 2 +26 1 7 8 +27 2 7 6 +28 10 11 13 +29 13 12 13 +30 9 13 18 +31 1 15 14 +32 2 15 16 +33 2 15 4 +34 11 16 12 +35 2 16 7 +36 8 17 18 +37 12 26 12 +38 7 29 28 +39 2 29 35 +40 1 32 31 +41 2 32 29 +42 2 32 33 +43 1 35 47 +44 2 35 36 +45 1 36 10 +46 2 36 34 +47 10 25 13 +48 1 2 1 +49 2 2 4 +50 1 4 3 + +Angles + +1 14 45 
46 28 +2 14 48 46 28 +3 15 45 46 48 +4 11 22 18 13 +5 12 17 18 13 +6 13 13 18 19 +7 10 22 18 17 +8 11 22 18 19 +9 12 17 18 19 +10 16 28 19 18 +11 14 44 19 28 +12 14 23 19 28 +13 14 44 19 18 +14 14 23 19 18 +15 15 44 19 23 +16 1 22 21 38 +17 1 39 21 38 +18 2 22 21 39 +19 9 21 22 18 +20 2 21 22 24 +21 9 24 22 18 +22 10 29 28 27 +23 11 29 28 46 +24 11 29 28 19 +25 12 27 28 46 +26 12 27 28 19 +27 13 46 28 19 +28 1 22 24 20 +29 2 22 24 41 +30 1 41 24 20 +31 2 21 39 40 +32 1 21 39 37 +33 1 40 39 37 +34 1 41 40 42 +35 2 41 40 39 +36 1 39 40 42 +37 1 24 41 43 +38 2 24 41 40 +39 1 40 41 43 +40 2 32 33 34 +41 1 32 33 30 +42 1 34 33 30 +43 1 36 34 9 +44 2 36 34 33 +45 1 33 34 9 +46 1 7 6 5 +47 2 7 6 2 +48 1 2 6 5 +49 1 16 7 8 +50 2 16 7 6 +51 1 6 7 8 +52 18 16 12 26 +53 19 16 12 13 +54 20 26 12 13 +55 21 25 13 12 +56 21 11 13 12 +57 22 12 13 18 +58 15 25 13 11 +59 14 25 13 18 +60 14 11 13 18 +61 1 16 15 14 +62 1 4 15 14 +63 2 16 15 4 +64 17 15 16 12 +65 2 15 16 7 +66 17 7 16 12 +67 9 32 29 28 +68 2 32 29 35 +69 9 35 29 28 +70 1 29 32 31 +71 1 33 32 31 +72 2 29 32 33 +73 1 29 35 47 +74 2 29 35 36 +75 1 36 35 47 +76 1 35 36 10 +77 2 35 36 34 +78 1 34 36 10 +79 1 6 2 1 +80 2 6 2 4 +81 1 4 2 1 +82 2 15 4 2 +83 1 15 4 3 +84 1 2 4 3 + +Dihedrals + +1 34 18 19 28 29 +2 31 44 19 28 29 +3 31 23 19 28 29 +4 35 18 19 28 27 +5 32 44 19 28 27 +6 32 23 19 28 27 +7 36 18 19 28 46 +8 33 44 19 28 46 +9 33 23 19 28 46 +10 36 28 19 18 13 +11 33 44 19 18 13 +12 33 23 19 18 13 +13 34 28 19 18 22 +14 31 44 19 18 22 +15 31 23 19 18 22 +16 35 28 19 18 17 +17 32 44 19 18 17 +18 32 23 19 18 17 +19 10 38 21 22 18 +20 11 39 21 22 18 +21 4 39 21 22 24 +22 5 38 21 39 37 +23 4 22 21 39 40 +24 2 22 21 39 37 +25 2 24 22 21 38 +26 13 21 22 18 13 +27 12 21 22 18 17 +28 13 21 22 18 19 +29 13 24 22 18 13 +30 12 24 22 18 17 +31 13 24 22 18 19 +32 2 21 22 24 20 +33 4 21 22 24 41 +34 14 29 28 46 45 +35 14 29 28 46 48 +36 15 27 28 46 45 +37 15 27 28 46 48 +38 16 19 28 46 45 +39 16 19 28 46 48 +40 10 20 24 22 18 +41 
11 41 24 22 18 +42 2 22 24 41 43 +43 4 22 24 41 40 +44 5 20 24 41 43 +45 2 40 39 21 38 +46 2 21 39 40 42 +47 2 39 40 41 43 +48 4 41 40 39 21 +49 2 41 40 39 37 +50 5 42 40 39 37 +51 2 40 41 24 20 +52 2 24 41 40 42 +53 4 24 41 40 39 +54 5 43 41 40 42 +55 2 34 33 32 31 +56 2 32 33 34 9 +57 2 33 34 36 10 +58 4 36 34 33 32 +59 2 36 34 33 30 +60 5 9 34 33 30 +61 2 2 6 7 8 +62 2 7 6 2 1 +63 4 7 6 2 4 +64 5 5 6 2 1 +65 20 8 7 16 12 +66 21 6 7 16 12 +67 2 16 7 6 5 +68 4 16 7 6 2 +69 5 8 7 6 5 +70 24 16 12 13 25 +71 24 16 12 13 11 +72 25 16 12 13 18 +73 26 26 12 13 25 +74 26 26 12 13 11 +75 27 26 12 13 18 +76 28 12 13 18 22 +77 29 12 13 18 17 +78 30 12 13 18 19 +79 31 25 13 18 22 +80 32 25 13 18 17 +81 33 25 13 18 19 +82 31 11 13 18 22 +83 32 11 13 18 17 +84 33 11 13 18 19 +85 20 14 15 16 12 +86 21 4 15 16 12 +87 4 4 15 16 7 +88 5 14 15 4 3 +89 4 16 15 4 2 +90 2 16 15 4 3 +91 2 7 16 15 14 +92 22 15 16 12 26 +93 23 15 16 12 13 +94 22 7 16 12 26 +95 23 7 16 12 13 +96 2 15 16 7 8 +97 4 15 16 7 6 +98 2 35 29 32 31 +99 12 32 29 28 27 +100 13 32 29 28 46 +101 13 32 29 28 19 +102 12 35 29 28 27 +103 13 35 29 28 46 +104 13 35 29 28 19 +105 2 32 29 35 47 +106 4 32 29 35 36 +107 10 31 32 29 28 +108 11 33 32 29 28 +109 4 33 32 29 35 +110 5 31 32 33 30 +111 4 29 32 33 34 +112 2 29 32 33 30 +113 10 47 35 29 28 +114 11 36 35 29 28 +115 2 29 35 36 10 +116 4 29 35 36 34 +117 5 47 35 36 10 +118 2 34 36 35 47 +119 2 35 36 34 9 +120 4 35 36 34 33 +121 5 10 36 34 9 +122 2 4 2 6 5 +123 4 6 2 4 15 +124 2 6 2 4 3 +125 5 1 2 4 3 +126 2 2 4 15 14 +127 2 15 4 2 1 + +Impropers + +1 6 45 46 48 28 +2 1 22 18 17 13 +3 1 22 18 13 19 +4 1 17 18 13 19 +5 1 22 18 17 19 +6 1 44 19 18 28 +7 1 23 19 18 28 +8 1 44 19 23 28 +9 1 44 19 23 18 +10 1 22 21 39 38 +11 5 21 22 24 18 +12 1 29 28 27 46 +13 1 29 28 27 19 +14 1 29 28 46 19 +15 1 27 28 46 19 +16 1 22 24 41 20 +17 1 21 39 40 37 +18 1 41 40 39 42 +19 1 24 41 40 43 +20 1 32 33 34 30 +21 1 36 34 33 9 +22 1 7 6 2 5 +23 1 16 7 6 8 +24 9 16 12 26 13 +25 1 25 13 11 
12 +26 1 25 13 12 18 +27 1 11 13 12 18 +28 1 25 13 11 18 +29 1 16 15 4 14 +30 8 15 16 7 12 +31 5 32 29 35 28 +32 1 29 32 33 31 +33 1 29 35 36 47 +34 1 35 36 34 10 +35 1 6 2 4 1 +36 1 15 4 2 3 diff --git a/lib/kokkos/BUILD.md b/lib/kokkos/BUILD.md index 7a7e2a8e05..e1f0e3e472 100644 --- a/lib/kokkos/BUILD.md +++ b/lib/kokkos/BUILD.md @@ -65,10 +65,15 @@ which activates the OpenMP backend. All of the options controlling device backen ## Spack An alternative to manually building with the CMake is to use the Spack package manager. -To do so, download the `kokkos-spack` git repo and add to the package list: +Make sure you have downloaded [Spack](https://github.com/spack/spack). +The easiest way to configure the Spack environment is: ````bash -> spack repo add $path-to-kokkos-spack +> source spack/share/spack/setup-env.sh ```` +with other scripts available for other shells. +You can display information about how to install packages with: +````bash +> spack info kokkos A basic installation would be done as: ````bash > spack install kokkos @@ -178,8 +183,8 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`. ## Other Options * Kokkos_CXX_STANDARD - * The C++ standard for Kokkos to use: c++11, c++14, c++17, or c++20. This should be given in CMake style as 11, 14, 17, or 20. - * STRING Default: 11 + * The C++ standard for Kokkos to use: c++14, c++17, or c++20. This should be given in CMake style as 14, 17, or 20. 
+ * STRING Default: 14 ## Third-party Libraries (TPLs) The following options control enabling TPLs: diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index d8baea4c49..c759181aa2 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,111 @@ # Change Log +## [3.3.01](https://github.com/kokkos/kokkos/tree/3.3.01) (2021-01-06) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.00...3.3.01) + +**Bug Fixes:** +- Fix severe performance bug in DualView which added memcpys for sync and modify [\#3693](https://github.com/kokkos/kokkos/issues/#3693) +- Fix performance bug in CUDA backend, where the cuda Cache config was not set correct. + +## [3.3.00](https://github.com/kokkos/kokkos/tree/3.3.00) (2020-12-16) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.01...3.3.00) + +**Features:** +- Require C++14 as minimum C++ standard. C++17 and C++20 are supported too. +- HIP backend is nearly feature complete. Kokkos Dynamic Task Graphs are missing. +- Major update for OpenMPTarget: many capabilities now work. For details contact us. +- Added DPC++/SYCL backend: primary capabilites are working. +- Added Kokkos Graph API analogous to CUDA Graphs. 
+- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/#3536) +- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/#3546) +- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/#3439) +- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/#3379) + +**Implemented enhancements Backends and Archs:** +- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/#3614) +- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/#3375) +- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/#3583) +- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/#3577) +- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/#3544) +- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/#3550) +- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/#3480) +- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/#3474) +- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/#3451) +- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/#3447) +- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/#3504) +- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/#3411) +- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/#3440) +- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/#3418) +- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/#3366) + +**Implemented enhancements Policies:** +- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/#3494) +- MDRangePolicy: Check narrowing conversion in 
construction [\#3527](https://github.com/kokkos/kokkos/pull/#3527) +- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/#3395) +- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/#3362) +- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/#3369) +- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/#3206) +- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/#3509) + +**Implemented enhancements BuildSystem:** +- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/#3488) +- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/#3548) +- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/#3136) +- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/#3434) +- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/#3402) +- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/#3457) + +**Implemented enhancements Tools:** +- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/#3455) +- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/#3530) +- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/#3518) +- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/#3459) +- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/#3326) + +**Implemented enhancements Other:** +- Abort on errors instead of just printing 
[\#3528](https://github.com/kokkos/kokkos/pull/#3528) +- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/#3449) +- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/#3436) +- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/#3435) +- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/#3422) +- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/#3416) +- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/#3388) +- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/#3359) +- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/#3357) +- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/#3340) +- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/#3339) +- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/#3338) +- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/#3309) +- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/#3265) +- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/#2941) + +**Fixed bugs:** +- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/#3591) +- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/#3588) +- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/#3566) +- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/#3565) +- Fix windows build issues which crept in for the CUDA build 
[\#3532](https://github.com/kokkos/kokkos/pull/#3532) +- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/#3529) +- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/#3510) +- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/#3503) +- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/#3467) +- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/#3458) +- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/#3398) +- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/#3393) +- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/#3390) +- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/#3378) +- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/#3348) +- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/#3345) +- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/#3343) +- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/#3260) + +**Incompatibilities:** +- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/#3535) +- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/#3534) +- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/#3301) +- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/#3264) +- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/#3148) + ## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17) [Full 
Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01) @@ -36,37 +142,31 @@ - Windows Cuda support [\#3018](https://github.com/kokkos/kokkos/issues/3018) - Pass `-Wext-lambda-captures-this` to NVCC when support for `__host__ __device__` lambda is enabled from CUDA 11 [\#3241](https://github.com/kokkos/kokkos/issues/3241) - Use explicit staging buffer for constant memory kernel launches and cleanup host/device synchronization [\#3234](https://github.com/kokkos/kokkos/issues/3234) -- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 1: [\#3202](https://github.com/kokkos/kokkos/issues/3202) -- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 2: [\#3203](https://github.com/kokkos/kokkos/issues/3203) -- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 3: [\#3196](https://github.com/kokkos/kokkos/issues/3196) +- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable: [\#3202](https://github.com/kokkos/kokkos/issues/3202) , [\#3203](https://github.com/kokkos/kokkos/issues/3203) , [\#3196](https://github.com/kokkos/kokkos/issues/3196) - Annotations for `DefaultExectutionSpace` and `DefaultHostExectutionSpace` to use in static analysis [\#3189](https://github.com/kokkos/kokkos/issues/3189) - Add documentation on using Spack to install Kokkos and developing packages that depend on Kokkos [\#3187](https://github.com/kokkos/kokkos/issues/3187) -- Improve support for nvcc\_wrapper with exotic host compiler [\#3186](https://github.com/kokkos/kokkos/issues/3186) - Add OpenMPTarget backend flags for NVC++ compiler [\#3185](https://github.com/kokkos/kokkos/issues/3185) - Move deep\_copy/create\_mirror\_view on Experimental::OffsetView into Kokkos:: namespace 
[\#3166](https://github.com/kokkos/kokkos/issues/3166) - Allow for larger block size in HIP [\#3165](https://github.com/kokkos/kokkos/issues/3165) - View: Added names of Views to the different View initialize/free kernels [\#3159](https://github.com/kokkos/kokkos/issues/3159) - Cuda: Caching cudaFunctorAttributes and whether L1/Shmem prefer was set [\#3151](https://github.com/kokkos/kokkos/issues/3151) -- BuildSystem: Provide an explicit default CMAKE\_BUILD\_TYPE [\#3131](https://github.com/kokkos/kokkos/issues/3131) +- BuildSystem: Improved performance in default configuration by defaulting to Release build [\#3131](https://github.com/kokkos/kokkos/issues/3131) - Cuda: Update CUDA occupancy calculation [\#3124](https://github.com/kokkos/kokkos/issues/3124) - Vector: Adding data() to Vector [\#3123](https://github.com/kokkos/kokkos/issues/3123) - BuildSystem: Add CUDA Ampere configuration support [\#3122](https://github.com/kokkos/kokkos/issues/3122) - General: Apply [[noreturn]] to Kokkos::abort when applicable [\#3106](https://github.com/kokkos/kokkos/issues/3106) - TeamPolicy: Validate storage level argument passed to TeamPolicy::set\_scratch\_size() [\#3098](https://github.com/kokkos/kokkos/issues/3098) -- nvcc\_wrapper: send --cudart to nvcc instead of host compiler [\#3092](https://github.com/kokkos/kokkos/issues/3092) - BuildSystem: Make kokkos\_has\_string() function in Makefile.kokkos case insensitive [\#3091](https://github.com/kokkos/kokkos/issues/3091) - Modify KOKKOS\_FUNCTION macro for clang-tidy analysis [\#3087](https://github.com/kokkos/kokkos/issues/3087) - Move allocation profiling to allocate/deallocate calls [\#3084](https://github.com/kokkos/kokkos/issues/3084) - BuildSystem: FATAL\_ERROR when attempting in-source build [\#3082](https://github.com/kokkos/kokkos/issues/3082) - Change enums in ScatterView to types [\#3076](https://github.com/kokkos/kokkos/issues/3076) - HIP: Changes for new compiler/runtime 
[\#3067](https://github.com/kokkos/kokkos/issues/3067) -- Extract and use get\_gpu [\#3061](https://github.com/kokkos/kokkos/issues/3061) -- Extract and use get\_gpu [\#3048](https://github.com/kokkos/kokkos/issues/3048) +- Extract and use get\_gpu [\#3061](https://github.com/kokkos/kokkos/issues/3061) , [\#3048](https://github.com/kokkos/kokkos/issues/3048) - Add is\_allocated to View-like containers [\#3059](https://github.com/kokkos/kokkos/issues/3059) - Combined reducers for scalar references [\#3052](https://github.com/kokkos/kokkos/issues/3052) - Add configurable capacity for UniqueToken [\#3051](https://github.com/kokkos/kokkos/issues/3051) - Add installation testing [\#3034](https://github.com/kokkos/kokkos/issues/3034) -- BuildSystem: Add -expt-relaxed-constexpr flag to nvcc\_wrapper [\#3021](https://github.com/kokkos/kokkos/issues/3021) - HIP: Add UniqueToken [\#3020](https://github.com/kokkos/kokkos/issues/3020) - Autodetect number of devices [\#3013](https://github.com/kokkos/kokkos/issues/3013) @@ -82,11 +182,13 @@ - ScatterView: fix for OpenmpTarget remove inheritance from reducers [\#3162](https://github.com/kokkos/kokkos/issues/3162) - BuildSystem: Set OpenMP flags according to host compiler [\#3127](https://github.com/kokkos/kokkos/issues/3127) - OpenMP: Fix logic for nested omp in partition\_master bug [\#3101](https://github.com/kokkos/kokkos/issues/3101) +- nvcc\_wrapper: send --cudart to nvcc instead of host compiler [\#3092](https://github.com/kokkos/kokkos/issues/3092) - BuildSystem: Fixes for Cuda/11 and c++17 [\#3085](https://github.com/kokkos/kokkos/issues/3085) - HIP: Fix print\_configuration [\#3080](https://github.com/kokkos/kokkos/issues/3080) - Conditionally define get\_gpu [\#3072](https://github.com/kokkos/kokkos/issues/3072) - Fix bounds for ranges in random number generator [\#3069](https://github.com/kokkos/kokkos/issues/3069) - Fix Cuda minor arch check [\#3035](https://github.com/kokkos/kokkos/issues/3035) +- BuildSystem: Add 
-expt-relaxed-constexpr flag to nvcc\_wrapper [\#3021](https://github.com/kokkos/kokkos/issues/3021) **Incompatibilities:** diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index f7fa3e5279..7bc3c77256 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -111,7 +111,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) -set(Kokkos_VERSION_MINOR 2) +set(Kokkos_VERSION_MINOR 3) set(Kokkos_VERSION_PATCH 1) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -139,13 +139,15 @@ ENDIF() # I really wish these were regular variables # but scoping issues can make it difficult GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) -GLOBAL_SET(KOKKOS_LINK_OPTIONS) +GLOBAL_SET(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) GLOBAL_SET(KOKKOS_CUDA_OPTIONS) GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) # We need to append text here for making sure TPLs # we import are available for an installed Kokkos GLOBAL_SET(KOKKOS_TPL_EXPORTS) +# this could probably be scoped to project +GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or TriBITS @@ -191,8 +193,6 @@ ELSE() SET(KOKKOS_IS_SUBDIRECTORY FALSE) ENDIF() - - #------------------------------------------------------------------------------ # # A) Forward declare the package so that certain options are also defined for @@ -253,9 +253,7 @@ KOKKOS_PROCESS_SUBPACKAGES() KOKKOS_PACKAGE_DEF() KOKKOS_EXCLUDE_AUTOTOOLS_FILES() KOKKOS_PACKAGE_POSTPROCESS() - -#We are ready to configure the header -CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) +KOKKOS_CONFIGURE_CORE() IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) ADD_LIBRARY(kokkos INTERFACE) @@ -272,7 +270,10 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # 
executables also need nvcc_wrapper. Thus, we need to install it. # If the argument of DESTINATION is a relative path, CMake computes it # as relative to ${CMAKE_INSTALL_PATH}. -INSTALL(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper DESTINATION ${CMAKE_INSTALL_BINDIR}) +# KOKKOS_INSTALL_ADDITIONAL_FILES will install nvcc wrapper and other generated +# files +KOKKOS_INSTALL_ADDITIONAL_FILES() + # Finally - if we are a subproject - make sure the enabled devices are visible IF (HAS_PARENT) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index d8b5a050bd..061b7a46ee 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -11,27 +11,27 @@ CXXFLAGS += $(SHFLAGS) endif KOKKOS_VERSION_MAJOR = 3 -KOKKOS_VERSION_MINOR = 2 +KOKKOS_VERSION_MINOR = 3 KOKKOS_VERSION_PATCH = 1 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) -# Options: Cuda,HIP,ROCm,OpenMP,Pthread,Serial +# Options: Cuda,HIP,OpenMP,Pthread,Serial KOKKOS_DEVICES ?= "OpenMP" #KOKKOS_DEVICES ?= "Pthread" # Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKX # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80 -# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2 +# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 -# AMD-GPUS: Vega900,Vega906 +# AMD-GPUS: Vega900,Vega906,Vega908 # AMD-CPUS: AMDAVX,Zen,Zen2 KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" # Options: hwloc,librt,experimental_memkind KOKKOS_USE_TPLS ?= "" -# Options: c++11,c++14,c++1y,c++17,c++1z,c++2a -KOKKOS_CXX_STANDARD ?= "c++11" +# Options: c++14,c++1y,c++17,c++1z,c++2a +KOKKOS_CXX_STANDARD ?= "c++14" # Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align KOKKOS_OPTIONS ?= "" KOKKOS_CMAKE ?= "no" @@ -66,7 +66,6 @@ kokkos_path_exists=$(if $(wildcard $1),1,0) # Check for general 
settings KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes) -KOKKOS_INTERNAL_ENABLE_CXX11 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++11) KOKKOS_INTERNAL_ENABLE_CXX14 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++14) KOKKOS_INTERNAL_ENABLE_CXX1Y := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1y) KOKKOS_INTERNAL_ENABLE_CXX17 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++17) @@ -279,14 +278,12 @@ else endif endif -# Set C++11 flags. +# Set C++ version flags. ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) - KOKKOS_INTERNAL_CXX11_FLAG := --c++11 KOKKOS_INTERNAL_CXX14_FLAG := --c++14 KOKKOS_INTERNAL_CXX17_FLAG := --c++17 else ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) - KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11 KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14 KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y #KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17 @@ -294,23 +291,17 @@ else #KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) - KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11 KOKKOS_INTERNAL_CXX14_FLAG := -hstd=c++14 #KOKKOS_INTERNAL_CXX1Y_FLAG := -hstd=c++1y #KOKKOS_INTERNAL_CXX17_FLAG := -hstd=c++17 #KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z #KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a else - ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1) - KOKKOS_INTERNAL_CXX11_FLAG := - else - KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11 - KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14 - KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y - KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17 - KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z - KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a - endif + KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14 + KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y + KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17 + KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z + KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a endif endif endif @@ -377,7 +368,8 @@ KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8 KOKKOS_INTERNAL_USE_ARCH_ARMV81 
:= $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv81) KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-ThunderX) KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-TX2) -KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2) | bc)) +KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) +KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) # IBM based. KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) @@ -392,6 +384,7 @@ KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900) KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906) +KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908) # Any AVX? 
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) @@ -459,7 +452,6 @@ H := \# # Do not append first line tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp) tmp := $(call kokkos_append_header,"Makefile constructed configuration:") -tmp := $(call kokkos_append_header,"$(shell date)") tmp := $(call kokkos_append_header,"----------------------------------------------*/") tmp := $(call kokkos_append_header,'$H''if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)') @@ -479,10 +471,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)") endif -ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) - tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_ROCM') - tmp := $(call kokkos_append_header,'$H''define KOKKOS_IMPL_ROCM_CLANG_WORKAROUND 1') -endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP') endif @@ -542,12 +530,6 @@ endif #only add the c++ standard flags if this is not CMake tmp := $(call kokkos_append_header,"/* General Settings */") -ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1) -ifneq ($(KOKKOS_STANDALONE_CMAKE), yes) - KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG) -endif - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX11") -endif ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1) ifneq ($(KOKKOS_STANDALONE_CMAKE), yes) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG) @@ -765,6 +747,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_A64FX") + + KOKKOS_CXXFLAGS += -march=armv8.2-a+sve + KOKKOS_LDFLAGS += -march=armv8.2-a+sve +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN") tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ARCH_AMD_AVX2") @@ -1143,6 +1132,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx906 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA908), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 908") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908 + endif + KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp) @@ -1173,6 +1168,55 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h) endif +# Functions for generating config header file +kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1) +kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3) +kokkos_append_config_header = $(shell echo $1 >> $2)) +tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp") +tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp") +tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp") +tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp") +tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") +tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") +tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") +tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + tmp := $(call 
kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_SetupBackend.hpp") + ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) + else + endif +endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_SetupBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + 
tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) @@ -1290,7 +1334,7 @@ ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) endif # With Cygwin functions such as fdopen and fileno are not defined -# when strict ansi is enabled. strict ansi gets enabled with --std=c++11 +# when strict ansi is enabled. strict ansi gets enabled with --std=c++14 # though. So we hard undefine it here. Not sure if that has any bad side effects # This is needed for gtest actually, not for Kokkos itself! ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1) @@ -1313,7 +1357,9 @@ KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: - rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a + rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ + KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ + KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) ar cr libkokkos.a $(KOKKOS_OBJ_LINK) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 525962d2d5..5a03f7d17e 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -53,23 +53,10 @@ Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp -Kokkos_HIP_KernelLaunch.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_KernelLaunch.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_KernelLaunch.cpp Kokkos_HIP_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp endif -ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) -Kokkos_ROCm_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Exec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Exec.cpp -Kokkos_ROCm_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Space.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Space.cpp -Kokkos_ROCm_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp -Kokkos_ROCm_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp -endif - ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md index b67830fde4..d55ef2caac 100644 --- a/lib/kokkos/README.md +++ b/lib/kokkos/README.md @@ -54,24 +54,16 @@ For specifics see the LICENSE file contained in the repository or distribution. 
# Requirements ### Primary tested compilers on X86 are: -* GCC 4.8.4 -* GCC 4.9.3 -* GCC 5.1.0 +* GCC 5.3.0 * GCC 5.4.0 * GCC 5.5.0 * GCC 6.1.0 * GCC 7.2.0 * GCC 7.3.0 * GCC 8.1.0 -* Intel 15.0.2 -* Intel 16.0.1 * Intel 17.0.1 * Intel 17.4.196 * Intel 18.2.128 -* Clang 3.6.1 -* Clang 3.7.1 -* Clang 3.8.1 -* Clang 3.9.0 * Clang 4.0.0 * Clang 6.0.0 for CUDA (CUDA Toolkit 9.0) * Clang 7.0.0 for CUDA (CUDA Toolkit 9.1) @@ -81,6 +73,7 @@ For specifics see the LICENSE file contained in the repository or distribution. * NVCC 9.2 for CUDA (with gcc 7.2.0) * NVCC 10.0 for CUDA (with gcc 7.4.0) * NVCC 10.1 for CUDA (with gcc 7.4.0) +* NVCC 11.0 for CUDA (with gcc 8.4.0) ### Primary tested compilers on Power 8 are: * GCC 6.4.0 (OpenMP,Serial) @@ -89,9 +82,8 @@ For specifics see the LICENSE file contained in the repository or distribution. * NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0) ### Primary tested compilers on Intel KNL are: -* Intel 16.4.258 (with gcc 4.7.2) -* Intel 17.2.174 (with gcc 4.9.3) -* Intel 18.2.199 (with gcc 4.9.3) +* Intel 17.2.174 (with gcc 6.2.0 and 6.4.0) +* Intel 18.2.199 (with gcc 6.2.0 and 6.4.0) ### Primary tested compilers on ARM (Cavium ThunderX2) * GCC 7.2.0 diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 40d8db2663..69d6cf8f35 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -806,7 +806,7 @@ class Random_XorShift64 { const double V = 2.0 * drand() - 1.0; S = U * U + V * V; } - return U * std::sqrt(-2.0 * log(S) / S); + return U * std::sqrt(-2.0 * std::log(S) / S); } KOKKOS_INLINE_FUNCTION @@ -1042,7 +1042,7 @@ class Random_XorShift1024 { const double V = 2.0 * drand() - 1.0; S = U * U + V * V; } - return U * std::sqrt(-2.0 * log(S) / S); + return U * std::sqrt(-2.0 * std::log(S) / S); } KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp index 
a95b652eab..d17c02776f 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -222,12 +222,12 @@ class BinSort { "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins()); bin_count_const = bin_count_atomic; bin_offsets = - offset_type(ViewAllocateWithoutInitializing( - "Kokkos::SortImpl::BinSortFunctor::bin_offsets"), + offset_type(view_alloc(WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::bin_offsets"), bin_op.max_bins()); sort_order = - offset_type(ViewAllocateWithoutInitializing( - "Kokkos::SortImpl::BinSortFunctor::sort_order"), + offset_type(view_alloc(WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::sort_order"), range_end - range_begin); } @@ -279,8 +279,8 @@ class BinSort { } scratch_view_type sorted_values( - ViewAllocateWithoutInitializing( - "Kokkos::SortImpl::BinSortFunctor::sorted_values"), + view_alloc(WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::sorted_values"), values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG, values.rank_dynamic > 1 ? 
values.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index 969e67c41b..819c9e54ba 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -24,7 +24,7 @@ KOKKOS_ADD_TEST_LIBRARY( # avoid deprecation warnings from MSVC TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0) -IF(NOT (Kokkos_ENABLE_CUDA AND WIN32)) +IF((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11) ENDIF() diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index 4a192b08ec..c112d7c6fc 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -31,10 +31,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) TEST_TARGETS += test-cuda endif -ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) - OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o - TARGETS += KokkosAlgorithms_UnitTest_ROCm - TEST_TARGETS += test-rocm +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + OBJ_HIP = TestHIP.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_HIP + TEST_TARGETS += test-hip endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) @@ -64,8 +64,8 @@ endif KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda -KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm +KokkosAlgorithms_UnitTest_HIP: $(OBJ_HIP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_HIP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_HIP 
KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads @@ -82,8 +82,8 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosAlgorithms_UnitTest_Cuda ./KokkosAlgorithms_UnitTest_Cuda -test-rocm: KokkosAlgorithms_UnitTest_ROCm - ./KokkosAlgorithms_UnitTest_ROCm +test-hip: KokkosAlgorithms_UnitTest_HIP + ./KokkosAlgorithms_UnitTest_HIP test-threads: KokkosAlgorithms_UnitTest_Threads ./KokkosAlgorithms_UnitTest_Threads diff --git a/lib/kokkos/benchmarks/atomic/Makefile b/lib/kokkos/benchmarks/atomic/Makefile index 64b43917de..636c0ad4ab 100644 --- a/lib/kokkos/benchmarks/atomic/Makefile +++ b/lib/kokkos/benchmarks/atomic/Makefile @@ -1,31 +1,38 @@ -KOKKOS_PATH = ${HOME}/kokkos -KOKKOS_DEVICES = "OpenMP" -KOKKOS_ARCH = "SNB" -EXE_NAME = "test" +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" -SRC = $(wildcard *.cpp) + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" - ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper -EXE = ${EXE_NAME}.cuda -KOKKOS_CUDA_OPTIONS = "enable_lambda" +EXE = atomic_perf.cuda else CXX = g++ -EXE = ${EXE_NAME}.host +EXE = atomic_perf.exe endif -CXXFLAGS = -O3 - -LINK = ${CXX} -LINKFLAGS = -O3 +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = -OBJ = $(SRC:.cpp=.o) +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos @@ -35,10 +42,10 @@ build: $(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) -clean: kokkos-clean - rm -f *.o *.cuda *.host +clean: kokkos-clean + rm -f *.o atomic_perf.cuda atomic_perf.exe # Compilation rules -%.o:%.cpp $(KOKKOS_CPP_DEPENDS) - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash b/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash index 9dded535e8..4fcac3df9f 100755 --- a/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash +++ b/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash @@ -9,7 +9,7 @@ if [[ ${USE_CUDA} > 0 ]]; then BAF_EXE=bytes_and_flops.cuda TEAM_SIZE=256 else - BAF_EXE=bytes_and_flops.host + BAF_EXE=bytes_and_flops.exe TEAM_SIZE=1 fi diff --git a/lib/kokkos/benchmarks/bytes_and_flops/Makefile b/lib/kokkos/benchmarks/bytes_and_flops/Makefile index 6cbef56ff0..1aa4edddcd 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/Makefile +++ b/lib/kokkos/benchmarks/bytes_and_flops/Makefile @@ -1,6 +1,6 @@ KOKKOS_DEVICES=Cuda 
KOKKOS_CUDA_OPTIONS=enable_lambda -KOKKOS_ARCH = "SNB,Kepler35" +KOKKOS_ARCH = "SNB,Volta70" MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) @@ -22,7 +22,7 @@ CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper EXE = bytes_and_flops.cuda else CXX = g++ -EXE = bytes_and_flops.host +EXE = bytes_and_flops.exe endif CXXFLAGS ?= -O3 -g diff --git a/lib/kokkos/benchmarks/gather/Makefile b/lib/kokkos/benchmarks/gather/Makefile index 0ea9fb1dd2..6827995bed 100644 --- a/lib/kokkos/benchmarks/gather/Makefile +++ b/lib/kokkos/benchmarks/gather/Makefile @@ -1,7 +1,18 @@ -KOKKOS_PATH = ${HOME}/kokkos -SRC = $(wildcard *.cpp) KOKKOS_DEVICES=Cuda KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. +endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" @@ -9,36 +20,32 @@ default: build ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper EXE = gather.cuda -KOKKOS_DEVICES = "Cuda,OpenMP" -KOKKOS_ARCH = "SNB,Kepler35" else CXX = g++ -EXE = gather.host -KOKKOS_DEVICES = "OpenMP" -KOKKOS_ARCH = "SNB" +EXE = gather.exe endif -CXXFLAGS = -O3 -g +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) DEPFLAGS = -M LINK = ${CXX} LINKFLAGS = -OBJ = $(SRC:.cpp=.o) +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos -$(warning ${KOKKOS_CPPFLAGS}) build: $(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o gather.cuda gather.exe # Compilation rules -%.o:%.cpp $(KOKKOS_CPP_DEPENDS) gather_unroll.hpp gather.hpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) 
$(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/gups/Makefile b/lib/kokkos/benchmarks/gups/Makefile index 7176111664..2a90621d8c 100644 --- a/lib/kokkos/benchmarks/gups/Makefile +++ b/lib/kokkos/benchmarks/gups/Makefile @@ -1,28 +1,38 @@ -#Set your Kokkos path to something appropriate -KOKKOS_PATH = ${HOME}/git/kokkos-github-repo -KOKKOS_DEVICES = "Cuda" -KOKKOS_ARCH = "Pascal60" -KOKKOS_CUDA_OPTIONS = enable_lambda -#KOKKOS_DEVICES = "OpenMP" -#KOKKOS_ARCH = "Power8" +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" -SRC = gups-kokkos.cc + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. +endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" - -CXXFLAGS = -O3 -CXX = ${HOME}/git/kokkos-github-repo/bin/nvcc_wrapper -#CXX = g++ -LINK = ${CXX} +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = gups.cuda +else +CXX = g++ +EXE = gups.exe +endif -LINKFLAGS = -EXE = gups-kokkos +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = -OBJ = $(SRC:.cc=.o) +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos @@ -32,10 +42,10 @@ build: $(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) -clean: kokkos-clean - rm -f *.o $(EXE) +clean: kokkos-clean + rm -f *.o gups.cuda gups.exe # Compilation rules -%.o:%.cc $(KOKKOS_CPP_DEPENDS) - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git 
a/lib/kokkos/benchmarks/gups/gups-kokkos.cc b/lib/kokkos/benchmarks/gups/gups-kokkos.cpp similarity index 100% rename from lib/kokkos/benchmarks/gups/gups-kokkos.cc rename to lib/kokkos/benchmarks/gups/gups-kokkos.cpp diff --git a/lib/kokkos/benchmarks/policy_performance/Makefile b/lib/kokkos/benchmarks/policy_performance/Makefile index 13aef3209c..f50aea720e 100644 --- a/lib/kokkos/benchmarks/policy_performance/Makefile +++ b/lib/kokkos/benchmarks/policy_performance/Makefile @@ -1,31 +1,38 @@ -KOKKOS_PATH = ../.. -SRC = $(wildcard *.cpp) +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. +endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper -CXXFLAGS = -O3 -g -LINK = ${CXX} -LINKFLAGS = -EXE = policy_performance.cuda -KOKKOS_DEVICES = "Cuda,OpenMP" -KOKKOS_ARCH = "SNB,Kepler35" -KOKKOS_CUDA_OPTIONS+=enable_lambda +EXE = policy_perf.cuda else CXX = g++ -CXXFLAGS = -O3 -g -Wall -Werror -LINK = ${CXX} -LINKFLAGS = -EXE = policy_performance.host -KOKKOS_DEVICES = "OpenMP" -KOKKOS_ARCH = "SNB" +EXE = policy_perf.exe endif -DEPFLAGS = -M +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) -OBJ = $(SRC:.cpp=.o) +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos @@ -35,10 +42,10 @@ build: $(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) -clean: kokkos-clean - rm -f *.o *.cuda *.host +clean: kokkos-clean + rm -f *.o policy_perf.cuda policy_perf.exe # Compilation rules -%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp - $(CXX) 
$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/policy_performance/main.cpp b/lib/kokkos/benchmarks/policy_performance/main.cpp index 5b04c6ab93..da49cdb019 100644 --- a/lib/kokkos/benchmarks/policy_performance/main.cpp +++ b/lib/kokkos/benchmarks/policy_performance/main.cpp @@ -146,11 +146,11 @@ int main(int argc, char* argv[]) { // Call a 'warmup' test with 1 repeat - this will initialize the corresponding // view appropriately for test and should obey first-touch etc Second call to // test is the one we actually care about and time - view_type_1d v_1(Kokkos::ViewAllocateWithoutInitializing("v_1"), + view_type_1d v_1(Kokkos::view_alloc(Kokkos::WithoutInitializing, "v_1"), team_range * team_size); - view_type_2d v_2(Kokkos::ViewAllocateWithoutInitializing("v_2"), + view_type_2d v_2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "v_2"), team_range * team_size, thread_range); - view_type_3d v_3(Kokkos::ViewAllocateWithoutInitializing("v_3"), + view_type_3d v_3(Kokkos::view_alloc(Kokkos::WithoutInitializing, "v_3"), team_range * team_size, thread_range, vector_range); double result_computed = 0.0; diff --git a/lib/kokkos/benchmarks/stream/Makefile b/lib/kokkos/benchmarks/stream/Makefile index 04566b322d..47a13838a4 100644 --- a/lib/kokkos/benchmarks/stream/Makefile +++ b/lib/kokkos/benchmarks/stream/Makefile @@ -1,28 +1,38 @@ -#Set your Kokkos path to something appropriate -KOKKOS_PATH = ${HOME}/git/kokkos-github-repo -#KOKKOS_DEVICES = "Cuda" -#KOKKOS_ARCH = "Pascal60" -#KOKKOS_CUDA_OPTIONS = enable_lambda -KOKKOS_DEVICES = "OpenMP" -KOKKOS_ARCH = "Power8" +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" -SRC = stream-kokkos.cc + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + 
KOKKOS_PATH = $(MAKEFILE_PATH)../.. +endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" - -CXXFLAGS = -O3 -#CXX = ${HOME}/git/kokkos-github-repo/bin/nvcc_wrapper + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = stream.cuda +else CXX = g++ +EXE = stream.exe +endif -LINK = ${CXX} - -LINKFLAGS = -EXE = stream-kokkos +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = -OBJ = $(SRC:.cc=.o) +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos @@ -32,10 +42,10 @@ build: $(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) -clean: kokkos-clean - rm -f *.o $(EXE) +clean: kokkos-clean + rm -f *.o stream.cuda stream.exe # Compilation rules -%.o:%.cc $(KOKKOS_CPP_DEPENDS) - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/stream/stream-kokkos.cc b/lib/kokkos/benchmarks/stream/stream-kokkos.cpp similarity index 100% rename from lib/kokkos/benchmarks/stream/stream-kokkos.cc rename to lib/kokkos/benchmarks/stream/stream-kokkos.cpp diff --git a/lib/kokkos/bin/kokkos_launch_compiler b/lib/kokkos/bin/kokkos_launch_compiler new file mode 100755 index 0000000000..1fbebf648f --- /dev/null +++ b/lib/kokkos/bin/kokkos_launch_compiler @@ -0,0 +1,87 @@ +#!/bin/bash -e +# +# This script allows CMAKE_CXX_COMPILER to be a standard +# C++ compiler and Kokkos sets RULE_LAUNCH_COMPILE and +# RULE_LAUNCH_LINK in CMake so that all compiler and link +# commands are prefixed with this script followed by the +# C++ compiler. 
Thus if $1 == $2 then we know the command +# was intended for the C++ compiler and we discard both +# $1 and $2 and redirect the command to NVCC_WRAPPER. +# If $1 != $2 then we know that the command was not intended +# for the C++ compiler and we just discard $1 and launch +# the original command. Examples of when $2 will not equal +# $1 are 'ar', 'cmake', etc. during the linking phase +# + +# check the arguments for the KOKKOS_DEPENDENCE compiler definition +KOKKOS_DEPENDENCE=0 +for i in ${@} +do + if [ -n "$(echo ${i} | grep 'KOKKOS_DEPENDENCE$')" ]; then + KOKKOS_DEPENDENCE=1 + break + fi +done + +# if C++ is not passed, someone is probably trying to invoke it directly +if [ -z "${1}" ]; then + echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the first argument." + echo "This script is not indended to be directly invoked by any mechanism other" + echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake\n" + exit 1 +fi + +# if there aren't two args, this isn't necessarily invalid, just a bit strange +if [ -z "${2}" ]; then exit 0; fi + +# store the expected C++ compiler +CXX_COMPILER=${1} + +# remove the expected C++ compiler from the arguments +shift + +# after the above shift, $1 is now the exe for the compile or link command, e.g. +# kokkos_launch_compiler g++ gcc -c file.c -o file.o +# becomes: +# kokkos_launch_compiler gcc -c file.c -o file.o +# Check to see if the executable is the C++ compiler and if it is not, then +# just execute the command. 
+# +# Summary: +# kokkos_launch_compiler g++ gcc -c file.c -o file.o +# results in this command being executed: +# gcc -c file.c -o file.o +# and +# kokkos_launch_compiler g++ g++ -c file.cpp -o file.o +# results in this command being executed: +# nvcc_wrapper -c file.cpp -o file.o +if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then + # the command does not depend on Kokkos so just execute the command w/o re-directing to nvcc_wrapper + eval $@ +else + # the executable is the C++ compiler, so we need to re-direct to nvcc_wrapper + + # find the nvcc_wrapper from the same build/install + NVCC_WRAPPER="$(dirname ${BASH_SOURCE[0]})/nvcc_wrapper" + + if [ ! -x "${NVCC_WRAPPER}" ]; then + echo -e "\nError: nvcc_wrapper not found in $(dirname ${BASH_SOURCE[0]}).\n" + exit 1 + fi + + # set default nvcc wrapper compiler if not specified + : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}} + export NVCC_WRAPPER_DEFAULT_COMPILER + + # calling itself will cause an infinitely long build + if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then + echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. Terminating to avoid infinite loop!\n" + exit 1 + fi + + # discard the compiler from the command + shift + + # execute nvcc_wrapper + ${NVCC_WRAPPER} $@ +fi diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index bc213497bf..4ecf4c66d5 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -90,7 +90,12 @@ replace_pragma_ident=0 # Mark first host compiler argument first_xcompiler_arg=1 -temp_dir=${TMPDIR:-/tmp} +# Allow for setting temp dir without setting TMPDIR in parent (see https://docs.olcf.ornl.gov/systems/summit_user_guide.html#setting-tmpdir-causes-jsm-jsrun-errors-job-state-flip-flop) +if [[ !
-z ${NVCC_WRAPPER_TMPDIR+x} ]]; then + temp_dir=${NVCC_WRAPPER_TMPDIR} +else + temp_dir=${TMPDIR:-/tmp} +fi # optimization flag added as a command-line argument optimization_flag="" @@ -194,7 +199,7 @@ do cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument - -rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart) + -rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include) cuda_args="$cuda_args $1 $2" shift ;; diff --git a/lib/kokkos/cmake/KokkosConfig.cmake.in b/lib/kokkos/cmake/KokkosConfig.cmake.in index 6f4607687e..9fbd22ee5c 100644 --- a/lib/kokkos/cmake/KokkosConfig.cmake.in +++ b/lib/kokkos/cmake/KokkosConfig.cmake.in @@ -1,3 +1,9 @@ +# No need for policy push/pop. CMake also manages a new entry for scripts +# loaded by include() and find_package() commands except when invoked with +# the NO_POLICY_SCOPE option +# CMP0057 + NEW -> IN_LIST operator in IF(...) +CMAKE_POLICY(SET CMP0057 NEW) + # Compute paths @PACKAGE_INIT@ @@ -12,3 +18,18 @@ GET_FILENAME_COMPONENT(Kokkos_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) INCLUDE("${Kokkos_CMAKE_DIR}/KokkosTargets.cmake") INCLUDE("${Kokkos_CMAKE_DIR}/KokkosConfigCommon.cmake") UNSET(Kokkos_CMAKE_DIR) + +# if CUDA was enabled and separable compilation was specified, e.g. +# find_package(Kokkos COMPONENTS separable_compilation) +# then we set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK +IF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) + # run test to see if CMAKE_CXX_COMPILER=nvcc_wrapper + kokkos_compiler_is_nvcc(IS_NVCC ${CMAKE_CXX_COMPILER}) + # if not nvcc_wrapper, use RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK + IF(NOT IS_NVCC AND NOT CMAKE_CXX_COMPILER_ID STREQUAL Clang AND + (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER)) + MESSAGE(STATUS "kokkos_launch_compiler is enabled globally.
C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to nvcc_wrapper") + kokkos_compilation(GLOBAL) + ENDIF() + UNSET(IS_NVCC) # be mindful of the environment, pollution is bad +ENDIF() diff --git a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in index 8e664b27a3..42c755c215 100644 --- a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in +++ b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in @@ -89,3 +89,73 @@ function(kokkos_check) set(${KOKKOS_CHECK_RETURN_VALUE} ${KOKKOS_CHECK_SUCCESS} PARENT_SCOPE) endif() endfunction() + +# this function is provided to easily select which files use nvcc_wrapper: +# +# GLOBAL --> all files +# TARGET --> all files in a target +# SOURCE --> specific source files +# DIRECTORY --> all files in directory +# PROJECT --> all files/targets in a project/subproject +# +FUNCTION(kokkos_compilation) + CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + + # search relative first and then absolute + SET(_HINTS "${CMAKE_CURRENT_LIST_DIR}/../.." "@CMAKE_INSTALL_PREFIX@") + + # find kokkos_launch_compiler + FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${_HINTS} + PATHS ${_HINTS} + PATH_SUFFIXES bin) + + IF(NOT Kokkos_COMPILE_LAUNCHER) + MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") + ENDIF() + + IF(COMP_GLOBAL) + # if global, don't bother setting others + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + ELSE() + FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) 
+ IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + UNSET(COMP_${_TYPE}) + ENDIF() + # set the properties if defined + IF(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + ENDIF() + ENDFOREACH() + ENDIF() +ENDFUNCTION() + +# A test to check whether a downstream project set the C++ compiler to NVCC or not +# this is called only when Kokkos was installed with Kokkos_ENABLE_CUDA=ON +FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER) + # Check if the compiler is nvcc (which really means nvcc_wrapper). + EXECUTE_PROCESS(COMMAND ${COMPILER} ${ARGN} --version + OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RET) + # something went wrong + IF(RET GREATER 0) + SET(${VAR} false PARENT_SCOPE) + ELSE() + STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + SET(${VAR} true PARENT_SCOPE) + ELSE() + SET(${VAR} false PARENT_SCOPE) + ENDIF() + ENDIF() +ENDFUNCTION() + diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Complex.cpp b/lib/kokkos/cmake/KokkosCore_Config_HeaderSet.in similarity index 96% rename from lib/kokkos/core/unit_test/cuda/TestCuda_Complex.cpp rename to lib/kokkos/cmake/KokkosCore_Config_HeaderSet.in index af44310481..8d1eee31b2 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_Complex.cpp +++ b/lib/kokkos/cmake/KokkosCore_Config_HeaderSet.in @@ -1,4 +1,3 @@ - /* //@HEADER // 
************************************************************************ @@ -42,6 +41,9 @@ // ************************************************************************ //@HEADER */ +#ifndef @HEADER_GUARD_TAG@ +#define @HEADER_GUARD_TAG@ -#include -#include +@INCLUDE_NEXT_FILE@ + +#endif diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index c0362e4fb0..0259fe69d5 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -21,6 +21,7 @@ #cmakedefine KOKKOS_ENABLE_HPX #cmakedefine KOKKOS_ENABLE_MEMKIND #cmakedefine KOKKOS_ENABLE_LIBRT +#cmakedefine KOKKOS_ENABLE_SYCL #ifndef __CUDA_ARCH__ #cmakedefine KOKKOS_ENABLE_TM @@ -31,7 +32,6 @@ #endif /* General Settings */ -#cmakedefine KOKKOS_ENABLE_CXX11 #cmakedefine KOKKOS_ENABLE_CXX14 #cmakedefine KOKKOS_ENABLE_CXX17 #cmakedefine KOKKOS_ENABLE_CXX20 @@ -58,7 +58,7 @@ /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC #cmakedefine KOKKOS_USE_LIBRT -#cmakedefine KOKKOS_ENABLE_HWBSPACE +#cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND diff --git a/lib/kokkos/cmake/README.md b/lib/kokkos/cmake/README.md index 6d0cc2daf1..385bbfcd5d 100644 --- a/lib/kokkos/cmake/README.md +++ b/lib/kokkos/cmake/README.md @@ -73,20 +73,20 @@ Compiler features are more fine-grained and require conflicting requests to be r Suppose I have ```` add_library(A a.cpp) -target_compile_features(A PUBLIC cxx_std_11) +target_compile_features(A PUBLIC cxx_std_14) ```` then another target ```` add_library(B b.cpp) -target_compile_features(B PUBLIC cxx_std_14) +target_compile_features(B PUBLIC cxx_std_17) target_link_libraries(A B) ```` I have requested two different features. -CMake understands the requests and knows that `cxx_std_11` is a subset of `cxx_std_14`. -CMake then picks C++14 for library `B`. +CMake understands the requests and knows that `cxx_std_14` is a subset of `cxx_std_17`. 
+CMake then picks C++17 for library `B`. CMake would not have been able to do feature resolution if we had directly done: ```` -target_compile_options(A PUBLIC -std=c++11) +target_compile_options(A PUBLIC -std=c++14) ```` ### Adding Kokkos Options diff --git a/lib/kokkos/cmake/deps/CUDA.cmake b/lib/kokkos/cmake/deps/CUDA.cmake index 4876bca259..beaf4e6d6c 100644 --- a/lib/kokkos/cmake/deps/CUDA.cmake +++ b/lib/kokkos/cmake/deps/CUDA.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/deps/CUSPARSE.cmake b/lib/kokkos/cmake/deps/CUSPARSE.cmake index b2420d1168..073c40d814 100644 --- a/lib/kokkos/cmake/deps/CUSPARSE.cmake +++ b/lib/kokkos/cmake/deps/CUSPARSE.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 
3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. 
Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/deps/HWLOC.cmake b/lib/kokkos/cmake/deps/HWLOC.cmake index ed89c8c1e5..f8402db00a 100644 --- a/lib/kokkos/cmake/deps/HWLOC.cmake +++ b/lib/kokkos/cmake/deps/HWLOC.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. 
Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. 
-# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/deps/Pthread.cmake b/lib/kokkos/cmake/deps/Pthread.cmake index 5f835fc300..639e4ef697 100644 --- a/lib/kokkos/cmake/deps/Pthread.cmake +++ b/lib/kokkos/cmake/deps/Pthread.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. 
# -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/fake_tribits.cmake b/lib/kokkos/cmake/fake_tribits.cmake index db7680f56a..2e82a46235 100644 --- a/lib/kokkos/cmake/fake_tribits.cmake +++ b/lib/kokkos/cmake/fake_tribits.cmake @@ -38,12 +38,6 @@ MACRO(GLOBAL_SET VARNAME) SET(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) ENDMACRO() -FUNCTION(VERIFY_EMPTY CONTEXT) -if(${ARGN}) -MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") -endif() -ENDFUNCTION() - MACRO(PREPEND_GLOBAL_SET VARNAME) ASSERT_DEFINED(${VARNAME}) GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) @@ -89,7 +83,7 @@ FUNCTION(KOKKOS_ADD_TEST) CMAKE_PARSE_ARGUMENTS(TEST "" "EXE;NAME;TOOL" - "" + "ARGS" ${ARGN}) IF(TEST_EXE) SET(EXE_ROOT ${TEST_EXE}) @@ -102,6 +96,7 @@ FUNCTION(KOKKOS_ADD_TEST) NAME ${TEST_NAME} COMM serial mpi NUM_MPI_PROCS 1 + ARGS ${TEST_ARGS} ${TEST_UNPARSED_ARGUMENTS} ADDED_TESTS_NAMES_OUT ALL_TESTS_ADDED ) @@ -110,18 +105,25 @@ FUNCTION(KOKKOS_ADD_TEST) SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - foreach(TEST_ADDED ${ALL_TESTS_ADDED}) - set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - endforeach() + # The function TRIBITS_ADD_TEST() has a CATEGORIES argument that defaults + # to BASIC. If a project elects to only enable tests marked as PERFORMANCE, + # the test won't actually be added and attempting to set a property on it below + # will yield an error. 
+ if(TARGET ${EXE}) + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + foreach(TEST_ADDED ${ALL_TESTS_ADDED}) + set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") + endforeach() + endif() endif() else() CMAKE_PARSE_ARGUMENTS(TEST "WILL_FAIL" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" - "CATEGORIES;CMD_ARGS" + "CATEGORIES;ARGS" ${ARGN}) + SET(TESTS_ADDED) # To match Tribits, we should always be receiving # the root names of exes/libs IF(TEST_EXE) @@ -133,24 +135,46 @@ FUNCTION(KOKKOS_ADD_TEST) # These should be the full target name SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_CMD_ARGS}) + IF (TEST_ARGS) + SET(TEST_NUMBER 0) + FOREACH (ARG_STR ${TEST_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} + COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${ARG_STR_LIST}) + ELSE() + ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} COMMAND ${EXE} ${ARG_STR_LIST}) + ENDIF() + LIST(APPEND TESTS_ADDED "${TEST_NAME}${TEST_NUMBER}") + MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + ENDFOREACH() ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_CMD_ARGS}) + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} + COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) + ELSE() + ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) + ENDIF() + LIST(APPEND TESTS_ADDED "${TEST_NAME}") ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION 
${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - endif() + + FOREACH(TEST_NAME ${TESTS_ADDED}) + IF(TEST_WILL_FAIL) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + ENDIF() + IF(TEST_FAIL_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_PASS_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + ENDIF() + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") + endif() + ENDFOREACH() VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) endif() ENDFUNCTION() diff --git a/lib/kokkos/cmake/intel.cmake b/lib/kokkos/cmake/intel.cmake index f36f01d8ca..7e6ee3358c 100644 --- a/lib/kokkos/cmake/intel.cmake +++ b/lib/kokkos/cmake/intel.cmake @@ -3,7 +3,7 @@ FUNCTION(kokkos_set_intel_flags full_standard int_standard) STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) STRING(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from - # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified. + # /Modules/Compiler/Intel-CXX.cmake from CMake 3.18.1 and then modified. 
IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) SET(_std -Qstd) SET(_ext c++) @@ -11,20 +11,8 @@ FUNCTION(kokkos_set_intel_flags full_standard int_standard) SET(_std -std) SET(_ext gnu++) ENDIF() - - IF(NOT KOKKOS_CXX_STANDARD STREQUAL 11 AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.2) - #There is no gnu++14 value supported; figure out what to do. - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "${_std}=c++${INT_LC_STANDARD}" PARENT_SCOPE) - ELSEIF(KOKKOS_CXX_STANDARD STREQUAL 11 AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0) - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=${_ext}c++11" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++11" PARENT_SCOPE) - ENDIF() - ELSE() - MESSAGE(FATAL_ERROR "Intel compiler version too low - need 13.0 for C++11 and 15.0 for C++14") - ENDIF() - + SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) ENDFUNCTION() + diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index d7d32f661c..53aaf7dccf 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -35,6 +35,7 @@ KOKKOS_ARCH_OPTION(ARMV80 HOST "ARMv8.0 Compatible CPU") KOKKOS_ARCH_OPTION(ARMV81 HOST "ARMv8.1 Compatible CPU") KOKKOS_ARCH_OPTION(ARMV8_THUNDERX HOST "ARMv8 Cavium ThunderX CPU") KOKKOS_ARCH_OPTION(ARMV8_THUNDERX2 HOST "ARMv8 Cavium ThunderX2 CPU") +KOKKOS_ARCH_OPTION(A64FX HOST "ARMv8.2 with SVE Suport") KOKKOS_ARCH_OPTION(WSM HOST "Intel Westmere CPU") KOKKOS_ARCH_OPTION(SNB HOST "Intel Sandy/Ivy Bridge CPUs") KOKKOS_ARCH_OPTION(HSW HOST "Intel Haswell CPUs") @@ -63,6 +64,7 @@ KOKKOS_ARCH_OPTION(ZEN HOST "AMD Zen architecture") KOKKOS_ARCH_OPTION(ZEN2 HOST "AMD Zen2 architecture") KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900") KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD 
GPU MI50/MI60 GFX906") +KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU") KOKKOS_ARCH_OPTION(INTEL_GEN GPU "Intel GPUs Gen9+") @@ -72,6 +74,11 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS) "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic" "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") + # OpenMPTarget compilers give erroneous warnings about sign comparison in loops + IF(KOKKOS_ENABLE_OPENMPTARGET) + LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare") + ENDIF() + SET(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" ${COMMON_WARNINGS}) @@ -106,6 +113,12 @@ ENDIF() IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) SET(CUDA_ARCH_FLAG "--cuda-gpu-arch") GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda) + # Kokkos_CUDA_DIR has priority over CUDAToolkit_BIN_DIR + IF (Kokkos_CUDA_DIR) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) + ELSEIF(CUDAToolkit_BIN_DIR) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) + ENDIF() IF (KOKKOS_ENABLE_CUDA) SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE) ENDIF() @@ -167,6 +180,12 @@ IF (KOKKOS_ARCH_ARMV8_THUNDERX2) ) ENDIF() +IF (KOKKOS_ARCH_A64FX) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -march=armv8.2-a+sve + ) +ENDIF() + IF (KOKKOS_ARCH_ZEN) COMPILER_SPECIFIC_FLAGS( Intel -mavx2 @@ -327,6 +346,16 @@ IF (Kokkos_ENABLE_HIP) ENDIF() +IF (Kokkos_ENABLE_SYCL) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl + ) + COMPILER_SPECIFIC_OPTIONS( + DEFAULT -fsycl-unnamed-lambda + ) +ENDIF() + + SET(CUDA_ARCH_ALREADY_SPECIFIED "") FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) IF(KOKKOS_ARCH_${ARCH}) @@ -392,6 +421,7 @@ ENDFUNCTION() #to the corresponding flag name if ON CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25 CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60 +CHECK_AMDGPU_ARCH(VEGA908 gfx908) IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED) MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. 
" @@ -477,35 +507,53 @@ ENDIF() #CMake verbose is kind of pointless #Let's just always print things -MESSAGE(STATUS "Execution Spaces:") +MESSAGE(STATUS "Built-in Execution Spaces:") -FOREACH (_BACKEND CUDA OPENMPTARGET HIP) - IF(KOKKOS_ENABLE_${_BACKEND}) +FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL) + STRING(TOUPPER ${_BACKEND} UC_BACKEND) + IF(KOKKOS_ENABLE_${UC_BACKEND}) IF(_DEVICE_PARALLEL) MESSAGE(FATAL_ERROR "Multiple device parallel execution spaces are not allowed! " "Trying to enable execution space ${_BACKEND}, " "but execution space ${_DEVICE_PARALLEL} is already enabled. " "Remove the CMakeCache.txt file and re-configure.") ENDIF() - SET(_DEVICE_PARALLEL ${_BACKEND}) + IF (${_BACKEND} STREQUAL "Cuda") + IF(KOKKOS_ENABLE_CUDA_UVM) + SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}UVMSpace") + ELSE() + SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") + ENDIF() + SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + ELSE() + SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::Experimental::${_BACKEND}Space") + SET(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") + ENDIF() ENDIF() ENDFOREACH() IF(NOT _DEVICE_PARALLEL) - SET(_DEVICE_PARALLEL "NONE") + SET(_DEVICE_PARALLEL "NoTypeDefined") + SET(_DEFAULT_DEVICE_MEMSPACE "NoTypeDefined") ENDIF() MESSAGE(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") -UNSET(_DEVICE_PARALLEL) +IF(KOKKOS_ENABLE_PTHREAD) + SET(KOKKOS_ENABLE_THREADS ON) +ENDIF() - -FOREACH (_BACKEND OPENMP PTHREAD HPX) - IF(KOKKOS_ENABLE_${_BACKEND}) +FOREACH (_BACKEND OpenMP Threads HPX) + STRING(TOUPPER ${_BACKEND} UC_BACKEND) + IF(KOKKOS_ENABLE_${UC_BACKEND}) IF(_HOST_PARALLEL) MESSAGE(FATAL_ERROR "Multiple host parallel execution spaces are not allowed! " "Trying to enable execution space ${_BACKEND}, " "but execution space ${_HOST_PARALLEL} is already enabled. 
" "Remove the CMakeCache.txt file and re-configure.") ENDIF() - SET(_HOST_PARALLEL ${_BACKEND}) + IF (${_BACKEND} STREQUAL "HPX") + SET(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") + ELSE() + SET(_HOST_PARALLEL "Kokkos::${_BACKEND}") + ENDIF() ENDIF() ENDFOREACH() @@ -515,14 +563,11 @@ IF(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) "and Kokkos_ENABLE_SERIAL=OFF.") ENDIF() -IF(NOT _HOST_PARALLEL) - SET(_HOST_PARALLEL "NONE") -ENDIF() +IF(_HOST_PARALLEL) MESSAGE(STATUS " Host Parallel: ${_HOST_PARALLEL}") -UNSET(_HOST_PARALLEL) - -IF(KOKKOS_ENABLE_PTHREAD) - SET(KOKKOS_ENABLE_THREADS ON) +ELSE() + SET(_HOST_PARALLEL "NoTypeDefined") + MESSAGE(STATUS " Host Parallel: NoTypeDefined") ENDIF() IF(KOKKOS_ENABLE_SERIAL) diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index 4a77a94e07..e6600161f9 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -4,24 +4,42 @@ SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) -IF(Kokkos_ENABLE_CUDA) +MACRO(kokkos_internal_have_compiler_nvcc) # Check if the compiler is nvcc (which really means nvcc_wrapper). 
- EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version + EXECUTE_PROCESS(COMMAND ${ARGN} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) - - - STRING(REGEX REPLACE "^ +" "" - INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) SET(INTERNAL_HAVE_COMPILER_NVCC true) ELSE() SET(INTERNAL_HAVE_COMPILER_NVCC false) ENDIF() +ENDMACRO() + +IF(Kokkos_ENABLE_CUDA) + # find kokkos_launch_compiler + FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + + # check if compiler was set to nvcc_wrapper + kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) + # if launcher was found and nvcc_wrapper was not specified as + # compiler, set to use launcher. 
Will ensure CMAKE_CXX_COMPILER + # is replaced by nvcc_wrapper + IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + # the first argument to launcher is always the C++ compiler defined by cmake + # if the second argument matches the C++ compiler, it forwards the rest of the + # args to nvcc_wrapper + kokkos_internal_have_compiler_nvcc( + ${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) + SET(INTERNAL_USE_COMPILER_LAUNCHER true) + ENDIF() ENDIF() IF(INTERNAL_HAVE_COMPILER_NVCC) @@ -36,6 +54,35 @@ IF(INTERNAL_HAVE_COMPILER_NVCC) STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") + IF(INTERNAL_USE_COMPILER_LAUNCHER) + IF(Kokkos_LAUNCH_COMPILER_INFO) + GET_FILENAME_COMPONENT(BASE_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME) + # does not have STATUS intentionally + MESSAGE("") + MESSAGE("Kokkos_LAUNCH_COMPILER_INFO (${Kokkos_COMPILE_LAUNCHER}):") + MESSAGE(" - Kokkos + CUDA backend requires the C++ files to be compiled as CUDA code.") + MESSAGE(" - kokkos_launch_compiler permits CMAKE_CXX_COMPILER to be set to a traditional C++ compiler when Kokkos_ENABLE_CUDA=ON") + MESSAGE(" by prefixing all the compile and link commands with the path to the script + CMAKE_CXX_COMPILER (${CMAKE_CXX_COMPILER}).") + MESSAGE(" - If any of the compile or link commands have CMAKE_CXX_COMPILER as the first argument, it replaces CMAKE_CXX_COMPILER with nvcc_wrapper.") + MESSAGE(" - If the compile or link command is not CMAKE_CXX_COMPILER, it just executes the command.") + MESSAGE(" - If using ccache, set CMAKE_CXX_COMPILER to nvcc_wrapper explicitly.") + MESSAGE(" - kokkos_compiler_launcher is available to downstream projects as well.") + MESSAGE(" - If CMAKE_CXX_COMPILER=nvcc_wrapper, all legacy behavior will be 
preserved during 'find_package(Kokkos)'") + MESSAGE(" - If CMAKE_CXX_COMPILER is not nvcc_wrapper, 'find_package(Kokkos)' will apply 'kokkos_compilation(GLOBAL)' unless separable compilation is enabled") + MESSAGE(" - This can be disabled via '-DKokkos_LAUNCH_COMPILER=OFF'") + MESSAGE(" - Use 'find_package(Kokkos COMPONENTS separable_compilation)' to enable separable compilation") + MESSAGE(" - Separable compilation allows you to control the scope of where the compiler transformation behavior (${BASE_COMPILER_NAME} -> nvcc_wrapper) is applied") + MESSAGE(" - The compiler transformation can be applied on a per-project, per-directory, per-target, and/or per-source-file basis") + MESSAGE(" - 'kokkos_compilation(PROJECT)' will apply the compiler transformation to all targets in a project/subproject") + MESSAGE(" - 'kokkos_compilation(TARGET [...])' will apply the compiler transformation to the specified target(s)") + MESSAGE(" - 'kokkos_compilation(SOURCE [...])' will apply the compiler transformation to the specified source file(s)") + MESSAGE(" - 'kokkos_compilation(DIRECTORY [...])' will apply the compiler transformation to the specified directories") + MESSAGE("") + ELSE() + MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled... 
Set Kokkos_LAUNCH_COMPILER_INFO=ON for more info.") + ENDIF() + kokkos_compilation(GLOBAL) + ENDIF() ENDIF() IF(Kokkos_ENABLE_HIP) @@ -90,38 +137,49 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) ENDIF() ENDIF() +IF(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) + # SET Fujitsus compiler version which is not detected by CMake + EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version + OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + + STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" + TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) +ENDIF() + # Enforce the minimum compilers supported by Kokkos. SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. Required compiler versions:") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 3.5.2 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 4.8.4 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 15.0.2 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 9.0.69 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 3.5.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.1 or higher\n") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 4.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 5.3.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 17.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 9.2.88 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 3.8.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.4 or higher\n") IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.5.2) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - 
IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.8.4) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 5.3.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 15.0.2) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.0.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 9.0.69) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 9.2.88) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.5.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.1) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.4) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_corner_cases.cmake b/lib/kokkos/cmake/kokkos_corner_cases.cmake index a792590bac..3962c4b16e 100644 --- a/lib/kokkos/cmake/kokkos_corner_cases.cmake +++ b/lib/kokkos/cmake/kokkos_corner_cases.cmake @@ -1,4 +1,4 @@ -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_ENABLE_OPENMP AND NOT KOKKOS_CLANG_IS_CRAY AND NOT "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") +IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_ENABLE_OPENMP AND NOT KOKKOS_CLANG_IS_CRAY AND NOT KOKKOS_COMPILER_CLANG_MSVC) # The clang "version" doesn't actually tell you what runtimes and tools # were built into Clang. We should therefore make sure that libomp # was actually built into Clang. 
Otherwise the user will get nonsensical diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index 7d1c375ae6..41ee10a8a0 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -25,6 +25,18 @@ IF (KOKKOS_ENABLE_PTHREAD) SET(KOKKOS_ENABLE_THREADS ON) ENDIF() +# detect clang++ / cl / clang-cl clashes +IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + # this specific test requires CMake >= 3.15 + IF ("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") + # use pure clang++ instead of clang-cl + SET(KOKKOS_COMPILER_CLANG_MSVC OFF) + ELSE() + # it defaults to clang-cl + SET(KOKKOS_COMPILER_CLANG_MSVC ON) + ENDIF() +ENDIF() + IF(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) SET(OMP_DEFAULT ON) ELSE() @@ -39,13 +51,16 @@ IF(KOKKOS_ENABLE_OPENMP) IF(KOKKOS_CLANG_IS_INTEL) SET(ClangOpenMPFlag -fiopenmp) ENDIF() - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - #expression /openmp yields error, so add a specific Clang flag - COMPILER_SPECIFIC_OPTIONS(Clang /clang:-fopenmp) - #link omp library from LLVM lib dir + IF(KOKKOS_COMPILER_CLANG_MSVC) + #for clang-cl expression /openmp yields an error, so directly add the specific Clang flag + SET(ClangOpenMPFlag /clang:-fopenmp=libomp) + ENDIF() + IF(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL Clang) + #link omp library from LLVM lib dir, no matter if it is clang-cl or clang++ get_filename_component(LLVM_BIN_DIR ${CMAKE_CXX_COMPILER_AR} DIRECTORY) COMPILER_SPECIFIC_LIBS(Clang "${LLVM_BIN_DIR}/../lib/libomp.lib") - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + ENDIF() + IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Clang -Xcompiler ${ClangOpenMPFlag} @@ -71,7 +86,7 @@ ENDIF() KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") IF 
(KOKKOS_ENABLE_OPENMPTARGET) -SET(ClangOpenMPFlag -fopenmp=libomp) + SET(ClangOpenMPFlag -fopenmp=libomp) IF(KOKKOS_CLANG_IS_CRAY) SET(ClangOpenMPFlag -fopenmp) ENDIF() @@ -105,9 +120,11 @@ KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend" IF (KOKKOS_ENABLE_CUDA) GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") - IF(WIN32) + IF(WIN32 AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS -x cu) ENDIF() +## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros + LIST(APPEND DEVICE_SETUP_LIST Cuda) ENDIF() # We want this to default to OFF for cache reasons, but if no @@ -128,3 +145,10 @@ KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial back KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") + +## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros +IF (KOKKOS_ENABLE_HIP) + LIST(APPEND DEVICE_SETUP_LIST HIP) +ENDIF() + +KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index 7ce3ed501e..2b17d648b4 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -154,13 +154,13 @@ MACRO(kokkos_export_imported_tpl NAME) KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) IF(TPL_LIBRARY) - KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION ${TPL_LIBRARY}") + KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") ENDIF() ENDIF() GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) IF(TPL_INCLUDES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES ${TPL_INCLUDES}") + KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") ENDIF() 
GET_TARGET_PROPERTY(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) @@ -178,7 +178,7 @@ MACRO(kokkos_export_imported_tpl NAME) GET_TARGET_PROPERTY(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) IF(TPL_LINK_LIBRARIES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES ${TPL_LINK_LIBRARIES}") + KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") ENDIF() KOKKOS_APPEND_CONFIG_LINE(")") KOKKOS_APPEND_CONFIG_LINE("ENDIF()") @@ -770,7 +770,7 @@ FUNCTION(kokkos_link_tpl TARGET) ENDFUNCTION() FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIP) + SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIP Fujitsu) CMAKE_PARSE_ARGUMENTS( PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" @@ -844,7 +844,6 @@ ENDFUNCTION(COMPILER_SPECIFIC_DEFS) FUNCTION(COMPILER_SPECIFIC_LIBS) COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES) ENDFUNCTION(COMPILER_SPECIFIC_LIBS) - # Given a list of the form # key1;value1;key2;value2,... 
# Create a list of all keys in a variable named ${KEY_LIST_NAME} @@ -877,3 +876,114 @@ FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS) ENDIF() ENDFOREACH() ENDFUNCTION() + +# this function checks whether the current CXX compiler supports building CUDA +FUNCTION(kokkos_cxx_compiler_cuda_test _VAR) + # don't run this test every time + IF(DEFINED ${_VAR}) + RETURN() + ENDIF() + + FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp +" +#include +#include + +__global__ +void kernel(int sz, double* data) +{ + auto _beg = blockIdx.x * blockDim.x + threadIdx.x; + for(int i = _beg; i < sz; ++i) + data[i] += static_cast(i); +} + +int main() +{ + double* data = nullptr; + int blocks = 64; + int grids = 64; + auto ret = cudaMalloc(&data, blocks * grids * sizeof(double)); + if(ret != cudaSuccess) + return EXIT_FAILURE; + kernel<<>>(blocks * grids, data); + cudaDeviceSynchronize(); + return EXIT_SUCCESS; +} +") + + TRY_COMPILE(_RET + ${PROJECT_BINARY_DIR}/compile_tests + SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) + + SET(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") +ENDFUNCTION() + +# this function is provided to easily select which files use nvcc_wrapper: +# +# GLOBAL --> all files +# TARGET --> all files in a target +# SOURCE --> specific source files +# DIRECTORY --> all files in directory +# PROJECT --> all files/targets in a project/subproject +# +FUNCTION(kokkos_compilation) + # check whether the compiler already supports building CUDA + KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) + # if CUDA compile test has already been performed, just return + IF(Kokkos_CXX_COMPILER_COMPILES_CUDA) + RETURN() + ENDIF() + + CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + + # find kokkos_launch_compiler + FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + + IF(NOT 
Kokkos_COMPILE_LAUNCHER) + MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") + ENDIF() + + IF(COMP_GLOBAL) + # if global, don't bother setting others + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + ELSE() + FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) + IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + UNSET(COMP_${_TYPE}) + ENDIF() + # set the properties if defined + IF(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + ENDIF() + ENDFOREACH() + ENDIF() +ENDFUNCTION() +## KOKKOS_CONFIG_HEADER - parse the data list which is a list of backend names +## and create output config header file...used for +## creating dynamic include files based on enabled backends +## +## SRC_FILE is input file +## TARGET_FILE output file +## HEADER_GUARD TEXT used with include header guard +## HEADER_PREFIX prefix used with include (i.e. 
fwd, decl, setup) +## DATA_LIST list of backends to include in generated file +FUNCTION(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) + SET(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") + CONFIGURE_FILE(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) + FOREACH( BACKEND_NAME ${DATA_LIST} ) + SET(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> +\@INCLUDE_NEXT_FILE\@") + CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) + ENDFOREACH() + SET(INCLUDE_NEXT_FILE "" ) + CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) +ENDFUNCTION() diff --git a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake index cf14948f43..015873ebd6 100644 --- a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -1,19 +1,17 @@ # From CMake 3.10 documentation #This can run at any time -KOKKOS_OPTION(CXX_STANDARD "" STRING "The C++ standard for Kokkos to use: 11, 14, 17, or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 11") +KOKKOS_OPTION(CXX_STANDARD "" STRING "The C++ standard for Kokkos to use: 14, 17, or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 14") # Set CXX standard flags -SET(KOKKOS_ENABLE_CXX11 OFF) SET(KOKKOS_ENABLE_CXX14 OFF) SET(KOKKOS_ENABLE_CXX17 OFF) SET(KOKKOS_ENABLE_CXX20 OFF) IF (KOKKOS_CXX_STANDARD) IF (${KOKKOS_CXX_STANDARD} STREQUAL "c++98") - MESSAGE(FATAL_ERROR "Kokkos no longer supports C++98 - minimum C++11") + MESSAGE(FATAL_ERROR "Kokkos no longer supports C++98 - minimum C++14") ELSEIF (${KOKKOS_CXX_STANDARD} STREQUAL "c++11") - MESSAGE(WARNING "Deprecated Kokkos C++ standard set as 'c++11'. 
Use '11' instead.") - SET(KOKKOS_CXX_STANDARD "11") + MESSAGE(FATAL_ERROR "Kokkos no longer supports C++11 - minimum C++14") ELSEIF(${KOKKOS_CXX_STANDARD} STREQUAL "c++14") MESSAGE(WARNING "Deprecated Kokkos C++ standard set as 'c++14'. Use '14' instead.") SET(KOKKOS_CXX_STANDARD "14") @@ -33,8 +31,8 @@ IF (KOKKOS_CXX_STANDARD) ENDIF() IF (NOT KOKKOS_CXX_STANDARD AND NOT CMAKE_CXX_STANDARD) - MESSAGE(STATUS "Setting default Kokkos CXX standard to 11") - SET(KOKKOS_CXX_STANDARD "11") + MESSAGE(STATUS "Setting default Kokkos CXX standard to 14") + SET(KOKKOS_CXX_STANDARD "14") ELSEIF(NOT KOKKOS_CXX_STANDARD) MESSAGE(STATUS "Setting default Kokkos CXX standard to ${CMAKE_CXX_STANDARD}") SET(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index cb857bc11e..1d7da922eb 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -29,7 +29,7 @@ FUNCTION(kokkos_set_cxx_standard_feature standard) ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang)) - SET(SUPPORTED_NVCC_FLAGS "-std=c++11;-std=c++14;-std=c++17") + SET(SUPPORTED_NVCC_FLAGS "-std=c++14;-std=c++17") IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. 
Using a more recent host compiler or a more recent CMake version might help.") ENDIF() @@ -42,13 +42,16 @@ FUNCTION(kokkos_set_cxx_standard_feature standard) ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") + ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu")) + MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") + GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") ELSE() #nope, we can't do anything here - MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferrably including your CMake command.") + MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. 
Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command.") GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") ENDIF() - IF(NOT WIN32) + IF((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported") ENDIF() @@ -65,11 +68,7 @@ IF (KOKKOS_CXX_STANDARD AND CMAKE_CXX_STANDARD) ENDIF() -IF (KOKKOS_CXX_STANDARD STREQUAL "11" ) - kokkos_set_cxx_standard_feature(11) - SET(KOKKOS_ENABLE_CXX11 ON) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "11") -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "14") +IF(KOKKOS_CXX_STANDARD STREQUAL "14") kokkos_set_cxx_standard_feature(14) SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Y") SET(KOKKOS_ENABLE_CXX14 ON) @@ -81,21 +80,21 @@ ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "20") kokkos_set_cxx_standard_feature(20) SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") SET(KOKKOS_ENABLE_CXX20 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "98") - MESSAGE(FATAL_ERROR "Kokkos requires C++11 or newer!") +ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "98" OR KOKKOS_CXX_STANDARD STREQUAL "11") + MESSAGE(FATAL_ERROR "Kokkos requires C++14 or newer!") ELSE() - MESSAGE(FATAL_ERROR "Unknown C++ standard ${KOKKOS_CXX_STANDARD} - must be 11, 14, 17, or 20") + MESSAGE(FATAL_ERROR "Unknown C++ standard ${KOKKOS_CXX_STANDARD} - must be 14, 17, or 20") ENDIF() # Enforce that extensions are turned off for nvcc_wrapper. # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's -# flags for turning on C++11. Since for compiler ID and versioning purposes +# flags for turning on C++14. Since for compiler ID and versioning purposes # CMake recognizes the host compiler when calling nvcc_wrapper, this just -# works. 
Both NVCC and nvcc_wrapper only recognize '-std=c++11' which means +# works. Both NVCC and nvcc_wrapper only recognize '-std=c++14' which means # that we can only use host compilers for CUDA builds that use those flags. -# It also means that extensions (gnu++11) can't be turned on for CUDA builds. +# It also means that extensions (gnu++14) can't be turned on for CUDA builds. IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) @@ -117,7 +116,7 @@ IF(KOKKOS_ENABLE_CUDA) MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") ENDIF() ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") + MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index 9d9be87834..b58d3696ea 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -76,3 +76,7 @@ STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable UNSET(KOKKOS_TPL_EXPORTS CACHE) SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) +IF (KOKKOS_ENABLE_MEMKIND) + SET(KOKKOS_ENABLE_HBWSPACE) + LIST(APPEND KOKKOS_MEMSPACE_LIST HBWSpace) +ENDIF() diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index 4bd186dac7..059fb192f0 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -6,6 +6,12 @@ INCLUDE(GNUInstallDirs) MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") +FUNCTION(VERIFY_EMPTY CONTEXT) + if(${ARGN}) + MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. 
Unhandled arguments in ${CONTEXT}:\n${ARGN}") + endif() +ENDFUNCTION() + #Leave this here for now - but only do for tribits #This breaks the standalone CMake IF (KOKKOS_HAS_TRILINOS) @@ -135,28 +141,37 @@ FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) ENDFUNCTION() FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) +CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "SOURCES;CATEGORIES;ARGS" + ${ARGN}) +VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) + IF (KOKKOS_HAS_TRILINOS) + IF(DEFINED PARSE_ARGS) + STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") + ENDIF() TRIBITS_ADD_EXECUTABLE_AND_TEST( ${ROOT_NAME} + SOURCES ${PARSE_SOURCES} TESTONLYLIBS kokkos_gtest - ${ARGN} NUM_MPI_PROCS 1 COMM serial mpi + ARGS ${PARSE_ARGS} + CATEGORIES ${PARSE_CATEGORIES} + SOURCES ${PARSE_SOURCES} FAIL_REGULAR_EXPRESSION " FAILED " + ARGS ${PARSE_ARGS} ) ELSE() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES" - ${ARGN}) - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} SOURCES ${PARSE_SOURCES} ) KOKKOS_ADD_TEST(NAME ${ROOT_NAME} EXE ${ROOT_NAME} FAIL_REGULAR_EXPRESSION " FAILED " + ARGS ${PARSE_ARGS} ) ENDIF() ENDFUNCTION() @@ -219,6 +234,7 @@ MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) ${PARSE_UNPARSED_ARGUMENTS} TESTONLYLIBS kokkos_gtest ) + SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) ENDMACRO() MACRO(KOKKOS_PACKAGE_POSTPROCESS) @@ -227,6 +243,79 @@ MACRO(KOKKOS_PACKAGE_POSTPROCESS) endif() ENDMACRO() +## KOKKOS_CONFIGURE_CORE Configure/Generate header files for core content based +## on enabled backends. 
+## KOKKOS_FWD is the forward declare set +## KOKKOS_SETUP is included in Kokkos_Macros.hpp and include prefix includes/defines +## KOKKOS_DECLARE is the declaration set +## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp +MACRO(KOKKOS_CONFIGURE_CORE) + SET(FWD_BACKEND_LIST) + FOREACH(MEMSPACE ${KOKKOS_MEMSPACE_LIST}) + LIST(APPEND FWD_BACKEND_LIST ${MEMSPACE}) + ENDFOREACH() + FOREACH(BACKEND_ ${KOKKOS_ENABLED_DEVICES}) + IF( ${BACKEND_} STREQUAL "PTHREAD") + LIST(APPEND FWD_BACKEND_LIST THREADS) + ELSE() + LIST(APPEND FWD_BACKEND_LIST ${BACKEND_}) + ENDIF() + ENDFOREACH() + MESSAGE(STATUS "Kokkos Devices: ${KOKKOS_ENABLED_DEVICES}, Kokkos Backends: ${FWD_BACKEND_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${FWD_BACKEND_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${FWD_BACKEND_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_PostInclude.hpp "KOKKOS_POST_INCLUDE" "Kokkos_Post_Include" "${KOKKOS_BACKEND_POST_INCLUDE_LIST}") + SET(_DEFAULT_HOST_MEMSPACE "::Kokkos::HostSpace") + KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space") + KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space") + KOKKOS_OPTION(DEFAULT_DEVICE_EXECUTION_SPACE "" STRING "Override default device execution space") + KOKKOS_OPTION(DEFAULT_HOST_PARALLEL_EXECUTION_SPACE "" STRING "Override default host parallel execution space") + IF (NOT Kokkos_DEFAULT_DEVICE_EXECUTION_SPACE STREQUAL "") + SET(_DEVICE_PARALLEL ${Kokkos_DEFAULT_DEVICE_EXECUTION_SPACE}) + MESSAGE(STATUS "Override default device execution space: ${_DEVICE_PARALLEL}") + 
SET(KOKKOS_DEVICE_SPACE_ACTIVE ON) + ELSE() + IF (_DEVICE_PARALLEL STREQUAL "NoTypeDefined") + SET(KOKKOS_DEVICE_SPACE_ACTIVE OFF) + ELSE() + SET(KOKKOS_DEVICE_SPACE_ACTIVE ON) + ENDIF() + ENDIF() + IF (NOT Kokkos_DEFAULT_HOST_PARALLEL_EXECUTION_SPACE STREQUAL "") + SET(_HOST_PARALLEL ${Kokkos_DEFAULT_HOST_PARALLEL_EXECUTION_SPACE}) + MESSAGE(STATUS "Override default host parallel execution space: ${_HOST_PARALLEL}") + SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE ON) + ELSE() + IF (_HOST_PARALLEL STREQUAL "NoTypeDefined") + SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE OFF) + ELSE() + SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE ON) + ENDIF() + ENDIF() + #We are ready to configure the header + CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) +ENDMACRO() + +## KOKKOS_INSTALL_ADDITIONAL_FILES - instruct cmake to install files in target destination. +## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, +## as well as other files provided through plugins. +MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) + # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to nvcc_wrapper + INSTALL(PROGRAMS + "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" + "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" + "${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler" + DESTINATION ${CMAKE_INSTALL_BINDIR}) + INSTALL(FILES + "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" + "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" + "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" + "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" + "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +ENDMACRO() + FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) CMAKE_PARSE_ARGUMENTS(PARSE "PLAIN_STYLE" diff --git a/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake b/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake index a59868b73b..1ae4f19dd4 100644 --- 
a/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake index a4c55e1d7b..467635083f 100644 --- a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. 
Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. 
Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake index 4dc1a87e18..c78630b7f1 100644 --- a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. 
+# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. 
-# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt index 1011cb8fd1..43c66c24fd 100644 --- a/lib/kokkos/containers/performance_tests/CMakeLists.txt +++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt @@ -3,44 +3,26 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -IF(Kokkos_ENABLE_CUDA) - SET(SOURCES - TestMain.cpp - TestCuda.cpp - ) +foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) + # Because there is always an exception to the rule + if(Tag STREQUAL "Threads") + set(DEVICE "PTHREAD") + else() + string(TOUPPER ${Tag} DEVICE) + endif() + string(TOLOWER ${Tag} dir) - KOKKOS_ADD_EXECUTABLE_AND_TEST( PerformanceTest_Cuda - SOURCES ${SOURCES} - ) -ENDIF() + if(Kokkos_ENABLE_${DEVICE}) + message(STATUS "Sources Test${Tag}.cpp") -IF(Kokkos_ENABLE_PTHREAD) - SET(SOURCES - TestMain.cpp - TestThreads.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( PerformanceTest_Threads - SOURCES ${SOURCES} - ) -ENDIF() - -IF(Kokkos_ENABLE_OPENMP) - SET(SOURCES - TestMain.cpp - TestOpenMP.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( PerformanceTest_OpenMP - SOURCES ${SOURCES} - ) -ENDIF() - -IF(Kokkos_ENABLE_HPX) - SET(SOURCES - TestMain.cpp - TestHPX.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( 
PerformanceTest_HPX - SOURCES ${SOURCES} - ) -ENDIF() + set(SOURCES + TestMain.cpp + Test${Tag}.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_${Tag} + SOURCES ${SOURCES} + ) + endif() +endforeach() diff --git a/lib/kokkos/containers/performance_tests/Makefile b/lib/kokkos/containers/performance_tests/Makefile index 8ef1dd9938..cbb8490798 100644 --- a/lib/kokkos/containers/performance_tests/Makefile +++ b/lib/kokkos/containers/performance_tests/Makefile @@ -58,8 +58,8 @@ endif KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda -KokkosContainers_PerformanceTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_ROCm +KokkosContainers_PerformanceTest_HIP: $(OBJ_HIP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HIP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_HIP KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads @@ -73,8 +73,8 @@ KokkosContainers_PerformanceTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosContainers_PerformanceTest_Cuda ./KokkosContainers_PerformanceTest_Cuda -test-rocm: KokkosContainers_PerformanceTest_ROCm - ./KokkosContainers_PerformanceTest_ROCm +test-hip: KokkosContainers_PerformanceTest_HIP + ./KokkosContainers_PerformanceTest_HIP test-threads: KokkosContainers_PerformanceTest_Threads ./KokkosContainers_PerformanceTest_Threads diff --git a/lib/kokkos/containers/performance_tests/TestCuda.cpp b/lib/kokkos/containers/performance_tests/TestCuda.cpp index 697a006c3c..8874590e2a 100644 --- a/lib/kokkos/containers/performance_tests/TestCuda.cpp +++ 
b/lib/kokkos/containers/performance_tests/TestCuda.cpp @@ -43,7 +43,6 @@ */ #include -#if defined(KOKKOS_ENABLE_CUDA) #include #include @@ -66,23 +65,13 @@ namespace Performance { -class cuda : public ::testing::Test { - protected: - static void SetUpTestCase() { - std::cout << std::setprecision(5) << std::scientific; - Kokkos::InitArguments args(-1, -1, 0); - Kokkos::initialize(args); - } - static void TearDownTestCase() { Kokkos::finalize(); } -}; - -TEST_F(cuda, dynrankview_perf) { +TEST(TEST_CATEGORY, dynrankview_perf) { std::cout << "Cuda" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; test_dynrankview_op_perf(40960); } -TEST_F(cuda, global_2_local) { +TEST(TEST_CATEGORY, global_2_local) { std::cout << "Cuda" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -90,15 +79,12 @@ TEST_F(cuda, global_2_local) { test_global_to_local_ids(i); } -TEST_F(cuda, unordered_map_performance_near) { +TEST(TEST_CATEGORY, unordered_map_performance_near) { Perf::run_performance_tests("cuda-near"); } -TEST_F(cuda, unordered_map_performance_far) { +TEST(TEST_CATEGORY, unordered_map_performance_far) { Perf::run_performance_tests("cuda-far"); } } // namespace Performance -#else -void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTCUDA_PREVENT_EMPTY_LINK_ERROR() {} -#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ diff --git a/lib/kokkos/containers/performance_tests/TestROCm.cpp b/lib/kokkos/containers/performance_tests/TestHIP.cpp similarity index 67% rename from lib/kokkos/containers/performance_tests/TestROCm.cpp rename to lib/kokkos/containers/performance_tests/TestHIP.cpp index 55b770b49c..8033c76be6 100644 --- a/lib/kokkos/containers/performance_tests/TestROCm.cpp +++ b/lib/kokkos/containers/performance_tests/TestHIP.cpp @@ -43,7 +43,6 @@ */ #include -#if defined(KOKKOS_ENABLE_ROCM) #include #include @@ -66,46 +65,26 @@ namespace Performance { 
-class rocm : public ::testing::Test { - protected: - static void SetUpTestCase() { - std::cout << std::setprecision(5) << std::scientific; - Kokkos::HostSpace::execution_space::initialize(); - Kokkos::Experimental::ROCm::initialize( - Kokkos::Experimental::ROCm::SelectDevice(0)); - } - static void TearDownTestCase() { - Kokkos::Experimental::ROCm::finalize(); - Kokkos::HostSpace::execution_space::finalize(); - } -}; -#if 0 -// issue 1089 -TEST_F( rocm, dynrankview_perf ) -{ - std::cout << "ROCm" << std::endl; +TEST(TEST_CATEGORY, dynrankview_perf) { + std::cout << "HIP" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; - test_dynrankview_op_perf( 40960 ); + test_dynrankview_op_perf(40960); } -TEST_F( rocm, global_2_local) -{ - std::cout << "ROCm" << std::endl; +TEST(TEST_CATEGORY, global_2_local) { + std::cout << "HIP" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; - for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step) - test_global_to_local_ids(i); + for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; + i *= Performance::id_step) + test_global_to_local_ids(i); } -#endif -TEST_F(rocm, unordered_map_performance_near) { - Perf::run_performance_tests("rocm-near"); +TEST(TEST_CATEGORY, unordered_map_performance_near) { + Perf::run_performance_tests("hip-near"); } -TEST_F(rocm, unordered_map_performance_far) { - Perf::run_performance_tests("rocm-far"); +TEST(TEST_CATEGORY, unordered_map_performance_far) { + Perf::run_performance_tests("hip-far"); } } // namespace Performance -#else -void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTROCM_PREVENT_EMPTY_LINK_ERROR() {} -#endif /* #if defined( KOKKOS_ENABLE_ROCM ) */ diff --git a/lib/kokkos/containers/performance_tests/TestHPX.cpp b/lib/kokkos/containers/performance_tests/TestHPX.cpp index 48be466bfa..f229901dcc 100644 --- a/lib/kokkos/containers/performance_tests/TestHPX.cpp +++ 
b/lib/kokkos/containers/performance_tests/TestHPX.cpp @@ -43,7 +43,6 @@ */ #include -#if defined(KOKKOS_ENABLE_HPX) #include @@ -64,25 +63,13 @@ namespace Performance { -class hpx : public ::testing::Test { - protected: - static void SetUpTestCase() { - std::cout << std::setprecision(5) << std::scientific; - - Kokkos::initialize(); - Kokkos::print_configuration(std::cout); - } - - static void TearDownTestCase() { Kokkos::finalize(); } -}; - -TEST_F(hpx, dynrankview_perf) { +TEST(TEST_CATEGORY, dynrankview_perf) { std::cout << "HPX" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; test_dynrankview_op_perf(8192); } -TEST_F(hpx, global_2_local) { +TEST(TEST_CATEGORY, global_2_local) { std::cout << "HPX" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -90,7 +77,7 @@ TEST_F(hpx, global_2_local) { test_global_to_local_ids(i); } -TEST_F(hpx, unordered_map_performance_near) { +TEST(TEST_CATEGORY, unordered_map_performance_near) { unsigned num_hpx = 4; std::ostringstream base_file_name; base_file_name << "hpx-" << num_hpx << "-near"; @@ -98,7 +85,7 @@ TEST_F(hpx, unordered_map_performance_near) { base_file_name.str()); } -TEST_F(hpx, unordered_map_performance_far) { +TEST(TEST_CATEGORY, unordered_map_performance_far) { unsigned num_hpx = 4; std::ostringstream base_file_name; base_file_name << "hpx-" << num_hpx << "-far"; @@ -106,7 +93,7 @@ TEST_F(hpx, unordered_map_performance_far) { base_file_name.str()); } -TEST_F(hpx, scatter_view) { +TEST(TEST_CATEGORY, scatter_view) { std::cout << "ScatterView data-duplicated test:\n"; Perf::test_scatter_view #include -#include +#include int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); + + int result = RUN_ALL_TESTS(); + Kokkos::finalize(); + return result; } diff --git 
a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp index a9c8639ed4..f414b0d828 100644 --- a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp +++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp @@ -43,7 +43,6 @@ */ #include -#if defined(KOKKOS_ENABLE_OPENMP) #include @@ -64,25 +63,13 @@ namespace Performance { -class openmp : public ::testing::Test { - protected: - static void SetUpTestCase() { - std::cout << std::setprecision(5) << std::scientific; - - Kokkos::initialize(); - Kokkos::OpenMP::print_configuration(std::cout); - } - - static void TearDownTestCase() { Kokkos::finalize(); } -}; - -TEST_F(openmp, dynrankview_perf) { +TEST(TEST_CATEGORY, dynrankview_perf) { std::cout << "OpenMP" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; test_dynrankview_op_perf(8192); } -TEST_F(openmp, global_2_local) { +TEST(TEST_CATEGORY, global_2_local) { std::cout << "OpenMP" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -90,7 +77,7 @@ TEST_F(openmp, global_2_local) { test_global_to_local_ids(i); } -TEST_F(openmp, unordered_map_performance_near) { +TEST(TEST_CATEGORY, unordered_map_performance_near) { unsigned num_openmp = 4; if (Kokkos::hwloc::available()) { num_openmp = Kokkos::hwloc::get_available_numa_count() * @@ -102,7 +89,7 @@ TEST_F(openmp, unordered_map_performance_near) { Perf::run_performance_tests(base_file_name.str()); } -TEST_F(openmp, unordered_map_performance_far) { +TEST(TEST_CATEGORY, unordered_map_performance_far) { unsigned num_openmp = 4; if (Kokkos::hwloc::available()) { num_openmp = Kokkos::hwloc::get_available_numa_count() * @@ -114,7 +101,7 @@ TEST_F(openmp, unordered_map_performance_far) { Perf::run_performance_tests(base_file_name.str()); } -TEST_F(openmp, scatter_view) { +TEST(TEST_CATEGORY, scatter_view) { std::cout << 
"ScatterView data-duplicated test:\n"; Perf::test_scatter_view -#if defined(KOKKOS_ENABLE_THREADS) #include @@ -65,34 +64,13 @@ namespace Performance { -class threads : public ::testing::Test { - protected: - static void SetUpTestCase() { - std::cout << std::setprecision(5) << std::scientific; - - unsigned num_threads = 4; - - if (Kokkos::hwloc::available()) { - num_threads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } - - std::cout << "Threads: " << num_threads << std::endl; - - Kokkos::initialize(Kokkos::InitArguments(num_threads)); - } - - static void TearDownTestCase() { Kokkos::finalize(); } -}; - -TEST_F(threads, dynrankview_perf) { +TEST(threads, dynrankview_perf) { std::cout << "Threads" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; test_dynrankview_op_perf(8192); } -TEST_F(threads, global_2_local) { +TEST(threads, global_2_local) { std::cout << "Threads" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -100,7 +78,7 @@ TEST_F(threads, global_2_local) { test_global_to_local_ids(i); } -TEST_F(threads, unordered_map_performance_near) { +TEST(threads, unordered_map_performance_near) { unsigned num_threads = 4; if (Kokkos::hwloc::available()) { num_threads = Kokkos::hwloc::get_available_numa_count() * @@ -112,7 +90,7 @@ TEST_F(threads, unordered_map_performance_near) { Perf::run_performance_tests(base_file_name.str()); } -TEST_F(threads, unordered_map_performance_far) { +TEST(threads, unordered_map_performance_far) { unsigned num_threads = 4; if (Kokkos::hwloc::available()) { num_threads = Kokkos::hwloc::get_available_numa_count() * @@ -125,8 +103,3 @@ TEST_F(threads, unordered_map_performance_far) { } } // namespace Performance - -#else -void 
KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTTHREADS_PREVENT_EMPTY_LINK_ERROR() { -} -#endif diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp index eedfd5f9ef..ea1d6dde5d 100644 --- a/lib/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp @@ -74,7 +74,7 @@ template class Bitset { public: using execution_space = Device; - using size_type = unsigned; + using size_type = unsigned int; enum { BIT_SCAN_REVERSE = 1u }; enum { MOVE_HINT_BACKWARD = 2u }; @@ -309,7 +309,7 @@ template class ConstBitset { public: using execution_space = Device; - using size_type = unsigned; + using size_type = unsigned int; private: enum { block_size = static_cast(sizeof(unsigned) * CHAR_BIT) }; diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index 3fc0371c69..689f0eb2ed 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -162,7 +162,7 @@ class DualView : public ViewTraits { /// \brief The type of a const, random-access View host mirror of /// \c t_dev_const_randomread. using t_host_const_randomread_um = - typename t_dev_const_randomread::HostMirror; + typename t_dev_const_randomread_um::HostMirror; //@} //! 
\name Counters to keep track of changes ("modified" flags) @@ -245,21 +245,6 @@ class DualView : public ViewTraits { h_view(create_mirror_view(d_view)) // without UVM, host View mirrors {} - explicit inline DualView(const ViewAllocateWithoutInitializing& arg_prop, - const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : DualView(Impl::ViewCtorProp( - arg_prop.label, Kokkos::WithoutInitializing), - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, - arg_N7) {} - //! Copy constructor (shallow copy) template DualView(const DualView& src) @@ -457,7 +442,27 @@ class DualView : public ViewTraits { } return dev; } - + static constexpr const int view_header_size = 128; + void impl_report_host_sync() const noexcept { + if (Kokkos::Tools::Experimental::get_callbacks().sync_dual_view != + nullptr) { + Kokkos::Tools::syncDualView( + h_view.label(), + reinterpret_cast(reinterpret_cast(h_view.data()) - + view_header_size), + false); + } + } + void impl_report_device_sync() const noexcept { + if (Kokkos::Tools::Experimental::get_callbacks().sync_dual_view != + nullptr) { + Kokkos::Tools::syncDualView( + d_view.label(), + reinterpret_cast(reinterpret_cast(d_view.data()) - + view_header_size), + true); + } + } /// \brief Update data on device or host only if data in the other /// space has been marked as modified. 
/// @@ -499,6 +504,7 @@ class DualView : public ViewTraits { deep_copy(d_view, h_view); modified_flags(0) = modified_flags(1) = 0; + impl_report_device_sync(); } } if (dev == 0) { // hopefully Device is the same as DualView's host type @@ -515,6 +521,7 @@ class DualView : public ViewTraits { deep_copy(h_view, d_view); modified_flags(0) = modified_flags(1) = 0; + impl_report_host_sync(); } } if (std::is_same { Impl::throw_runtime_exception( "Calling sync on a DualView with a const datatype."); } + impl_report_device_sync(); } if (dev == 0) { // hopefully Device is the same as DualView's host type if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) { Impl::throw_runtime_exception( "Calling sync on a DualView with a const datatype."); } + impl_report_host_sync(); } } @@ -567,6 +576,7 @@ class DualView : public ViewTraits { deep_copy(h_view, d_view); modified_flags(1) = modified_flags(0) = 0; + impl_report_host_sync(); } } @@ -589,6 +599,7 @@ class DualView : public ViewTraits { deep_copy(d_view, h_view); modified_flags(1) = modified_flags(0) = 0; + impl_report_device_sync(); } } @@ -619,7 +630,26 @@ class DualView : public ViewTraits { if (modified_flags.data() == nullptr) return false; return modified_flags(1) < modified_flags(0); } - + void impl_report_device_modification() { + if (Kokkos::Tools::Experimental::get_callbacks().modify_dual_view != + nullptr) { + Kokkos::Tools::modifyDualView( + d_view.label(), + reinterpret_cast(reinterpret_cast(d_view.data()) - + view_header_size), + true); + } + } + void impl_report_host_modification() { + if (Kokkos::Tools::Experimental::get_callbacks().modify_dual_view != + nullptr) { + Kokkos::Tools::modifyDualView( + h_view.label(), + reinterpret_cast(reinterpret_cast(h_view.data()) - + view_header_size), + false); + } + } /// \brief Mark data as modified on the given device \c Device. 
/// /// If \c Device is the same as this DualView's device type, then @@ -636,6 +666,7 @@ class DualView : public ViewTraits { (modified_flags(1) > modified_flags(0) ? modified_flags(1) : modified_flags(0)) + 1; + impl_report_device_modification(); } if (dev == 0) { // hopefully Device is the same as DualView's host type // Increment the host's modified count. @@ -643,6 +674,7 @@ class DualView : public ViewTraits { (modified_flags(1) > modified_flags(0) ? modified_flags(1) : modified_flags(0)) + 1; + impl_report_host_modification(); } #ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK @@ -663,6 +695,7 @@ class DualView : public ViewTraits { (modified_flags(1) > modified_flags(0) ? modified_flags(1) : modified_flags(0)) + 1; + impl_report_host_modification(); #ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK if (modified_flags(0) && modified_flags(1)) { std::string msg = "Kokkos::DualView::modify_host ERROR: "; @@ -682,6 +715,7 @@ class DualView : public ViewTraits { (modified_flags(1) > modified_flags(0) ? 
modified_flags(1) : modified_flags(0)) + 1; + impl_report_device_modification(); #ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK if (modified_flags(0) && modified_flags(1)) { std::string msg = "Kokkos::DualView::modify_device ERROR: "; diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index afb4b682c4..c66d7a5f36 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -245,10 +245,13 @@ KOKKOS_INLINE_FUNCTION bool dyn_rank_view_verify_operator_bounds( return (size_t(i) < map.extent(R)) && dyn_rank_view_verify_operator_bounds(rank, map, args...); } else if (i != 0) { + // FIXME_SYCL SYCL doesn't allow printf in kernels +#ifndef KOKKOS_ENABLE_SYCL printf( "DynRankView Debug Bounds Checking Error: at rank %u\n Extra " "arguments beyond the rank must be zero \n", R); +#endif return (false) && dyn_rank_view_verify_operator_bounds(rank, map, args...); } else { @@ -1264,33 +1267,6 @@ class DynRankView : public ViewTraits { typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} - // For backward compatibility - // NDE This ctor does not take ViewCtorProp argument - should not use - // alternative createLayout call - explicit inline DynRankView(const ViewAllocateWithoutInitializing& arg_prop, - const typename traits::array_layout& arg_layout) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - arg_prop.label, Kokkos::WithoutInitializing), - arg_layout) {} - - explicit inline DynRankView(const ViewAllocateWithoutInitializing& arg_prop, - const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : 
DynRankView( - Kokkos::Impl::ViewCtorProp( - arg_prop.label, Kokkos::WithoutInitializing), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) {} - //---------------------------------------- // Memory span required to wrap these dimensions. static constexpr size_t required_allocation_size( @@ -1401,7 +1377,7 @@ struct DynRankSubviewTag {}; namespace Impl { template -struct ViewMapping< +class ViewMapping< typename std::enable_if< (std::is_same::value && (std::is_same::view_type; std::string label = name.empty() ? src.label() : name; - auto mirror = Mirror(Kokkos::ViewAllocateWithoutInitializing(label), + auto mirror = Mirror(view_alloc(WithoutInitializing, label), Impl::reconstructLayout(src.layout(), src.rank())); deep_copy(mirror, src); return mirror; diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 9233499bf4..4fd084338e 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -1940,7 +1940,7 @@ create_mirror( const Kokkos::Experimental::OffsetView& src, typename std::enable_if< !std::is_same::array_layout, - Kokkos::LayoutStride>::value>::type* = 0) { + Kokkos::LayoutStride>::value>::type* = nullptr) { using src_type = Experimental::OffsetView; using dst_type = typename src_type::HostMirror; @@ -1960,7 +1960,7 @@ create_mirror( const Kokkos::Experimental::OffsetView& src, typename std::enable_if< std::is_same::array_layout, - Kokkos::LayoutStride>::value>::type* = 0) { + Kokkos::LayoutStride>::value>::type* = nullptr) { using src_type = Experimental::OffsetView; using dst_type = typename src_type::HostMirror; @@ -2028,7 +2028,7 @@ create_mirror_view( std::is_same< typename Kokkos::Experimental::OffsetView::data_type, typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value)>::type* = 0) { + T, P...>::HostMirror::data_type>::value)>::type* = nullptr) { return 
Kokkos::create_mirror(src); } @@ -2038,7 +2038,7 @@ typename Kokkos::Impl::MirrorOffsetViewType::view_type create_mirror_view(const Space&, const Kokkos::Experimental::OffsetView& src, typename std::enable_if::is_same_memspace>::type* = 0) { + Space, T, P...>::is_same_memspace>::type* = nullptr) { return src; } @@ -2048,7 +2048,7 @@ typename Kokkos::Impl::MirrorOffsetViewType::view_type create_mirror_view(const Space&, const Kokkos::Experimental::OffsetView& src, typename std::enable_if::is_same_memspace>::type* = 0) { + Space, T, P...>::is_same_memspace>::type* = nullptr) { return typename Kokkos::Impl::MirrorOffsetViewType::view_type( src.label(), src.layout(), {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), @@ -2063,7 +2063,7 @@ create_mirror_view(const Space&, // , std::string const& name = "" // , typename // std::enable_if::is_same_memspace>::type* = 0 ) { +// ...>::is_same_memspace>::type* = nullptr) { // (void)name; // return src; // } @@ -2076,11 +2076,11 @@ create_mirror_view(const Space&, // , std::string const& name = "" // , typename // std::enable_if::is_same_memspace>::type* = 0 ) { +// ...>::is_same_memspace>::type* = nullptr) { // using Mirror = typename // Kokkos::Experimental::Impl::MirrorViewType::view_type; // std::string label = name.empty() ? 
src.label() : name; -// auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout(), +// auto mirror = Mirror(view_alloc(WithoutInitializing, label), src.layout(), // { src.begin(0), src.begin(1), src.begin(2), // src.begin(3), src.begin(4), // src.begin(5), src.begin(6), src.begin(7) }); diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp index 3df0dfcd3b..5e18f5a80e 100644 --- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -206,6 +206,23 @@ struct DefaultContribution +struct DefaultDuplication { + using type = Kokkos::Experimental::ScatterNonDuplicated; +}; +template <> +struct DefaultContribution { + using type = Kokkos::Experimental::ScatterAtomic; +}; +template <> +struct DefaultContribution { + using type = Kokkos::Experimental::ScatterAtomic; +}; +#endif + // FIXME All these scatter values need overhaul: // - like should they be copyable at all? 
// - what is the internal handle type @@ -636,19 +653,10 @@ struct ReduceDuplicatesBase { size_t stride_in, size_t start_in, size_t n_in, std::string const& name) : src(src_in), dst(dest_in), stride(stride_in), start(start_in), n(n_in) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor(std::string("reduce_") + name, 0, - &kpID); - } - using policy_type = RangePolicy; - using closure_type = Kokkos::Impl::ParallelFor; - const closure_type closure(*(static_cast(this)), - policy_type(0, stride)); - closure.execute(); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } + parallel_for( + std::string("Kokkos::ScatterView::ReduceDuplicates [") + name + "]", + RangePolicy(0, stride), + static_cast(*this)); } }; @@ -682,19 +690,10 @@ struct ResetDuplicatesBase { ResetDuplicatesBase(ValueType* data_in, size_t size_in, std::string const& name) : data(data_in) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor(std::string("reduce_") + name, 0, - &kpID); - } - using policy_type = RangePolicy; - using closure_type = Kokkos::Impl::ParallelFor; - const closure_type closure(*(static_cast(this)), - policy_type(0, size_in)); - closure.execute(); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } + parallel_for( + std::string("Kokkos::ScatterView::ResetDuplicates [") + name + "]", + RangePolicy(0, size_in), + static_cast(*this)); } }; @@ -931,8 +930,8 @@ class ScatterView const& original_view) : unique_token(), internal_view( - Kokkos::ViewAllocateWithoutInitializing(std::string("duplicated_") + - original_view.label()), + view_alloc(WithoutInitializing, + std::string("duplicated_") + original_view.label()), unique_token.size(), original_view.rank_dynamic > 0 ? 
original_view.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -955,7 +954,7 @@ class ScatterView ScatterView(std::string const& name, Dims... dims) - : internal_view(Kokkos::ViewAllocateWithoutInitializing(name), + : internal_view(view_alloc(WithoutInitializing, name), unique_token.size(), dims...) { reset(); } @@ -1094,8 +1093,8 @@ class ScatterView\n" "#include \n" ) + configure_file(${dir}/dummy.cpp ${file}) list(APPEND UnitTestSources ${file}) endforeach() + list(REMOVE_ITEM UnitTestSources + ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Bitset.cpp + ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_ScatterView.cpp + ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_UnorderedMap.cpp + ) KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() diff --git a/lib/kokkos/containers/unit_tests/Makefile b/lib/kokkos/containers/unit_tests/Makefile index 308b5aa8b5..f42b9b7519 100644 --- a/lib/kokkos/containers/unit_tests/Makefile +++ b/lib/kokkos/containers/unit_tests/Makefile @@ -7,7 +7,7 @@ vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/openmp vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/hpx vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/serial vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/threads -vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/rocm +vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/hip vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/cuda vpath %.cpp ${CURDIR} default: build_all diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp index ae5b746f94..531caf0f85 100644 --- a/lib/kokkos/containers/unit_tests/TestDualView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -108,7 +108,7 @@ struct test_dualview_combinations { if (with_init) { a = ViewType("A", n, m); } else { - a = ViewType(Kokkos::ViewAllocateWithoutInitializing("A"), n, m); + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); } Kokkos::deep_copy(a.d_view, 1); @@ 
-404,14 +404,19 @@ void test_dualview_resize() { Impl::test_dualview_resize(); } +// FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dualview_combination) { test_dualview_combinations(10, true); } +#endif TEST(TEST_CATEGORY, dualview_alloc) { test_dualview_alloc(10); } +// FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dualview_combinations_without_init) { test_dualview_combinations(10, false); } @@ -428,6 +433,7 @@ TEST(TEST_CATEGORY, dualview_realloc) { TEST(TEST_CATEGORY, dualview_resize) { test_dualview_resize(); } +#endif } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp index 97155d3047..dd0199ed81 100644 --- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -1063,8 +1063,8 @@ class TestDynViewAPI { (void)thing; } - dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"), - 10, 20); + dView0 d_uninitialized( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "uninit"), 10, 20); ASSERT_TRUE(d_uninitialized.data() != nullptr); ASSERT_EQ(d_uninitialized.rank(), 2); ASSERT_EQ(d_uninitialized.extent(0), 10); @@ -1532,7 +1532,7 @@ class TestDynViewAPI { ASSERT_EQ(ds5.extent(5), ds5plus.extent(5)); #if (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_UVM)) && \ - !defined(KOKKOS_ENABLE_HIP) + !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) ASSERT_EQ(&ds5(1, 1, 1, 1, 0) - &ds5plus(1, 1, 1, 1, 0), 0); ASSERT_EQ(&ds5(1, 1, 1, 1, 0, 0) - &ds5plus(1, 1, 1, 1, 0, 0), 0); // passing argument to rank beyond the view's rank is allowed diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index f018793dd6..4b9f994417 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -243,6 
+243,8 @@ struct TestDynamicView { } }; +// FIXME_SYCL needs resize_serial +#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dynamic_view) { using TestDynView = TestDynamicView; @@ -250,6 +252,7 @@ TEST(TEST_CATEGORY, dynamic_view) { TestDynView::run(100000 + 100 * i); } } +#endif } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp index e5186e3e1e..802813b13b 100644 --- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -95,10 +95,6 @@ void test_offsetview_construction() { ASSERT_EQ(ov.extent(1), 5); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - const int ovmin0 = ov.begin(0); - const int ovend0 = ov.end(0); - const int ovmin1 = ov.begin(1); - const int ovend1 = ov.end(1); { Kokkos::Experimental::OffsetView offsetV1("OneDOffsetView", range0); @@ -134,6 +130,13 @@ void test_offsetview_construction() { } } + // FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL + const int ovmin0 = ov.begin(0); + const int ovend0 = ov.end(0); + const int ovmin1 = ov.begin(1); + const int ovend1 = ov.end(1); + using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -175,6 +178,7 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; +#endif #endif { @@ -211,6 +215,8 @@ void test_offsetview_construction() { point3_type{{extent0, extent1, extent2}}); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + // FIXME_SYCL requires MDRange policy +#ifdef KOKKOS_ENABLE_SYCL int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -233,6 +239,7 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; +#endif #endif } view_type viewFromOV = ov.view(); @@ -259,6 +266,8 @@ void 
test_offsetview_construction() { Kokkos::deep_copy(aView, ov); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + // FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -268,6 +277,7 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; +#endif #endif } @@ -278,6 +288,8 @@ void test_offsetview_construction() { Kokkos::deep_copy(ov, aView); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + // FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -287,6 +299,7 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; +#endif #endif } } @@ -458,6 +471,8 @@ void test_offsetview_subview() { ASSERT_EQ(offsetSubview.end(1), 9); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + // FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -483,6 +498,7 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); +#endif #endif } @@ -685,9 +701,12 @@ void test_offsetview_offsets_rank3() { } #endif +// FIXME_SYCL needs MDRangePolicy +#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); } +#endif TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction(); diff --git a/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp b/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp new file mode 100644 index 0000000000..51fd3fc911 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SYCL_HPP +#define KOKKOS_TEST_SYCL_HPP + +#define TEST_CATEGORY sycl +#define TEST_EXECSPACE Kokkos::Experimental::SYCL + +#endif diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp index 4ec83baece..3a3cb607a6 100644 --- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -583,18 +583,9 @@ struct TestDuplicatedScatterView< }; #endif -#ifdef KOKKOS_ENABLE_ROCM -// disable duplicated instantiation with ROCm until -// UniqueToken can support it -template -struct TestDuplicatedScatterView { - TestDuplicatedScatterView(int) {} -}; -#endif - template -void test_scatter_view(int n) { +void test_scatter_view(int64_t n) { using execution_space = typename DeviceType::execution_space; // no atomics or duplication is only sensible if the execution space @@ -630,7 +621,7 @@ void test_scatter_view(int n) { constexpr std::size_t bytes_per_value = sizeof(NumberType) * 12; std::size_t const maximum_allowed_copy_values = maximum_allowed_copy_bytes / bytes_per_value; - n = std::min(n, int(maximum_allowed_copy_values)); + n = std::min(n, int64_t(maximum_allowed_copy_values)); // if the default is duplicated, this needs to follow the limit { @@ -683,32 +674,40 @@ TEST(TEST_CATEGORY, scatterview_devicetype) { test_scatter_view(10); test_scatter_view(10); +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - using cuda_device_type = Kokkos::Device; - test_scatter_view::value) { + using device_device_type = + Kokkos::Device; + test_scatter_view(10); - test_scatter_view(10); - test_scatter_view(10); - test_scatter_view(10); - test_scatter_view(10); - using cudauvm_device_type = - Kokkos::Device; - test_scatter_view( + 10); + test_scatter_view(10); + 
test_scatter_view(10); + using host_device_type = + Kokkos::Device; + test_scatter_view(10); - test_scatter_view(10); - test_scatter_view( - 10); - test_scatter_view( - 10); - test_scatter_view( - 10); + test_scatter_view(10); + test_scatter_view(10); + test_scatter_view(10); } #endif } } // namespace Test -#endif // KOKKOS_TEST_UNORDERED_MAP_HPP +#endif // KOKKOS_TEST_SCATTER_VIEW_HPP diff --git a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp index 89c69756d8..8bb267ce5d 100644 --- a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp +++ b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp @@ -200,8 +200,7 @@ void run_test_graph3(size_t B, size_t N) { for (size_t i = 0; i < B; i++) { size_t ne = 0; - for (size_t j = hx.row_block_offsets(i); j < hx.row_block_offsets(i + 1); - j++) + for (auto j = hx.row_block_offsets(i); j < hx.row_block_offsets(i + 1); j++) ne += hx.row_map(j + 1) - hx.row_map(j) + C; ASSERT_FALSE( @@ -212,7 +211,7 @@ void run_test_graph3(size_t B, size_t N) { template void run_test_graph4() { - using ordinal_type = unsigned; + using ordinal_type = unsigned int; using layout_type = Kokkos::LayoutRight; using space_type = Space; using memory_traits_type = Kokkos::MemoryUnmanaged; @@ -286,7 +285,10 @@ void run_test_graph4() { TEST(TEST_CATEGORY, staticcrsgraph) { TestStaticCrsGraph::run_test_graph(); + // FIXME_SYCL requires MDRangePolicy +#ifndef KOKKOS_ENABLE_SYCL TestStaticCrsGraph::run_test_graph2(); +#endif TestStaticCrsGraph::run_test_graph3(1, 0); TestStaticCrsGraph::run_test_graph3(1, 1000); TestStaticCrsGraph::run_test_graph3(1, 10000); diff --git a/lib/kokkos/containers/unit_tests/TestVector.hpp b/lib/kokkos/containers/unit_tests/TestVector.hpp index 296b9a7e64..33b265e077 100644 --- a/lib/kokkos/containers/unit_tests/TestVector.hpp +++ b/lib/kokkos/containers/unit_tests/TestVector.hpp @@ -78,7 +78,7 @@ struct test_vector_insert { // Looks like some std::vector 
implementations do not have the restriction // right on the overload taking three iterators, and thus the following call // will hit that overload and then fail to compile. -#if defined(KOKKOS_COMPILER_INTEL) && (1700 > KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) // And at least GCC 4.8.4 doesn't implement vector insert correct for C++11 // Return type is void ... #if (__GNUC__ < 5) @@ -104,7 +104,7 @@ struct test_vector_insert { // Looks like some std::vector implementations do not have the restriction // right on the overload taking three iterators, and thus the following call // will hit that overload and then fail to compile. -#if defined(KOKKOS_COMPILER_INTEL) && (1700 > KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) b.insert(b.begin(), typename Vector::size_type(7), 9); #else b.insert(b.begin(), 7, 9); @@ -125,7 +125,7 @@ struct test_vector_insert { // Testing insert at end via all three function interfaces a.insert(a.end(), 11); -#if defined(KOKKOS_COMPILER_INTEL) && (1700 > KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) a.insert(a.end(), typename Vector::size_type(2), 12); #else a.insert(a.end(), 2, 12); diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in index e930f6a05e..f0835772b8 100644 --- a/lib/kokkos/core/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/core/cmake/KokkosCore_config.h.in @@ -100,6 +100,5 @@ // TODO: No longer options in Kokkos. Need to be removed. 
#cmakedefine KOKKOS_USING_DEPRECATED_VIEW -#cmakedefine KOKKOS_ENABLE_CXX11 #endif // !defined(KOKKOS_FOR_SIERRA) diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index f55721e04a..b7b817c910 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -48,17 +48,10 @@ SET(SOURCES PerfTest_ViewResize_8.cpp ) -IF(Kokkos_ENABLE_HIP) -# FIXME HIP requires TeamPolicy - LIST(REMOVE_ITEM SOURCES - PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - ) -ENDIF() - IF(Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction LIST(REMOVE_ITEM SOURCES + PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp ) @@ -75,7 +68,8 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) # This test currently times out for MSVC -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") +# FIXME_SYCL these tests don't compile yet (require parallel_for). 
+IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT Kokkos_ENABLE_SYCL) KOKKOS_ADD_EXECUTABLE_AND_TEST( PerfTestExec SOURCES ${SOURCES} @@ -83,17 +77,28 @@ IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") ) ENDIF() -KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Atomic - SOURCES test_atomic.cpp - CATEGORIES PERFORMANCE -) +# FIXME_SYCL +IF(NOT Kokkos_ENABLE_SYCL) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Atomic + SOURCES test_atomic.cpp + CATEGORIES PERFORMANCE + ) + +IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Atomic_MinMax + SOURCES test_atomic_minmax_simple.cpp + CATEGORIES PERFORMANCE + ) +ENDIF() KOKKOS_ADD_EXECUTABLE_AND_TEST( PerformanceTest_Mempool SOURCES test_mempool.cpp CATEGORIES PERFORMANCE ) +ENDIF() IF(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET needs tasking diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile index 6d619dc573..ac06c89757 100644 --- a/lib/kokkos/core/perf_test/Makefile +++ b/lib/kokkos/core/perf_test/Makefile @@ -65,6 +65,12 @@ TEST_TARGETS += test-taskdag # +OBJ_ATOMICS_MINMAX = test_atomic_minmax_simple.o +TARGETS += KokkosCore_PerformanceTest_Atomics_MinMax +TEST_TARGETS += test-atomic-minmax + +# + KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest @@ -77,6 +83,9 @@ KokkosCore_PerformanceTest_Mempool: $(OBJ_MEMPOOL) $(KOKKOS_LINK_DEPENDS) KokkosCore_PerformanceTest_TaskDAG: $(OBJ_TASKDAG) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_TASKDAG) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_TaskDAG +KokkosCore_PerformanceTest_Atomics_MinMax: $(OBJ_ATOMICS_MINMAX) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ATOMICS_MINMAX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest_Atomics_MinMax + test-performance: 
KokkosCore_PerformanceTest ./KokkosCore_PerformanceTest @@ -89,6 +98,9 @@ test-mempool: KokkosCore_PerformanceTest_Mempool test-taskdag: KokkosCore_PerformanceTest_TaskDAG ./KokkosCore_PerformanceTest_TaskDAG +test-atomic-minmax: KokkosCore_PerformanceTest_Atomics_MinMax + ./KokkosCore_PerformanceTest_Atomics_MinMax + build_all: $(TARGETS) test: $(TEST_TARGETS) diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewResize.hpp b/lib/kokkos/core/perf_test/PerfTest_ViewResize.hpp index 2ea81b5046..66a631e389 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewResize.hpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewResize.hpp @@ -120,7 +120,7 @@ void run_resizeview_tests123(int N, int R) { Kokkos::Timer timer; for (int r = 0; r < R; r++) { Kokkos::View a1( - Kokkos::ViewAllocateWithoutInitializing("A1"), int(N8 * 1.1)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); double* a1_ptr = a1.data(); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); @@ -201,7 +201,7 @@ void run_resizeview_tests45(int N, int R) { Kokkos::Timer timer; for (int r = 0; r < R; r++) { Kokkos::View a1( - Kokkos::ViewAllocateWithoutInitializing("A1"), int(N8 * 1.1)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); double* a1_ptr = a1.data(); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); @@ -258,7 +258,7 @@ void run_resizeview_tests6(int N, int R) { Kokkos::Timer timer; for (int r = 0; r < R; r++) { Kokkos::View a1( - Kokkos::ViewAllocateWithoutInitializing("A1"), int(N8 * 1.1)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); double* a1_ptr = a1.data(); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); @@ -311,7 +311,7 @@ void run_resizeview_tests7(int N, int R) { Kokkos::Timer timer; for (int r = 0; r < R; r++) { Kokkos::View a1( - Kokkos::ViewAllocateWithoutInitializing("A1"), int(N8 * 1.1)); + 
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); double* a1_ptr = a1.data(); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); @@ -366,7 +366,7 @@ void run_resizeview_tests8(int N, int R) { Kokkos::Timer timer; for (int r = 0; r < R; r++) { Kokkos::View a1( - Kokkos::ViewAllocateWithoutInitializing("A1"), int(N8 * 1.1)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); double* a1_ptr = a1.data(); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); diff --git a/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp new file mode 100644 index 0000000000..eec1c8eacc --- /dev/null +++ b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp @@ -0,0 +1,244 @@ +// export OMP_PROC_BIND=spread ; export OMP_PLACES=threads +// c++ -O2 -g -DNDEBUG -fopenmp +// ../core/perf_test/test_atomic_minmax_simple.cpp -I../core/src/ -I. -o +// test_atomic_minmax_simple.x containers/src/libkokkoscontainers.a +// core/src/libkokkoscore.a -ldl && OMP_NUM_THREADS=1 +// ./test_atomic_minmax_simple.x 10000000 + +#include +#include + +#include +#include + +#include +#include + +using exec_space = Kokkos::DefaultExecutionSpace; + +template +void test(const int length) { + Kokkos::Impl::Timer timer; + + using vector = Kokkos::View; + + vector inp("input", length); + T max = std::numeric_limits::max(); + T min = std::numeric_limits::lowest(); + + // input is max values - all min atomics will replace + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_fetch_min(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); }, + errors); + 
Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% min replacements: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + } + std::cout << "Time for 100% min replacements: " << time << std::endl; + } + + // input is min values - all max atomics will replace + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% max replacements: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + } + std::cout << "Time for 100% max replacements: " << time << std::endl; + } + + // input is max values - all max atomics will early exit + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { + T ref = max; + inner += (inp(i) != ref); + }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% max early exits: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + } + std::cout << "Time for 100% max early exits: " << time << std::endl; + } + + // input is min values - all min atomics will early exit + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + 
(void)Kokkos::atomic_min_fetch(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { + T ref = min; + inner += (inp(i) != ref); + }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% min early exits: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + if (length > 9) std::cout << "inp(9)=" << inp(9) << std::endl; + } + std::cout << "Time for 100% min early exits: " << time << std::endl; + } + + // limit iterations for contentious test, takes ~50x longer for same length + auto con_length = length / 5; + // input is min values - some max atomics will replace + { + Kokkos::parallel_for( + 1, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); + Kokkos::fence(); + + T current(0); + timer.reset(); + Kokkos::parallel_reduce( + con_length, + KOKKOS_LAMBDA(const int i, T& inner) { + inner = Kokkos::atomic_max_fetch(&(inp(0)), inner + 1); + if (i == con_length - 1) { + Kokkos::atomic_max_fetch(&(inp(0)), max); + inner = max; + } + }, + Kokkos::Max(current)); + Kokkos::fence(); + double time = timer.seconds(); + + if (current < max) { + std::cerr << "Error in contentious max replacements: " << std::endl; + std::cerr << "final=" << current << " inp(0)=" << inp(0) << " max=" << max + << std::endl; + } + std::cout << "Time for contentious max " << con_length + << " replacements: " << time << std::endl; + } + + // input is max values - some min atomics will replace + { + Kokkos::parallel_for( + 1, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); + Kokkos::fence(); + + timer.reset(); + T current(100000000); + Kokkos::parallel_reduce( + con_length, + KOKKOS_LAMBDA(const int i, T& inner) { + inner = Kokkos::atomic_min_fetch(&(inp(0)), inner - 1); + if (i == con_length - 1) { + Kokkos::atomic_min_fetch(&(inp(0)), min); + inner = min; + } + }, + Kokkos::Min(current)); + Kokkos::fence(); + double time = timer.seconds(); 
+ + if (current > min) { + std::cerr << "Error in contentious min replacements: " << std::endl; + std::cerr << "final=" << current << " inp(0)=" << inp(0) << " min=" << min + << std::endl; + } + std::cout << "Time for contentious min " << con_length + << " replacements: " << time << std::endl; + } +} + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + int length = 1000000; + if (argc == 2) { + length = std::stoi(argv[1]); + } + + if (length < 1) { + throw std::invalid_argument(""); + } + + std::cout << "================ int" << std::endl; + test(length); + std::cout << "================ long" << std::endl; + test(length); + std::cout << "================ long long" << std::endl; + test(length); + + std::cout << "================ unsigned int" << std::endl; + test(length); + std::cout << "================ unsigned long" << std::endl; + test(length); + std::cout << "================ unsigned long long" << std::endl; + test(length); + + std::cout << "================ float" << std::endl; + test(length); + std::cout << "================ double" << std::endl; + test(length); + } + Kokkos::finalize(); + return 0; +} diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index b4051dc57f..e0590a78a4 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ b/lib/kokkos/core/src/CMakeLists.txt @@ -19,10 +19,6 @@ SET(KOKKOS_CORE_HEADERS) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) -IF (KOKKOS_ENABLE_ROCM) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/ROCm/*.cpp) -ENDIF() - IF (KOKKOS_ENABLE_CUDA) APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) @@ -64,6 +60,11 @@ ELSE() LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_Serial_task.cpp) ENDIF() +IF (KOKKOS_ENABLE_SYCL) + APPEND_GLOB(KOKKOS_CORE_SRCS 
${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) +ENDIF() + KOKKOS_ADD_LIBRARY( kokkoscore SOURCES ${KOKKOS_CORE_SRCS} diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp deleted file mode 100644 index 6feaed80e1..0000000000 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp +++ /dev/null @@ -1,1397 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_HPP -#define KOKKOS_CUDA_EXP_ITERATE_TILE_HPP - -#include -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) - -#include -#include - -#include - -//#include -// Including the file above, leads to following type of errors: -// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete -// type is not allowed As a result, recreate cuda_parallel_launch and associated -// code - -#include -#include - -namespace Kokkos { -namespace Impl { - -// ------------------------------------------------------------------ // - -template -__global__ static void cuda_parallel_launch(const DriverType driver) { - driver(); -} - -template -struct CudaLaunch { - inline CudaLaunch(const DriverType& driver, const dim3& grid, - const dim3& block) { - cuda_parallel_launch<<>>(driver); - } -}; - -// ------------------------------------------------------------------ // -template -struct apply_impl; - -// Rank 2 -// Specializations for void tag type -template -struct apply_impl<2, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - for (index_type 
tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - m_func(offset_0, offset_1); - } - } - } - } - } - // LR - else { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - m_func(offset_0, offset_1); - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct apply_impl<2, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - inline __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - if (RP::inner_direction == RP::Left) { - // Loop over size maxnumblocks until full range covered - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type 
tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1); - } - } - } - } - } else { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - m_func(Tag(), offset_0, offset_1); - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 3 -// Specializations for void tag type -template -struct apply_impl<3, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - for (index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; - tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = blockIdx.x; - tile_id0 < 
m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - threadIdx.x < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - // LR - else { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id2 = blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - threadIdx.z < m_rp.m_tile[2]) { - m_func(offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for void tag type -template -struct apply_impl<3, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - inline __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - if (RP::inner_direction == RP::Left) { - for (index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; - tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < 
m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - threadIdx.x < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2); - } - } - } - } - } - } - } else { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id2 = blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - threadIdx.z < m_rp.m_tile[2]) { - m_func(Tag(), offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 4 -// Specializations for void tag type -template -struct apply_impl<4, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) 
{ - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = blockIdx.x % numbl0; - const index_type tile_id1 = blockIdx.x / numbl0; - const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; - - for (index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; - tile_id3 += gridDim.z) { - const index_type offset_3 = tile_id3 * m_rp.m_tile[3] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3]) { - for (index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; - tile_id2 += gridDim.y) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3); - } - } - } - } - } - } - } - } - } - // LR - else { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id0 = blockIdx.x / numbl1; - const index_type tile_id1 = blockIdx.x % numbl1; - const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type tile_id2 = blockIdx.y; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - threadIdx.y < m_rp.m_tile[2]) { - for (index_type tile_id3 = blockIdx.z; - tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) { - const index_type offset_3 = tile_id3 * m_rp.m_tile[3] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - threadIdx.z < m_rp.m_tile[3]) { - m_func(offset_0, offset_1, offset_2, offset_3); - } - } - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for void tag type -template -struct apply_impl<4, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - inline __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - - inline __device__ void exec_range() const { - if (RP::inner_direction == RP::Left) { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? 
temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = blockIdx.x % numbl0; - const index_type tile_id1 = blockIdx.x / numbl0; - const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; - - for (index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; - tile_id3 += gridDim.z) { - const index_type offset_3 = tile_id3 * m_rp.m_tile[3] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3]) { - for (index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; - tile_id2 += gridDim.y) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3); - } - } - } - } - } - } - } - } - } else { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id0 = blockIdx.x / numbl1; - const index_type tile_id1 = blockIdx.x % numbl1; - const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type tile_id2 = blockIdx.y; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - threadIdx.y < m_rp.m_tile[2]) { - for (index_type tile_id3 = blockIdx.z; - tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) { - const index_type offset_3 = tile_id3 * m_rp.m_tile[3] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - threadIdx.z < m_rp.m_tile[3]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3); - } - } - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 5 -// Specializations for void tag type -template -struct apply_impl<5, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? 
temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = blockIdx.x % numbl0; - const index_type tile_id1 = blockIdx.x / numbl0; - const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id2 = blockIdx.y % numbl2; - const index_type tile_id3 = blockIdx.y / numbl2; - const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2]; - - for (index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + - thr_id0 + - 
(index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = blockIdx.x / numbl1; - const index_type tile_id1 = blockIdx.x % numbl1; - const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = blockIdx.y / numbl3; - const index_type tile_id3 = blockIdx.y % numbl3; - const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - 
const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type tile_id4 = blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - threadIdx.z < m_rp.m_tile[4]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct apply_impl<5, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = blockIdx.x % numbl0; - const index_type tile_id1 = blockIdx.x / numbl0; - const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id2 = blockIdx.y % numbl2; - const index_type tile_id3 = blockIdx.y / numbl2; - const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2]; - - for (index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + - thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id0 = blockIdx.x / numbl1; - const index_type tile_id1 = blockIdx.x % numbl1; - const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = blockIdx.y / numbl3; - const index_type tile_id3 = blockIdx.y % numbl3; - const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type tile_id4 = blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - threadIdx.z < m_rp.m_tile[4]) { - m_func(Tag(), offset_0, offset_1, offset_2, 
offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 6 -// Specializations for void tag type -template -struct apply_impl<6, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = blockIdx.x % numbl0; - const index_type tile_id1 = blockIdx.x / numbl0; - const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id2 = blockIdx.y % numbl2; - const index_type tile_id3 = blockIdx.y / numbl2; - const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl5 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl4) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id4 = blockIdx.z % numbl4; - const index_type tile_id5 = blockIdx.z / numbl4; - const index_type thr_id4 = threadIdx.z % m_rp.m_tile[4]; - const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4]; - - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4) { - const index_type offset_4 = - m * m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = k * m_rp.m_tile[2] + thr_id2 + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + - thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? 
index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = blockIdx.x / numbl1; - const index_type tile_id1 = blockIdx.x % numbl1; - const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = blockIdx.y / numbl3; - const index_type tile_id3 = blockIdx.y % numbl3; - const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl4 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl5) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id4 = blockIdx.z / numbl5; - const index_type tile_id5 = blockIdx.z % numbl5; - const index_type thr_id4 = threadIdx.z / m_rp.m_tile[5]; - const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; - m += numbl4) { - const index_type offset_4 = m * m_rp.m_tile[4] + - thr_id4 + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - thr_id4 < m_rp.m_tile[4]) { - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; - n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + - (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && - thr_id5 < m_rp.m_tile[5]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct apply_impl<6, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const 
RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = blockIdx.x % numbl0; - const index_type tile_id1 = blockIdx.x / numbl0; - const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id2 = blockIdx.y % numbl2; - const index_type tile_id3 = blockIdx.y / numbl2; - const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl5 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl4) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id4 = blockIdx.z % numbl4; - const index_type tile_id5 = blockIdx.z / numbl4; - const index_type thr_id4 = threadIdx.z % m_rp.m_tile[4]; - const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4]; - - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4) { - const index_type offset_4 = - m * m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = k * m_rp.m_tile[2] + thr_id2 + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + - thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, - offset_3, offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? 
index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = blockIdx.x / numbl1; - const index_type tile_id1 = blockIdx.x % numbl1; - const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = blockIdx.y / numbl3; - const index_type tile_id3 = blockIdx.y % numbl3; - const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl4 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl5) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id4 = blockIdx.z / numbl5; - const index_type tile_id5 = blockIdx.z % numbl5; - const index_type thr_id4 = threadIdx.z / m_rp.m_tile[5]; - const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; - m += numbl4) { - const index_type offset_4 = m * m_rp.m_tile[4] + - thr_id4 + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - thr_id4 < m_rp.m_tile[4]) { - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; - n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + - (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && - thr_id5 < m_rp.m_tile[5]) { - m_func(Tag(), offset_0, offset_1, offset_2, - offset_3, offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// ---------------------------------------------------------------------------------- - -template -struct DeviceIterateTile { - using index_type = typename 
RP::index_type; - using array_index_type = typename RP::array_index_type; - using point_type = typename RP::point_type; - - struct VoidDummy {}; - using usable_tag = typename std::conditional::value, - VoidDummy, Tag>::type; - - DeviceIterateTile(const RP& rp, const Functor& func) - : m_rp{rp}, m_func{func} {} - - private: - inline __device__ void apply() const { - apply_impl(m_rp, m_func).exec_range(); - } // end apply - - public: - inline __device__ void operator()(void) const { this->apply(); } - - inline void execute() const { - const array_index_type maxblocks = - 65535; // not true for blockIdx.x for newer archs - if (RP::rank == 2) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); - const dim3 grid( - std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, - maxblocks), - std::min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, - maxblocks), - 1); - CudaLaunch(*this, grid, block); - } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); - const dim3 grid( - std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, - maxblocks), - std::min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, - maxblocks), - std::min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, - maxblocks)); - CudaLaunch(*this, grid, block); - } else if (RP::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z - const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2], - m_rp.m_tile[3]); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); - const dim3 grid( - std::min( - static_cast(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]), - static_cast(maxblocks)), - std::min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y, - maxblocks), - std::min((m_rp.m_upper[3] - 
m_rp.m_lower[3] + block.z - 1) / block.z, - maxblocks)); - CudaLaunch(*this, grid, block); - } else if (RP::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to - // threadIdx.z - const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]); - KOKKOS_ASSERT(block.z > 0); - const dim3 grid( - std::min( - static_cast(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]), - static_cast(maxblocks)), - std::min( - static_cast(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]), - static_cast(maxblocks)), - std::min((m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, - maxblocks)); - CudaLaunch(*this, grid, block); - } else if (RP::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to - // threadIdx.z - const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], - m_rp.m_tile[4] * m_rp.m_tile[5]); - const dim3 grid( - std::min( - static_cast(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]), - static_cast(maxblocks)), - std::min( - static_cast(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]), - static_cast(maxblocks)), - std::min( - static_cast(m_rp.m_tile_end[4] * m_rp.m_tile_end[5]), - static_cast(maxblocks))); - CudaLaunch(*this, grid, block); - } else { - printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); - Kokkos::abort("Aborting"); - } - - } // end execute - - protected: - const RP m_rp; - const Functor m_func; -}; - -} // namespace Impl -} // namespace Kokkos - -#endif -#endif diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp deleted file mode 100644 index 0425fe6ed5..0000000000 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp +++ /dev/null @@ -1,3063 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP -#define KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP - -#include -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) - -#include - -#include - -// #include -// Including the file above leads to following type of errors: -// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete -// type is not allowed use existing Kokkos functionality, e.g. max blocks, once -// resolved - -#include -#include - -namespace Kokkos { -namespace Impl { - -namespace Refactor { - -// ------------------------------------------------------------------ // -// ParallelFor iteration pattern -template -struct DeviceIterateTile; - -// Rank 2 -// Specializations for void tag type -template -struct DeviceIterateTile<2, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - m_func(offset_0, offset_1); - } - } - } - } - } - // LR - else { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * 
m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - m_func(offset_0, offset_1); - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct DeviceIterateTile<2, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - if (RP::inner_direction == RP::Left) { - // Loop over size maxnumblocks until full range covered - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1); - } - } - } - } - } else { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - for 
(index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - m_func(Tag(), offset_0, offset_1); - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 3 -// Specializations for void tag type -template -struct DeviceIterateTile<3, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - for (index_type tile_id2 = (index_type)blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.z < m_rp.m_tile[2]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - // LR - else { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - 
(index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id2 = (index_type)blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.z < m_rp.m_tile[2]) { - m_func(offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile<3, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - if (RP::inner_direction == RP::Left) { - for (index_type tile_id2 = (index_type)blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.z < m_rp.m_tile[2]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type 
offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2); - } - } - } - } - } - } - } else { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id2 = (index_type)blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.z < m_rp.m_tile[2]) { - m_func(Tag(), offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 4 -// Specializations for void tag type -template -struct DeviceIterateTile<4, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // 
Kokkos::Impl::cuda_internal_maximum_grid_count() ); - // LL - if (RP::inner_direction == RP::Left) { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - for (index_type tile_id3 = (index_type)blockIdx.z; - tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) { - const index_type offset_3 = tile_id3 * m_rp.m_tile[3] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - (index_type)threadIdx.z < m_rp.m_tile[3]) { - for (index_type tile_id2 = (index_type)blockIdx.y; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.y < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3); - } - } - } - } - } - } - } - } - } - // LR - else { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= 
max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x / numbl1; - const index_type tile_id1 = (index_type)blockIdx.x % numbl1; - const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type tile_id2 = (index_type)blockIdx.y; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.y < m_rp.m_tile[2]) { - for (index_type tile_id3 = (index_type)blockIdx.z; - tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) { - const index_type offset_3 = tile_id3 * m_rp.m_tile[3] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - (index_type)threadIdx.z < m_rp.m_tile[3]) { - m_func(offset_0, offset_1, offset_2, offset_3); - } - } - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile<4, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // 
static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if (RP::inner_direction == RP::Left) { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - for (index_type tile_id3 = (index_type)blockIdx.z; - tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) { - const index_type offset_3 = tile_id3 * m_rp.m_tile[3] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - (index_type)threadIdx.z < m_rp.m_tile[3]) { - for (index_type tile_id2 = (index_type)blockIdx.y; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.y < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if 
(offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3); - } - } - } - } - } - } - } - } - } else { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x / numbl1; - const index_type tile_id1 = (index_type)blockIdx.x % numbl1; - const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type tile_id2 = (index_type)blockIdx.y; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.y < m_rp.m_tile[2]) { - for (index_type tile_id3 = (index_type)blockIdx.z; - tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) { - const index_type offset_3 = tile_id3 * m_rp.m_tile[3] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - (index_type)threadIdx.z < m_rp.m_tile[3]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3); - } - } - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - 
-// Rank 5 -// Specializations for void tag type -template -struct DeviceIterateTile<5, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y % numbl2; - const index_type tile_id3 = (index_type)blockIdx.y / numbl2; - const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; - - for (index_type tile_id4 = (index_type)blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - (index_type)threadIdx.z < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + - thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x / numbl1; - const index_type tile_id1 = (index_type)blockIdx.x % numbl1; - const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y / numbl3; - const index_type tile_id3 = (index_type)blockIdx.y % numbl3; - const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type tile_id4 = (index_type)blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 
< m_rp.m_upper[4] && - (index_type)threadIdx.z < m_rp.m_tile[4]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct DeviceIterateTile<5, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y % numbl2; - const index_type tile_id3 = (index_type)blockIdx.y / numbl2; - const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; - - for (index_type tile_id4 = (index_type)blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - (index_type)threadIdx.z < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + - thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x / numbl1; - const index_type tile_id1 = (index_type)blockIdx.x % numbl1; - const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y / numbl3; - const index_type tile_id3 = (index_type)blockIdx.y % numbl3; - const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type tile_id4 = (index_type)blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 
< m_rp.m_upper[4] && - (index_type)threadIdx.z < m_rp.m_tile[4]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 6 -// Specializations for void tag type -template -struct DeviceIterateTile<6, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y % numbl2; - const index_type tile_id3 = (index_type)blockIdx.y / numbl2; - const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl5 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl4) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id4 = (index_type)blockIdx.z % numbl4; - const index_type tile_id5 = (index_type)blockIdx.z / numbl4; - const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4]; - const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; - - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4) { - const index_type offset_4 = - m * m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = k * m_rp.m_tile[2] + thr_id2 + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + - thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for 
(index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x / numbl1; - const index_type tile_id1 = (index_type)blockIdx.x % numbl1; - const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y / numbl3; - const index_type tile_id3 = (index_type)blockIdx.y % numbl3; - const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl4 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl5) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id4 = (index_type)blockIdx.z / numbl5; - const index_type tile_id5 = (index_type)blockIdx.z % numbl5; - const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5]; - const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; - m += numbl4) { - const index_type offset_4 = m * m_rp.m_tile[4] + - thr_id4 + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - thr_id4 < m_rp.m_tile[4]) { - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; - n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + - (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && - thr_id5 < m_rp.m_tile[5]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct DeviceIterateTile<6, RP, Functor, Tag> { - using index_type = 
typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y % numbl2; - const index_type tile_id3 = (index_type)blockIdx.y / numbl2; - const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl5 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl4) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id4 = (index_type)blockIdx.z % numbl4; - const index_type tile_id5 = (index_type)blockIdx.z / numbl4; - const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4]; - const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; - - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4) { - const index_type offset_4 = - m * m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = k * m_rp.m_tile[2] + thr_id2 + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + - thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, - offset_3, offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? 
temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x / numbl1; - const index_type tile_id1 = (index_type)blockIdx.x % numbl1; - const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y / numbl3; - const index_type tile_id3 = (index_type)blockIdx.y % numbl3; - const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl4 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl5) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id4 = (index_type)blockIdx.z / numbl5; - const index_type tile_id5 = (index_type)blockIdx.z % numbl5; - const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5]; - const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; - m += numbl4) { - const index_type offset_4 = m * m_rp.m_tile[4] + - thr_id4 + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - thr_id4 < m_rp.m_tile[4]) { - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; - n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + - (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && - thr_id5 < m_rp.m_tile[5]) { - m_func(Tag(), offset_0, offset_1, offset_2, - offset_3, offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -} // namespace Refactor - -// 
---------------------------------------------------------------------------------- - -namespace Reduce { - -template -using is_void = std::is_same; - -template -struct is_array_type : std::false_type { - using value_type = T; -}; - -template -struct is_array_type : std::true_type { - using value_type = T; -}; - -template -struct is_array_type : std::true_type { - using value_type = T; -}; - -// ------------------------------------------------------------------ // -template -struct DeviceIterateTile; - -// ParallelReduce iteration pattern -// Scalar reductions - -// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of -// tiles and reduction algorithm constraints extract n-dim tile offsets (i.e. -// tile's global starting mulit-index) from the tileid = blockid using tile -// dimensions local indices within a tile extracted from (index_type)threadIdx.x -// using tile dims, constrained by blocksize combine tile and local id info for -// multi-dim global ids - -// Pattern: -// Each block+thread is responsible for a tile+local_id combo (additional when -// striding by num_blocks) -// 1. create offset arrays -// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max -// num blocks) -// 3. temps set for tile_idx and thrd_idx, which will be modified -// 4. 
if LL vs LR: -// determine tile starting point offsets (multidim) -// determine local index offsets (multidim) -// concatentate tile offset + local offset for global multi-dim index -// if offset withinin range bounds AND local offset within tile bounds, call -// functor - -// ValueType = T -// Rank 2 -// Specializations for void tag type -template -struct DeviceIterateTile< - 2, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - // Deduce this blocks tile_id - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - m_local_offset[i] = (thrd_idx % 
m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_v); - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 2, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % 
m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Rank 3 -// Specializations for void tag type -template -struct DeviceIterateTile< - 3, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < 
m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile< - 3, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * 
m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Rank 4 -// Specializations for void tag type -template -struct DeviceIterateTile< - 4, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - 
(index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile< - 4, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename 
RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % 
m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Rank 5 -// Specializations for void tag type -template -struct DeviceIterateTile< - 5, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = 
(thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 5, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id 
offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Rank 6 -// Specializations for void tag type -template -struct DeviceIterateTile< - 6, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr 
index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < 
m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 6, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < 
m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// ValueType = T[], T* -// Rank 2 -// Specializations for void tag type -template -struct DeviceIterateTile< - 2, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) 
{ - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 2, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - 
index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_v); - } - } - } // end for loop over num_tiles - product of tiles in each direction - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Rank 3 -// Specializations for void tag type -template -struct DeviceIterateTile< - 3, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles 
&& - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile< - 3, RP, Functor, Tag, 
ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) 
{ - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Rank 4 -// Specializations for void tag type -template -struct DeviceIterateTile< - 4, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < 
m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile< - 4, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - 
index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Rank 5 -// Specializations for void tag type -template -struct DeviceIterateTile< - 5, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // 
static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], 
m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 5, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < 
m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Rank 6 -// Specializations for void tag type -template -struct DeviceIterateTile< - 6, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; 
tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 6, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type 
max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < 
m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -} // namespace Reduce - -// ---------------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - -#endif -#endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index cbe1a7e74a..4a30c914f0 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -146,9 +146,9 @@ void CudaSpace::access_error(const void *const) { bool CudaUVMSpace::available() { #if defined(CUDA_VERSION) && !defined(__APPLE__) - enum { UVM_available = true }; + enum : bool { UVM_available = true }; #else - enum { UVM_available = false }; + enum : bool { UVM_available = false }; #endif return UVM_available; } @@ -201,8 +201,15 @@ CudaHostPinnedSpace::CudaHostPinnedSpace() {} void *CudaSpace::allocate(const size_t arg_alloc_size) const { return allocate("[unlabeled]", arg_alloc_size); } + void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *CudaSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; auto error_code = cudaMalloc(&ptr, arg_alloc_size); @@ -219,9 +226,7 @@ void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, ptr, - reported_size); + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; } @@ -231,6 +236,12 @@ void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const { } void *CudaUVMSpace::allocate(const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *CudaUVMSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; Cuda::impl_static_fence(); @@ -260,19 +271,22 @@ void *CudaUVMSpace::allocate(const char *arg_label, const size_t arg_alloc_size, if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, ptr, - reported_size); + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; } - void *CudaHostPinnedSpace::allocate(const size_t arg_alloc_size) const { return allocate("[unlabeled]", arg_alloc_size); } void *CudaHostPinnedSpace::allocate(const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *CudaHostPinnedSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; auto error_code = cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault); @@ -288,9 +302,7 @@ void *CudaHostPinnedSpace::allocate(const char *arg_label, if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, ptr, - reported_size); + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; } @@ -304,12 +316,17 @@ void CudaSpace::deallocate(void *const arg_alloc_ptr, void CudaSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void CudaSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr, - reported_size); + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); } try { @@ -327,13 +344,21 @@ void CudaUVMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, , const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void CudaUVMSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size + + , + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { Cuda::impl_static_fence(); if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr, - reported_size); + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); } try { if (arg_alloc_ptr != nullptr) { @@ -349,17 +374,22 @@ void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const { deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); } - void CudaHostPinnedSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} + +void CudaHostPinnedSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr, - reported_size); + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); } try { CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); @@ -375,7 +405,7 @@ void CudaHostPinnedSpace::deallocate(const char *arg_label, namespace Kokkos { namespace Impl { -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG SharedAllocationRecord SharedAllocationRecord::s_root_record; @@ -551,7 +581,7 @@ SharedAllocationRecord::SharedAllocationRecord( // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function : SharedAllocationRecord( -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif Impl::checked_allocation_with_header(arg_space, arg_label, @@ -582,7 +612,7 @@ SharedAllocationRecord::SharedAllocationRecord( // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function : SharedAllocationRecord( -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif Impl::checked_allocation_with_header(arg_space, arg_label, @@ -610,7 +640,7 @@ SharedAllocationRecord:: // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function : SharedAllocationRecord( -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -830,7 +860,7 @@ void SharedAllocationRecord::print_records( std::ostream &s, const Kokkos::CudaSpace &, bool detail) { (void)s; (void)detail; -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG SharedAllocationRecord *r = &s_root_record; char buffer[256]; @@ -896,7 +926,7 @@ void SharedAllocationRecord::print_records( #else Kokkos::Impl::throw_runtime_exception( "SharedAllocationHeader::print_records only works with " - "KOKKOS_DEBUG enabled"); + "KOKKOS_ENABLE_DEBUG enabled"); 
#endif } @@ -904,13 +934,13 @@ void SharedAllocationRecord::print_records( std::ostream &s, const Kokkos::CudaUVMSpace &, bool detail) { (void)s; (void)detail; -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG SharedAllocationRecord::print_host_accessible_records( s, "CudaUVM", &s_root_record, detail); #else Kokkos::Impl::throw_runtime_exception( "SharedAllocationHeader::print_records only works with " - "KOKKOS_DEBUG enabled"); + "KOKKOS_ENABLE_DEBUG enabled"); #endif } @@ -918,13 +948,13 @@ void SharedAllocationRecord::print_records( std::ostream &s, const Kokkos::CudaHostPinnedSpace &, bool detail) { (void)s; (void)detail; -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG SharedAllocationRecord::print_host_accessible_records( s, "CudaHostPinned", &s_root_record, detail); #else Kokkos::Impl::throw_runtime_exception( "SharedAllocationHeader::print_records only works with " - "KOKKOS_DEBUG enabled"); + "KOKKOS_ENABLE_DEBUG enabled"); #endif } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index 5a143fd267..0d6d3bdb3a 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -198,6 +198,39 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, LaunchBounds{}); } +// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1) +// NOTE these number can be obtained several ways: +// * One option is to download the CUDA Occupancy Calculator spreadsheet, select +// "Compute Capability" first and check what is the smallest "Shared Memory +// Size Config" that is available. The "Shared Memory Per Multiprocessor" in +// bytes is then to be found below in the summary. 
+// * Another option would be to look for the information in the "Tuning +// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in +// the "Shared Memory" section (more tedious) +inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { + int const compute_capability = properties.major * 10 + properties.minor; + return [compute_capability]() { + switch (compute_capability) { + case 30: + case 32: + case 35: return 16; + case 37: return 80; + case 50: + case 53: + case 60: + case 62: return 64; + case 52: + case 61: return 96; + case 70: + case 80: return 8; + case 75: return 32; + default: + Kokkos::Impl::throw_runtime_exception( + "Unknown device in cuda block size deduction"); + } + return 0; + }() * 1024; +} } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp new file mode 100644 index 0000000000..d6fadd82c0 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -0,0 +1,210 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_CUDA_GRAPHNODEKERNEL_IMPL_HPP +#define KOKKOS_KOKKOS_CUDA_GRAPHNODEKERNEL_IMPL_HPP + +#include + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_CUDA_ENABLE_GRAPHS) + +#include + +#include // GraphAccess needs to be complete +#include // SharedAllocationRecord + +#include +#include +#include + +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class GraphNodeKernelImpl + : public PatternImplSpecializationFromTag::type { + private: + using base_t = + typename PatternImplSpecializationFromTag::type; + using size_type = Kokkos::Cuda::size_type; + // These are really functioning as optional references, though I'm not sure + // that the cudaGraph_t one needs to be since it's a pointer under the + // covers and we're not modifying it + Kokkos::ObservingRawPtr m_graph_ptr = nullptr; + Kokkos::ObservingRawPtr m_graph_node_ptr = 
nullptr; + // Note: owned pointer to CudaSpace memory (used for global memory launches), + // which we're responsible for deallocating, but not responsible for calling + // its destructor. + using Record = Kokkos::Impl::SharedAllocationRecord; + // Basically, we have to make this mutable for the same reasons that the + // global kernel buffers in the Cuda instance are mutable... + mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; + + public: + using Policy = PolicyType; + using graph_kernel = GraphNodeKernelImpl; + + // TODO Ensure the execution space of the graph is the same as the one + // attached to the policy? + // TODO @graph kernel name info propagation + template + GraphNodeKernelImpl(std::string, Kokkos::Cuda const&, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + // This is super ugly, but it works for now and is the most minimal change + // to the codebase for now... + : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, + (ArgsDeduced &&) args...) 
{} + + // FIXME @graph Forward through the instance once that works in the backends + template + GraphNodeKernelImpl(Kokkos::Cuda const& ex, Functor arg_functor, + PolicyDeduced&& arg_policy) + : GraphNodeKernelImpl("", ex, std::move(arg_functor), + (PolicyDeduced &&) arg_policy) {} + + ~GraphNodeKernelImpl() { + if (m_driver_storage) { + // We should be the only owner, but this is still the easiest way to + // allocate and deallocate aligned memory for these sorts of things + Record::decrement(Record::get_record(m_driver_storage)); + } + } + + void set_cuda_graph_ptr(cudaGraph_t* arg_graph_ptr) { + m_graph_ptr = arg_graph_ptr; + } + void set_cuda_graph_node_ptr(cudaGraphNode_t* arg_node_ptr) { + m_graph_node_ptr = arg_node_ptr; + } + cudaGraphNode_t* get_cuda_graph_node_ptr() const { return m_graph_node_ptr; } + cudaGraph_t const* get_cuda_graph_ptr() const { return m_graph_ptr; } + + Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { + KOKKOS_EXPECTS(m_driver_storage == nullptr) + + auto* record = Record::allocate( + Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage", + sizeof(base_t)); + + Record::increment(record); + m_driver_storage = reinterpret_cast(record->data()); + KOKKOS_ENSURES(m_driver_storage != nullptr) + return m_driver_storage; + } +}; + +struct CudaGraphNodeAggregateKernel { + using graph_kernel = CudaGraphNodeAggregateKernel; + + // Aggregates don't need a policy, but for the purposes of checking the static + // assertions about graph kerenls, + struct Policy { + using is_graph_kernel = std::true_type; + }; +}; + +template ::type> +struct get_graph_node_kernel_type + : identity> {}; +template +struct get_graph_node_kernel_type + : identity> {}; + +//============================================================================== +// {{{1 + +template +auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type::type; + auto const& 
kernel_as_graph_kernel = + static_cast(kernel); + // TODO @graphs we need to somehow indicate the need for a fence in the + // destructor of the GraphImpl object (so that we don't have to + // just always do it) + return kernel_as_graph_kernel.allocate_driver_memory_buffer(); +} + +template +auto const& get_cuda_graph_from_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type::type; + auto const& kernel_as_graph_kernel = + static_cast(kernel); + cudaGraph_t const* graph_ptr = kernel_as_graph_kernel.get_cuda_graph_ptr(); + KOKKOS_EXPECTS(graph_ptr != nullptr); + return *graph_ptr; +} + +template +auto& get_cuda_graph_node_from_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type::type; + auto const& kernel_as_graph_kernel = + static_cast(kernel); + auto* graph_node_ptr = kernel_as_graph_kernel.get_cuda_graph_node_ptr(); + KOKKOS_EXPECTS(graph_node_ptr != nullptr); + return *graph_node_ptr; +} + +// end get_cuda_graph_*() helper functions }}}1 +//============================================================================== + +} // end namespace Impl +} // end namespace Kokkos + +#endif // defined(KOKKOS_ENABLE_CUDA) +#endif // KOKKOS_KOKKOS_CUDA_GRAPHNODEKERNEL_IMPL_HPP diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp similarity index 52% rename from lib/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp rename to lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp index 989a4aec90..f4539cd2ca 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp @@ -42,85 +42,62 @@ //@HEADER */ -#include +#ifndef KOKKOS_KOKKOS_CUDA_GRAPHNODE_IMPL_HPP +#define KOKKOS_KOKKOS_CUDA_GRAPHNODE_IMPL_HPP + #include -#if !defined(KOKKOS_ROCM_INVOKE_H) -#define KOKKOS_ROCM_INVOKE_H +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_CUDA_ENABLE_GRAPHS) + 
+#include + +#include // GraphAccess needs to be complete + +#include +#include namespace Kokkos { namespace Impl { -template ()), int>::type = 0> -KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs) { - f(Tag(), static_cast(xs)...); -} +template <> +struct GraphNodeBackendSpecificDetails { + cudaGraphNode_t node = nullptr; -template ()), int>::type = 0> -KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs) { - f(static_cast(xs)...); -} + //---------------------------------------------------------------------------- + // {{{2 -template -struct rocm_invoke_fn { - F* f; - rocm_invoke_fn(F& f_) : f(&f_) {} + explicit GraphNodeBackendSpecificDetails() = default; - template - KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const { - rocm_invoke(*f, static_cast(xs)...); - } + explicit GraphNodeBackendSpecificDetails( + _graph_node_is_root_ctor_tag) noexcept {} + + // end Ctors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- }; -template -KOKKOS_INLINE_FUNCTION rocm_invoke_fn make_rocm_invoke_fn(F& f) { - return {f}; -} +template +struct GraphNodeBackendDetailsBeforeTypeErasure { + protected: + //---------------------------------------------------------------------------- + // {{{2 -template -KOKKOS_INLINE_FUNCTION T& rocm_unwrap(T& x) { - return x; -} + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::Cuda const&, Kernel&, PredecessorRef const&, + GraphNodeBackendSpecificDetails&) noexcept {} -template -KOKKOS_INLINE_FUNCTION T& rocm_unwrap(std::reference_wrapper x) { - return x; -} + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::Cuda const&, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails&) noexcept {} -template -struct rocm_capture_fn { - F f; - T data; - - KOKKOS_INLINE_FUNCTION rocm_capture_fn(F f_, T x) : f(f_), data(x) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(Ts&&... 
xs) const { - f(rocm_unwrap(data), static_cast(xs)...); - } + // end ctors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- }; -template -KOKKOS_INLINE_FUNCTION rocm_capture_fn rocm_capture(F f, T x) { - return {f, x}; -} +} // end namespace Impl +} // end namespace Kokkos -template -KOKKOS_INLINE_FUNCTION auto rocm_capture(F f, T x, U y, Ts... xs) - -> decltype(rocm_capture(rocm_capture(f, x), y, xs...)) { - return rocm_capture(rocm_capture(f, x), y, xs...); -} +#include -struct rocm_apply_op { - template - KOKKOS_INLINE_FUNCTION void operator()(F&& f, Ts&&... xs) const { - f(static_cast(xs)...); - } -}; - -} // namespace Impl -} // namespace Kokkos - -#endif +#endif // defined(KOKKOS_ENABLE_CUDA) +#endif // KOKKOS_KOKKOS_CUDA_GRAPHNODE_IMPL_HPP diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp new file mode 100644 index 0000000000..3de7a69916 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -0,0 +1,219 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP +#define KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP + +#include + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_CUDA_ENABLE_GRAPHS) + +#include + +#include // GraphAccess needs to be complete + +// GraphNodeImpl needs to be complete because GraphImpl here is a full +// specialization and not just a partial one +#include +#include + +#include +#include + +namespace Kokkos { +namespace Impl { + +template <> +struct GraphImpl { + public: + using execution_space = Kokkos::Cuda; + + private: + execution_space m_execution_space; + cudaGraph_t m_graph = nullptr; + cudaGraphExec_t m_graph_exec = nullptr; + + using cuda_graph_flags_t = unsigned int; + + using node_details_t = GraphNodeBackendSpecificDetails; + + void _instantiate_graph() { + constexpr size_t error_log_size = 256; + cudaGraphNode_t error_node = nullptr; + char 
error_log[error_log_size]; + CUDA_SAFE_CALL(cudaGraphInstantiate(&m_graph_exec, m_graph, &error_node, + error_log, error_log_size)); + // TODO @graphs print out errors + } + + public: + using root_node_impl_t = + GraphNodeImpl; + using aggregate_kernel_impl_t = CudaGraphNodeAggregateKernel; + using aggregate_node_impl_t = + GraphNodeImpl; + + // Not moveable or copyable; it spends its whole life as a shared_ptr in the + // Graph object + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl const&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; + ~GraphImpl() { + // TODO @graphs we need to somehow indicate the need for a fence in the + // destructor of the GraphImpl object (so that we don't have to + // just always do it) + m_execution_space.fence(); + KOKKOS_EXPECTS(bool(m_graph)) + if (bool(m_graph_exec)) { + CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec)); + } + CUDA_SAFE_CALL(cudaGraphDestroy(m_graph)); + }; + + explicit GraphImpl(Kokkos::Cuda arg_instance) + : m_execution_space(std::move(arg_instance)) { + CUDA_SAFE_CALL(cudaGraphCreate(&m_graph, cuda_graph_flags_t{0})); + } + + void add_node(std::shared_ptr const& arg_node_ptr) { + // All of the predecessors are just added as normal, so all we need to + // do here is add an empty node + CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), + m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); + } + + template + // requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl + // Also requires that the kernel has the graph node tag in it's policy + void add_node(std::shared_ptr const& arg_node_ptr) { + static_assert( + NodeImpl::kernel_type::Policy::is_graph_kernel::value, + "Something has gone horribly wrong, but it's too complicated to " + "explain here. 
Buy Daisy a coffee and she'll explain it to you."); + KOKKOS_EXPECTS(bool(arg_node_ptr)); + // The Kernel launch from the execute() method has been shimmed to insert + // the node into the graph + auto& kernel = arg_node_ptr->get_kernel(); + // note: using arg_node_ptr->node_details_t::node caused an ICE in NVCC 10.1 + auto& cuda_node = static_cast(arg_node_ptr.get())->node; + KOKKOS_EXPECTS(!bool(cuda_node)); + kernel.set_cuda_graph_ptr(&m_graph); + kernel.set_cuda_graph_node_ptr(&cuda_node); + kernel.execute(); + KOKKOS_ENSURES(bool(cuda_node)); + } + + template + // requires PredecessorRef is a specialization of GraphNodeRef that has + // already been added to this graph and NodeImpl is a specialization of + // GraphNodeImpl that has already been added to this graph. + void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { + KOKKOS_EXPECTS(bool(arg_node_ptr)) + auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); + KOKKOS_EXPECTS(bool(pred_ptr)) + + // clang-format off + // NOTE const-qualifiers below are commented out because of an API break + // from CUDA 10.0 to CUDA 10.1 + // cudaGraphAddDependencies(cudaGraph_t, cudaGraphNode_t*, cudaGraphNode_t*, size_t) + // cudaGraphAddDependencies(cudaGraph_t, const cudaGraphNode_t*, const cudaGraphNode_t*, size_t) + // clang-format on + auto /*const*/& pred_cuda_node = pred_ptr->node_details_t::node; + KOKKOS_EXPECTS(bool(pred_cuda_node)) + + auto /*const*/& cuda_node = arg_node_ptr->node_details_t::node; + KOKKOS_EXPECTS(bool(cuda_node)) + + CUDA_SAFE_CALL( + cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1)); + } + + void submit() { + if (!bool(m_graph_exec)) { + _instantiate_graph(); + } + CUDA_SAFE_CALL( + cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream())); + } + + execution_space const& get_execution_space() const noexcept { + return m_execution_space; + } + + auto create_root_node_ptr() { + KOKKOS_EXPECTS(bool(m_graph)) + KOKKOS_EXPECTS(!bool(m_graph_exec)) 
+ auto rv = std::make_shared( + get_execution_space(), _graph_node_is_root_ctor_tag{}); + CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); + KOKKOS_ENSURES(bool(rv->node_details_t::node)) + return rv; + } + + template + // See requirements/expectations in GraphBuilder + auto create_aggregate_ptr(PredecessorRefs&&...) { + // The attachment to predecessors, which is all we really need, happens + // in the generic layer, which calls through to add_predecessor for + // each predecessor ref, so all we need to do here is create the (trivial) + // aggregate node. + return std::make_shared( + m_execution_space, _graph_node_kernel_ctor_tag{}, + aggregate_kernel_impl_t{}); + } +}; + +} // end namespace Impl +} // end namespace Kokkos + +#endif // defined(KOKKOS_ENABLE_CUDA) +#endif // KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp new file mode 100644 index 0000000000..a9a62380e5 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp @@ -0,0 +1,710 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_HALF_HPP_ +#define KOKKOS_CUDA_HALF_HPP_ + +#include +#ifdef KOKKOS_ENABLE_CUDA +#if !(defined(KOKKOS_COMPILER_CLANG) && KOKKOS_COMPILER_CLANG < 900) && \ + !(defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL50) || \ + defined(KOKKOS_ARCH_MAXWELL52)) +#include + +#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED +// Make sure no one else tries to define half_t +#define KOKKOS_IMPL_HALF_TYPE_DEFINED + +namespace Kokkos { +namespace Impl { +struct half_impl_t { + using type = __half; +}; +} // namespace Impl +namespace Experimental { + +// Forward declarations +class half_t; + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(float val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(bool val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(double val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(short val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(int val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned short val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned int val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long long val); + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + 
cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t); + +class half_t { + public: + using impl_type = Kokkos::Impl::half_impl_t::type; + + private: + impl_type val; + + public: + KOKKOS_FUNCTION + half_t() : val(0.0F) {} + + // Don't support implicit conversion back to impl_type. + // impl_type is a storage only type on host. + KOKKOS_FUNCTION + explicit operator impl_type() const { return val; } + KOKKOS_FUNCTION + explicit operator float() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator bool() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator double() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator short() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator int() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator long() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator long long() const { + return cast_from_half(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned short() const { + return cast_from_half(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned int() const { + return cast_from_half(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned long() const { + return cast_from_half(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned long long() const { + return cast_from_half(*this); + } + + KOKKOS_FUNCTION + half_t(impl_type rhs) : val(rhs) {} + KOKKOS_FUNCTION + explicit half_t(float rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(bool rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(double rhs) : 
val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(short rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(int rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(long rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(long long rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(unsigned long long rhs) : val(cast_to_half(rhs).val) {} + + // Unary operators + KOKKOS_FUNCTION + half_t operator+() const { + half_t tmp = *this; +#ifdef __CUDA_ARCH__ + tmp.val = +tmp.val; +#else + tmp.val = __float2half(+__half2float(tmp.val)); +#endif + return tmp; + } + + KOKKOS_FUNCTION + half_t operator-() const { + half_t tmp = *this; +#ifdef __CUDA_ARCH__ + tmp.val = -tmp.val; +#else + tmp.val = __float2half(-__half2float(tmp.val)); +#endif + return tmp; + } + + // Prefix operators + KOKKOS_FUNCTION + half_t& operator++() { +#ifdef __CUDA_ARCH__ + ++val; +#else + float tmp = __half2float(val); + ++tmp; + val = __float2half(tmp); +#endif + return *this; + } + + KOKKOS_FUNCTION + half_t& operator--() { +#ifdef __CUDA_ARCH__ + --val; +#else + float tmp = __half2float(val); + --tmp; + val = __float2half(tmp); +#endif + return *this; + } + + // Postfix operators + KOKKOS_FUNCTION + half_t operator++(int) { + half_t tmp = *this; + operator++(); + return tmp; + } + + KOKKOS_FUNCTION + half_t operator--(int) { + half_t tmp = *this; + operator--(); + return tmp; + } + + // Binary operators + KOKKOS_FUNCTION + half_t& operator=(impl_type rhs) { + val = rhs; + return *this; + } + + template + KOKKOS_FUNCTION half_t& operator=(T rhs) { + val = cast_to_half(rhs).val; + return *this; + } + + // Compound operators + KOKKOS_FUNCTION + 
half_t& operator+=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val += rhs.val; +#else + val = __float2half(__half2float(val) + __half2float(rhs.val)); +#endif + return *this; + } + + KOKKOS_FUNCTION + half_t& operator-=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val -= rhs.val; +#else + val = __float2half(__half2float(val) - __half2float(rhs.val)); +#endif + return *this; + } + + KOKKOS_FUNCTION + half_t& operator*=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val *= rhs.val; +#else + val = __float2half(__half2float(val) * __half2float(rhs.val)); +#endif + return *this; + } + + KOKKOS_FUNCTION + half_t& operator/=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val /= rhs.val; +#else + val = __float2half(__half2float(val) / __half2float(rhs.val)); +#endif + return *this; + } + + // Binary Arithmetic + KOKKOS_FUNCTION + half_t friend operator+(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val += rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); +#endif + return lhs; + } + + KOKKOS_FUNCTION + half_t friend operator-(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val -= rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); +#endif + return lhs; + } + + KOKKOS_FUNCTION + half_t friend operator*(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val *= rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); +#endif + return lhs; + } + + KOKKOS_FUNCTION + half_t friend operator/(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val /= rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); +#endif + return lhs; + } + + // Logical operators + KOKKOS_FUNCTION + bool operator!() const { +#ifdef __CUDA_ARCH__ + return static_cast(!val); +#else + return !__half2float(val); +#endif + } + + // NOTE: Loses short-circuit evaluation + KOKKOS_FUNCTION + bool operator&&(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val && rhs.val); 
+#else + return __half2float(val) && __half2float(rhs.val); +#endif + } + + // NOTE: Loses short-circuit evaluation + KOKKOS_FUNCTION + bool operator||(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val || rhs.val); +#else + return __half2float(val) || __half2float(rhs.val); +#endif + } + + // Comparison operators + KOKKOS_FUNCTION + bool operator==(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val == rhs.val); +#else + return __half2float(val) == __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator!=(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val != rhs.val); +#else + return __half2float(val) != __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator<(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val < rhs.val); +#else + return __half2float(val) < __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator>(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val > rhs.val); +#else + return __half2float(val) > __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator<=(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val <= rhs.val); +#else + return __half2float(val) <= __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator>=(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val >= rhs.val); +#else + return __half2float(val) >= __half2float(rhs.val); +#endif + } +}; + +// CUDA before 11.1 only has the half <-> float conversions marked host device +// So we will largely convert to float on the host for conversion +// But still call the correct functions on the device +#if (CUDA_VERSION < 11100) + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(half_t val) { return val; } + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(float val) { return half_t(__float2half(val)); } + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(bool val) { return cast_to_half(static_cast(val)); } + +KOKKOS_INLINE_FUNCTION 
+half_t cast_to_half(double val) { + // double2half was only introduced in CUDA 11 too + return half_t(__float2half(static_cast(val))); +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(short val) { +#ifdef __CUDA_ARCH__ + return half_t(__short2half_rn(val)); +#else + return half_t(__float2half(static_cast(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned short val) { +#ifdef __CUDA_ARCH__ + return half_t(__ushort2half_rn(val)); +#else + return half_t(__float2half(static_cast(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(int val) { +#ifdef __CUDA_ARCH__ + return half_t(__int2half_rn(val)); +#else + return half_t(__float2half(static_cast(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned int val) { +#ifdef __CUDA_ARCH__ + return half_t(__uint2half_rn(val)); +#else + return half_t(__float2half(static_cast(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long long val) { +#ifdef __CUDA_ARCH__ + return half_t(__ll2half_rn(val)); +#else + return half_t(__float2half(static_cast(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long long val) { +#ifdef __CUDA_ARCH__ + return half_t(__ull2half_rn(val)); +#else + return half_t(__float2half(static_cast(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long val) { + return cast_to_half(static_cast(val)); +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long val) { + return cast_to_half(static_cast(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return __half2float(half_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return static_cast(cast_from_half(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return static_cast(__half2float(half_t::impl_type(val))); +} + +template +KOKKOS_INLINE_FUNCTION 
std::enable_if_t::value, T> +cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2short_rz(half_t::impl_type(val)); +#else + return static_cast(__half2float(half_t::impl_type(val))); +#endif +} + +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2ushort_rz(half_t::impl_type(val)); +#else + return static_cast(__half2float(half_t::impl_type(val))); +#endif +} +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2int_rz(half_t::impl_type(val)); +#else + return static_cast(__half2float(half_t::impl_type(val))); +#endif +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2uint_rz(half_t::impl_type(val)); +#else + return static_cast(__half2float(half_t::impl_type(val))); +#endif +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2ll_rz(half_t::impl_type(val)); +#else + return static_cast(__half2float(half_t::impl_type(val))); +#endif +} + +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2ull_rz(half_t::impl_type(val)); +#else + return static_cast(__half2float(half_t::impl_type(val))); +#endif +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return static_cast(cast_from_half(val)); +} + +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t val) { + return static_cast(cast_from_half(val)); +} + +#else // CUDA 11.1 versions follow + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(float val) { return __float2half(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(double val) { return __double2half(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(short val) { return 
__short2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned short val) { return __ushort2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(int val) { return __int2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned int val) { return __uint2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long long val) { return __ll2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long long val) { return __ull2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long val) { + return cast_to_half(static_cast(val)); +} +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long val) { + return cast_to_half(static_cast(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return __half2float(val); +} +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return __half2double(val); +} +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return __half2short_rz(val); +} +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t val) { + return __half2ushort_rz(val); +} +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return __half2int_rz(val); +} +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return __half2uint_rz(val); +} +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return __half2ll_rz(val); +} +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t val) { + return __half2ull_rz(val); +} +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> +cast_from_half(half_t val) { + return static_cast(cast_from_half(val)); +} +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t val) { + return static_cast(cast_from_half(val)); +} +#endif +} 
// namespace Experimental +} // namespace Kokkos +#endif // KOKKOS_IMPL_HALF_TYPE_DEFINED +#endif // KOKKOS_ENABLE_CUDA +#endif // Disables for half_t on cuda: + // Clang/8||KEPLER30||KEPLER32||KEPLER37||MAXWELL50||MAXWELL52 +#endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 3e5042a593..b8e8163458 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -132,7 +132,7 @@ int cuda_kernel_arch() { bool cuda_launch_blocking() { const char *env = getenv("CUDA_LAUNCH_BLOCKING"); - if (env == 0) return false; + if (env == nullptr) return false; return std::stoi(env); } @@ -509,14 +509,14 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { const char *env_force_device_alloc = getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC"); bool force_device_alloc; - if (env_force_device_alloc == 0) + if (env_force_device_alloc == nullptr) force_device_alloc = false; else force_device_alloc = std::stoi(env_force_device_alloc) != 0; const char *env_visible_devices = getenv("CUDA_VISIBLE_DEVICES"); bool visible_devices_one = true; - if (env_visible_devices == 0) visible_devices_one = false; + if (env_visible_devices == nullptr) visible_devices_one = false; if (Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc)) { @@ -893,6 +893,92 @@ const cudaDeviceProp &Cuda::cuda_device_prop() const { return m_space_instance->m_deviceProp; } +namespace Impl { + +int get_gpu(const InitArguments &args); + +int g_cuda_space_factory_initialized = + initialize_space_factory("150_Cuda"); + +void CudaSpaceInitializer::initialize(const InitArguments &args) { + int use_gpu = get_gpu(args); + if (std::is_same::value || + 0 < use_gpu) { + if (use_gpu > -1) { + Kokkos::Cuda::impl_initialize(Kokkos::Cuda::SelectDevice(use_gpu)); + } else { + Kokkos::Cuda::impl_initialize(); + } + } +} + +void CudaSpaceInitializer::finalize(bool 
all_spaces) { + if ((std::is_same::value || + all_spaces) && + Kokkos::Cuda::impl_is_initialized()) { + Kokkos::Cuda::impl_finalize(); + } +} + +void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); } + +void CudaSpaceInitializer::print_configuration(std::ostream &msg, + const bool detail) { + msg << "Device Execution Space:" << std::endl; + msg << " KOKKOS_ENABLE_CUDA: "; + msg << "yes" << std::endl; + + msg << "Cuda Atomics:" << std::endl; + msg << " KOKKOS_ENABLE_CUDA_ATOMICS: "; +#ifdef KOKKOS_ENABLE_CUDA_ATOMICS + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + + msg << "Cuda Options:" << std::endl; + msg << " KOKKOS_ENABLE_CUDA_LAMBDA: "; +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; +#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + msg << " KOKKOS_ENABLE_CUDA_UVM: "; +#ifdef KOKKOS_ENABLE_CUDA_UVM + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + msg << " KOKKOS_ENABLE_CUSPARSE: "; +#ifdef KOKKOS_ENABLE_CUSPARSE + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + + msg << "\nCuda Runtime Configuration:" << std::endl; + Cuda::print_configuration(msg, detail); +} +} // namespace Impl + } // namespace Kokkos namespace Kokkos { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 56f3f71794..13773d70c5 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -34,7 +34,9 @@ struct CudaTraits { enum : CudaSpace::size_type { KernelArgumentLimit = 0x001000 /* 4k bytes */ }; - + enum : CudaSpace::size_type { + MaxHierarchicalParallelism = 1024 /* team_size * vector_length */ + }; using ConstantGlobalBufferType = unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index dfd179c79c..39404e0bf3 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -48,20 +48,23 @@ #include #ifdef KOKKOS_ENABLE_CUDA +#include #include #include +#include #include #include #include #include #include #include +#include +#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#if defined(__CUDACC__) - /** \brief Access to constant memory on the device */ #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE @@ -140,29 +143,85 @@ __global__ __launch_bounds__( driver->operator()(); } -template -__global__ static void cuda_parallel_launch_constant_or_global_memory( - const DriverType* driver_ptr) { - const DriverType& driver = - driver_ptr != nullptr - ? *driver_ptr - : *((const DriverType*)kokkos_impl_cuda_constant_memory_buffer); +//============================================================================== +// {{{1 - driver(); +inline bool is_empty_launch(dim3 const& grid, dim3 const& block) { + return (grid.x == 0) || ((block.x * block.y * block.z) == 0); } -template -__global__ -__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_constant_or_global_memory( - const DriverType* driver_ptr) { - const DriverType& driver = - driver_ptr != nullptr - ? 
*driver_ptr - : *((const DriverType*)kokkos_impl_cuda_constant_memory_buffer); - - driver(); +inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { + if (cuda_instance->m_maxShmemPerBlock < shmem) { + Kokkos::Impl::throw_runtime_exception( + std::string("CudaParallelLaunch (or graph node creation) FAILED: shared" + " memory request is too large")); + } } +template +inline void configure_shmem_preference(KernelFuncPtr const& func, + bool prefer_shmem) { +#ifndef KOKKOS_ARCH_KEPLER + // On Kepler the L1 has no benefit since it doesn't cache reads + auto set_cache_config = [&] { + CUDA_SAFE_CALL(cudaFuncSetCacheConfig( + func, + (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1))); + return prefer_shmem; + }; + static bool cache_config_preference_cached = set_cache_config(); + if (cache_config_preference_cached != prefer_shmem) { + cache_config_preference_cached = set_cache_config(); + } +#else + // Use the parameters so we don't get a warning + (void)func; + (void)prefer_shmem; +#endif +} + +template +std::enable_if_t +modify_launch_configuration_if_desired_occupancy_is_specified( + Policy const& policy, cudaDeviceProp const& properties, + cudaFuncAttributes const& attributes, dim3 const& block, int& shmem, + bool& prefer_shmem) { + int const block_size = block.x * block.y * block.z; + int const desired_occupancy = policy.impl_get_desired_occupancy().value(); + + size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties); + size_t const static_shmem = attributes.sharedSizeBytes; + + // round to nearest integer and avoid division by zero + int active_blocks = std::max( + 1, static_cast(std::round( + static_cast(properties.maxThreadsPerMultiProcessor) / + block_size * desired_occupancy / 100))); + int const dynamic_shmem = + shmem_per_sm_prefer_l1 / active_blocks - static_shmem; + + if (dynamic_shmem > shmem) { + shmem = dynamic_shmem; + prefer_shmem = false; + } +} + +template +std::enable_if_t 
+modify_launch_configuration_if_desired_occupancy_is_specified( + Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&, + dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {} + +// end Some helper functions for launch code readability }}}1 +//============================================================================== + +//============================================================================== +// {{{2 + +// Use local memory up to ConstantMemoryUseThreshold +// Use global memory above ConstantMemoryUsage +// In between use ConstantMemory + template struct DeduceCudaLaunchMechanism { constexpr static const Kokkos::Experimental::WorkItemProperty:: @@ -217,408 +276,369 @@ struct DeduceCudaLaunchMechanism { : Experimental::CudaLaunchMechanism::GlobalMemory) : (default_launch_mechanism)); }; -// Use local memory up to ConstantMemoryUseThreshold -// Use global memory above ConstantMemoryUsage -// In between use ConstantMemory -template , - Experimental::CudaLaunchMechanism LaunchMechanism = - DeduceCudaLaunchMechanism::launch_mechanism> -struct CudaParallelLaunch; + +// end DeduceCudaLaunchMechanism }}}2 +//============================================================================== + +//============================================================================== +// {{{1 + +// Base classes that summarize the differences between the different launch +// mechanisms + +template +struct CudaParallelLaunchKernelFunc; + +template +struct CudaParallelLaunchKernelInvoker; + +//------------------------------------------------------------------------------ +// {{{2 template -struct CudaParallelLaunch< - DriverType, Kokkos::LaunchBounds, - Experimental::CudaLaunchMechanism::ConstantMemory> { - static_assert(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage, - "Kokkos Error: Requested CudaLaunchConstantMemory with a " - "Functor larger than 32kB."); - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, 
const int shmem, - const CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_constant_memory< - DriverType, MaxThreadsPerBlock, MinBlocksPerSM>, - (prefer_shmem ? cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - - // Wait until the previous kernel that uses the constant buffer is done - CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable)); - - // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; - memcpy(staging, &driver, sizeof(DriverType)); - - // Copy functor asynchronously from there to constant memory on the device - cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging, - sizeof(DriverType), 0, cudaMemcpyHostToDevice, - cudaStream_t(cuda_instance->m_stream)); - - // Invoke the driver function on the device - cuda_parallel_launch_constant_memory - <<m_stream>>>(); - - // Record an event that says when the constant buffer can be reused - CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable, - cudaStream_t(cuda_instance->m_stream))); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); -#endif - } - } - - static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local 
variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. - auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, - cuda_parallel_launch_constant_memory)); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; - } -}; - -template -struct CudaParallelLaunch, - Experimental::CudaLaunchMechanism::ConstantMemory> { - static_assert(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage, - "Kokkos Error: Requested CudaLaunchConstantMemory with a " - "Functor larger than 32kB."); - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, const int shmem, - const CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_constant_memory, - (prefer_shmem ? 
cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - - // Wait until the previous kernel that uses the constant buffer is done - CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable)); - - // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; - memcpy(staging, &driver, sizeof(DriverType)); - - // Copy functor asynchronously from there to constant memory on the device - cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging, - sizeof(DriverType), 0, cudaMemcpyHostToDevice, - cudaStream_t(cuda_instance->m_stream)); - - // Invoke the driver function on the device - cuda_parallel_launch_constant_memory - <<m_stream>>>(); - - // Record an event that says when the constant buffer can be reused - CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable, - cudaStream_t(cuda_instance->m_stream))); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); -#endif - } - } - - static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. 
- auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, cuda_parallel_launch_constant_memory)); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; - } -}; - -template -struct CudaParallelLaunch< +struct CudaParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory> { - static_assert(sizeof(DriverType) < CudaTraits::KernelArgumentLimit, - "Kokkos Error: Requested CudaLaunchLocalMemory with a Functor " - "larger than 4096 bytes."); - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, const int shmem, - const CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_local_memory, - (prefer_shmem ? 
cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - - // Invoke the driver function on the device - cuda_parallel_launch_local_memory - <<m_stream>>>(driver); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); -#endif - } - } - - static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. - auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, - cuda_parallel_launch_local_memory)); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; + static std::decay_t)> + get_kernel_func() { + return cuda_parallel_launch_local_memory; } }; template -struct CudaParallelLaunch, - Experimental::CudaLaunchMechanism::LocalMemory> { +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<0, 0>, + Experimental::CudaLaunchMechanism::LocalMemory> { + static std::decay_t)> + get_kernel_func() { + return cuda_parallel_launch_local_memory; + } +}; + +//------------------------------------------------------------------------------ + +template +struct CudaParallelLaunchKernelInvoker< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory> + : CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::LocalMemory> { + using base_t = CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory>; static_assert(sizeof(DriverType) < CudaTraits::KernelArgumentLimit, "Kokkos Error: Requested 
CudaLaunchLocalMemory with a Functor " "larger than 4096 bytes."); - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, const int shmem, - const CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_local_memory, - (prefer_shmem ? cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); + static void invoke_kernel(DriverType const& driver, dim3 const& grid, + dim3 const& block, int shmem, + CudaInternal const* cuda_instance) { + (base_t:: + get_kernel_func())<<m_stream>>>( + driver); + } - // Invoke the driver function on the device - cuda_parallel_launch_local_memory - <<m_stream>>>(driver); +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, + CudaInternal const* cuda_instance, bool prefer_shmem) { + //---------------------------------------- + auto const& graph = Impl::get_cuda_graph_from_kernel(driver); + KOKKOS_EXPECTS(bool(graph)); + auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver); + // Expect node not yet initialized + KOKKOS_EXPECTS(!bool(graph_node)); -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); -#endif + if (!Impl::is_empty_launch(grid, block)) { + Impl::check_shmem_request(cuda_instance, shmem); + Impl::configure_shmem_preference( + 
base_t::get_kernel_func(), prefer_shmem); + + void const* args[] = {&driver}; + + cudaKernelNodeParams params = {}; + + params.blockDim = block; + params.gridDim = grid; + params.sharedMemBytes = shmem; + params.func = (void*)base_t::get_kernel_func(); + params.kernelParams = (void**)args; + params.extra = nullptr; + + CUDA_SAFE_CALL(cudaGraphAddKernelNode( + &graph_node, graph, /* dependencies = */ nullptr, + /* numDependencies = */ 0, ¶ms)); + } else { + // We still need an empty node for the dependency structure + CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); } + KOKKOS_ENSURES(bool(graph_node)) } - - static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. 
- auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, cuda_parallel_launch_local_memory)); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; - } +#endif }; +// end local memory }}}2 +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// {{{2 + template -struct CudaParallelLaunch< +struct CudaParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, Experimental::CudaLaunchMechanism::GlobalMemory> { - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, const int shmem, - CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_global_memory, - (prefer_shmem ? 
cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - - DriverType* driver_ptr = nullptr; - driver_ptr = reinterpret_cast( - cuda_instance->scratch_functor(sizeof(DriverType))); - cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), - cudaMemcpyDefault, cuda_instance->m_stream); - - // Invoke the driver function on the device - cuda_parallel_launch_global_memory - <<m_stream>>>(driver_ptr); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); -#endif - } - } - static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. - auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, - cuda_parallel_launch_global_memory)); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; + static void* get_kernel_func() { + return cuda_parallel_launch_global_memory; } }; template -struct CudaParallelLaunch, - Experimental::CudaLaunchMechanism::GlobalMemory> { - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, const int shmem, - CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if 
(!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_global_memory, - (prefer_shmem ? cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<0, 0>, + Experimental::CudaLaunchMechanism::GlobalMemory> { + static std::decay_t)> + get_kernel_func() { + return cuda_parallel_launch_global_memory; + } +}; - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); +//------------------------------------------------------------------------------ - DriverType* driver_ptr = nullptr; - driver_ptr = reinterpret_cast( - cuda_instance->scratch_functor(sizeof(DriverType))); +template +struct CudaParallelLaunchKernelInvoker< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::GlobalMemory> + : CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::GlobalMemory> { + using base_t = CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::GlobalMemory>; + + static void invoke_kernel(DriverType const& driver, dim3 const& grid, + dim3 const& block, int shmem, + CudaInternal const* cuda_instance) { + DriverType* driver_ptr = reinterpret_cast( + cuda_instance->scratch_functor(sizeof(DriverType))); + + cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault, + cuda_instance->m_stream); + (base_t:: + get_kernel_func())<<m_stream>>>( + driver_ptr); + } + +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, + CudaInternal const* cuda_instance, bool prefer_shmem) { + //---------------------------------------- + auto const& graph = Impl::get_cuda_graph_from_kernel(driver); + KOKKOS_EXPECTS(bool(graph)); + auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver); + // Expect node not yet initialized + 
KOKKOS_EXPECTS(!bool(graph_node)); + + if (!Impl::is_empty_launch(grid, block)) { + Impl::check_shmem_request(cuda_instance, shmem); + Impl::configure_shmem_preference( + base_t::get_kernel_func(), prefer_shmem); + + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + + // Unlike in the non-graph case, we can get away with doing an async copy + // here because the `DriverType` instance is held in the GraphNodeImpl + // which is guaranteed to be alive until the graph instance itself is + // destroyed, where there should be a fence ensuring that the allocation + // associated with this kernel on the device side isn't deleted. cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault, cuda_instance->m_stream); - cuda_parallel_launch_global_memory - <<m_stream>>>(driver_ptr); + void const* args[] = {&driver_ptr}; + + cudaKernelNodeParams params = {}; + + params.blockDim = block; + params.gridDim = grid; + params.sharedMemBytes = shmem; + params.func = (void*)base_t::get_kernel_func(); + params.kernelParams = (void**)args; + params.extra = nullptr; + + CUDA_SAFE_CALL(cudaGraphAddKernelNode( + &graph_node, graph, /* dependencies = */ nullptr, + /* numDependencies = */ 0, ¶ms)); + } else { + // We still need an empty node for the dependency structure + CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); + } + KOKKOS_ENSURES(bool(graph_node)) + } +#endif +}; + +// end Global Memory }}}2 +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// {{{2 + +template +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds, + Experimental::CudaLaunchMechanism::ConstantMemory> { + static std::decay_t)> + get_kernel_func() { + return cuda_parallel_launch_constant_memory; + } +}; + +template +struct CudaParallelLaunchKernelFunc< + DriverType, 
Kokkos::LaunchBounds<0, 0>, + Experimental::CudaLaunchMechanism::ConstantMemory> { + static std::decay_t< + decltype(cuda_parallel_launch_constant_memory)> + get_kernel_func() { + return cuda_parallel_launch_constant_memory; + } +}; + +//------------------------------------------------------------------------------ + +template +struct CudaParallelLaunchKernelInvoker< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::ConstantMemory> + : CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::ConstantMemory> { + using base_t = CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::ConstantMemory>; + static_assert(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage, + "Kokkos Error: Requested CudaLaunchConstantMemory with a " + "Functor larger than 32kB."); + + static void invoke_kernel(DriverType const& driver, dim3 const& grid, + dim3 const& block, int shmem, + CudaInternal const* cuda_instance) { + // Wait until the previous kernel that uses the constant buffer is done + CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable)); + + // Copy functor (synchronously) to staging buffer in pinned host memory + unsigned long* staging = cuda_instance->constantMemHostStaging; + memcpy(staging, &driver, sizeof(DriverType)); + + // Copy functor asynchronously from there to constant memory on the device + cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging, + sizeof(DriverType), 0, cudaMemcpyHostToDevice, + cudaStream_t(cuda_instance->m_stream)); + + // Invoke the driver function on the device + (base_t:: + get_kernel_func())<<m_stream>>>(); + + // Record an event that says when the constant buffer can be reused + CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable, + cudaStream_t(cuda_instance->m_stream))); + } + +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 
const& grid, dim3 const& block, int shmem, + CudaInternal const* cuda_instance, bool prefer_shmem) { + // Just use global memory; coordinating through events to share constant + // memory with the non-graph interface is not really reasonable since + // events don't work with Graphs directly, and this would anyway require + // a much more complicated structure that finds previous nodes in the + // dependency structure of the graph and creates an implicit dependence + // based on the need for constant memory (which we would then have to + // somehow go and prove was not creating a dependency cycle, and I don't + // even know if there's an efficient way to do that, let alone in the + // structure we currenty have). + using global_launch_impl_t = CudaParallelLaunchKernelInvoker< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::GlobalMemory>; + global_launch_impl_t::create_parallel_launch_graph_node( + driver, grid, block, shmem, cuda_instance, prefer_shmem); + } +#endif +}; + +// end Constant Memory }}}2 +//------------------------------------------------------------------------------ + +// end CudaParallelLaunchKernelInvoker }}}1 +//============================================================================== + +//============================================================================== +// {{{1 + +template +struct CudaParallelLaunchImpl; + +template +struct CudaParallelLaunchImpl< + DriverType, Kokkos::LaunchBounds, + LaunchMechanism> + : CudaParallelLaunchKernelInvoker< + DriverType, Kokkos::LaunchBounds, + LaunchMechanism> { + using base_t = CudaParallelLaunchKernelInvoker< + DriverType, Kokkos::LaunchBounds, + LaunchMechanism>; + + inline static void launch_kernel(const DriverType& driver, const dim3& grid, + const dim3& block, int shmem, + const CudaInternal* cuda_instance, + bool prefer_shmem) { + if (!Impl::is_empty_launch(grid, block)) { + // Prevent multiple threads to simultaneously set the cache configuration + // preference and launch 
the same kernel + static std::mutex mutex; + std::lock_guard lock(mutex); + + Impl::check_shmem_request(cuda_instance, shmem); + + // If a desired occupancy is specified, we compute how much shared memory + // to ask for to achieve that occupancy, assuming that the cache + // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic + // shared memory computed is actually smaller than `shmem` we overwrite + // `shmem` and set `prefer_shmem` to `false`. + modify_launch_configuration_if_desired_occupancy_is_specified( + driver.get_policy(), cuda_instance->m_deviceProp, + get_cuda_func_attributes(), block, shmem, prefer_shmem); + + Impl::configure_shmem_preference< + DriverType, Kokkos::LaunchBounds, + decltype(base_t::get_kernel_func())>(base_t::get_kernel_func(), + prefer_shmem); + + KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); + + // Invoke the driver function on the device + base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); + cuda_instance->fence(); #endif } } @@ -630,15 +650,63 @@ struct CudaParallelLaunch, // the code and the result is visible. 
auto wrap_get_attributes = []() -> cudaFuncAttributes { cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, cuda_parallel_launch_global_memory)); + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func())); return attr_tmp; }; static cudaFuncAttributes attr = wrap_get_attributes(); return attr; } }; -//---------------------------------------------------------------------------- + +// end CudaParallelLaunchImpl }}}1 +//============================================================================== + +//============================================================================== +// {{{1 + +template , + Experimental::CudaLaunchMechanism LaunchMechanism = + DeduceCudaLaunchMechanism::launch_mechanism, + bool DoGraph = DriverType::Policy::is_graph_kernel::value +#ifndef KOKKOS_CUDA_ENABLE_GRAPHS + && false +#endif + > +struct CudaParallelLaunch; + +// General launch mechanism +template +struct CudaParallelLaunch + : CudaParallelLaunchImpl { + using base_t = + CudaParallelLaunchImpl; + template + CudaParallelLaunch(Args&&... args) { + base_t::launch_kernel((Args &&) args...); + } +}; + +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS +// Launch mechanism for creating graph nodes +template +struct CudaParallelLaunch + : CudaParallelLaunchImpl { + using base_t = + CudaParallelLaunchImpl; + template + CudaParallelLaunch(Args&&... 
args) { + base_t::create_parallel_launch_graph_node((Args &&) args...); + } +}; +#endif + +// end CudaParallelLaunch }}}1 +//============================================================================== } // namespace Impl } // namespace Kokkos @@ -646,6 +714,5 @@ struct CudaParallelLaunch, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#endif /* defined( __CUDACC__ ) */ #endif /* defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDAEXEC_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp index 07dadb3c16..ff31649544 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp @@ -42,13 +42,10 @@ //@HEADER */ -#include - +#include #ifdef KOKKOS_ENABLE_CUDA - #include #include -#include #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE namespace Kokkos { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp index a4b5d08ccf..7640b8084d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp @@ -81,8 +81,6 @@ void finalize_host_cuda_lock_arrays(); } // namespace Impl } // namespace Kokkos -#if defined(__CUDACC__) - namespace Kokkos { namespace Impl { @@ -173,8 +171,6 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() #endif -#endif /* defined( __CUDACC__ ) */ - #endif /* defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index 5dd644746b..131d180980 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -46,7 +46,7 @@ #define KOKKOS_CUDA_PARALLEL_HPP 
#include -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include #include @@ -99,6 +99,8 @@ class TeamPolicyInternal int m_team_scratch_size[2]; int m_thread_scratch_size[2]; int m_chunk_size; + bool m_tune_team; + bool m_tune_vector; public: //! Execution space of this execution policy @@ -115,6 +117,8 @@ class TeamPolicyInternal m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; m_chunk_size = p.m_chunk_size; m_space = p.m_space; + m_tune_team = p.m_tune_team; + m_tune_vector = p.m_tune_vector; } //---------------------------------------- @@ -130,10 +134,10 @@ class TeamPolicyInternal Kokkos::Impl::cuda_get_max_block_size( space().impl_internal_space_instance(), attr, f, - (size_t)vector_length(), + (size_t)impl_vector_length(), (size_t)team_scratch_size(0) + 2 * sizeof(double), (size_t)thread_scratch_size(0) + sizeof(double)); - return block_size / vector_length(); + return block_size / impl_vector_length(); } template @@ -171,10 +175,10 @@ class TeamPolicyInternal Kokkos::Impl::cuda_get_opt_block_size( space().impl_internal_space_instance(), attr, f, - (size_t)vector_length(), + (size_t)impl_vector_length(), (size_t)team_scratch_size(0) + 2 * sizeof(double), (size_t)thread_scratch_size(0) + sizeof(double)); - return block_size / vector_length(); + return block_size / impl_vector_length(); } template @@ -234,9 +238,18 @@ class TeamPolicyInternal //---------------------------------------- - inline int vector_length() const { return m_vector_length; } + KOKKOS_DEPRECATED inline int vector_length() const { + return impl_vector_length(); + } + inline int impl_vector_length() const { return m_vector_length; } inline int team_size() const { return m_team_size; } inline int league_size() const { return m_league_size; } + inline bool impl_auto_team_size() const { return m_tune_team; } + inline bool impl_auto_vector_length() const { return m_tune_vector; } + inline void impl_set_team_size(size_t team_size) { m_team_size = 
team_size; } + inline void impl_set_vector_length(size_t vector_length) { + m_vector_length = vector_length; + } inline int scratch_size(int level, int team_size_ = -1) const { if (team_size_ < 0) team_size_ = m_team_size; return m_team_scratch_size[level] + @@ -258,18 +271,25 @@ class TeamPolicyInternal m_vector_length(0), m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, - m_chunk_size(32) {} + m_chunk_size(Impl::CudaTraits::WarpSize), + m_tune_team(false), + m_tune_vector(false) {} - /** \brief Specify league size, request team size */ + /** \brief Specify league size, specify team size, specify vector length */ TeamPolicyInternal(const execution_space space_, int league_size_, int team_size_request, int vector_length_request = 1) : m_space(space_), m_league_size(league_size_), m_team_size(team_size_request), - m_vector_length(verify_requested_vector_length(vector_length_request)), + m_vector_length( + (vector_length_request > 0) + ? verify_requested_vector_length(vector_length_request) + : verify_requested_vector_length(1)), m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, - m_chunk_size(32) { + m_chunk_size(Impl::CudaTraits::WarpSize), + m_tune_team(bool(team_size_request <= 0)), + m_tune_vector(bool(vector_length_request <= 0)) { // Make sure league size is permissible if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) Impl::throw_runtime_exception( @@ -277,72 +297,56 @@ class TeamPolicyInternal "space."); // Make sure total block size is permissible - if (m_team_size * m_vector_length > 1024) { + if (m_team_size * m_vector_length > + int(Impl::CudaTraits::MaxHierarchicalParallelism)) { Impl::throw_runtime_exception( std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. 
" "Team size x vector length must be smaller than 1024.")); } } - /** \brief Specify league size, request team size */ + /** \brief Specify league size, request team size, specify vector length */ TeamPolicyInternal(const execution_space space_, int league_size_, const Kokkos::AUTO_t& /* team_size_request */ , int vector_length_request = 1) - : m_space(space_), - m_league_size(league_size_), - m_team_size(-1), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(32) { - // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on Cuda execution " - "space."); - } + : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} + + /** \brief Specify league size, request team size and vector length */ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + ) + : TeamPolicyInternal(space_, league_size_, -1, -1) {} + + /** \brief Specify league size, specify team size, request vector length */ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, const Kokkos::AUTO_t&) + : TeamPolicyInternal(space_, league_size_, team_size_request, -1) {} TeamPolicyInternal(int league_size_, int team_size_request, int vector_length_request = 1) - : m_space(typename traits::execution_space()), - m_league_size(league_size_), - m_team_size(team_size_request), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(32) { - // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for 
TeamPolicy on Cuda execution " - "space."); + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} - // Make sure total block size is permissible - if (m_team_size * m_vector_length > 1024) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. " - "Team size x vector length must be smaller than 1024.")); - } - } - - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */ - , + TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request, int vector_length_request = 1) - : m_space(typename traits::execution_space()), - m_league_size(league_size_), - m_team_size(-1), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(32) { - // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on Cuda execution " - "space."); - } + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) + + {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request, + const Kokkos::AUTO_t& vector_length_request) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(int league_size_, int team_size_request, + const Kokkos::AUTO_t& vector_length_request) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} inline int chunk_size() const { return m_chunk_size; } @@ -394,7 +398,7 @@ class TeamPolicyInternal get_cuda_func_attributes(); const int block_size = 
std::forward(block_size_callable)( space().impl_internal_space_instance(), attr, f, - (size_t)vector_length(), + (size_t)impl_vector_length(), (size_t)team_scratch_size(0) + 2 * sizeof(double), (size_t)thread_scratch_size(0) + sizeof(double) + ((functor_value_traits::StaticValueSize != 0) @@ -406,7 +410,7 @@ class TeamPolicyInternal int p2 = 1; while (p2 <= block_size) p2 *= 2; p2 /= 2; - return p2 / vector_length(); + return p2 / impl_vector_length(); } template @@ -468,6 +472,8 @@ class ParallelFor, Kokkos::Cuda> { public: using functor_type = FunctorType; + Policy const& get_policy() const { return m_policy; } + inline __device__ void operator()(void) const { const Member work_stride = blockDim.y * gridDim.x; const Member work_end = m_policy.end(); @@ -518,7 +524,8 @@ class ParallelFor, Kokkos::Cuda> { template class ParallelFor, Kokkos::Cuda> { public: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using functor_type = FunctorType; private: using RP = Policy; @@ -530,10 +537,11 @@ class ParallelFor, Kokkos::Cuda> { const Policy m_rp; public: + Policy const& get_policy() const { return m_rp; } + inline __device__ void operator()(void) const { - Kokkos::Impl::Refactor::DeviceIterateTile( - m_rp, m_functor) + Kokkos::Impl::DeviceIterateTile(m_rp, m_functor) .exec_range(); } @@ -621,8 +629,7 @@ class ParallelFor, Kokkos::Cuda> { *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), false); } else { - printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); - Kokkos::abort("Aborting"); + Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); } } // end execute @@ -636,7 +643,7 @@ template class ParallelFor, Kokkos::Cuda> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; private: using Member = typename Policy::member_type; @@ -680,6 +687,8 @@ class ParallelFor, } public: + Policy const& get_policy() const { return m_policy; } + __device__ inline void 
operator()(void) const { // Iterate this block through the league int64_t threadid = 0; @@ -749,7 +758,7 @@ class ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) { cudaFuncAttributes attr = CudaParallelLaunch::get_cuda_func_attributes(); @@ -796,10 +805,10 @@ class ParallelFor, if (int(m_team_size) > int(Kokkos::Impl::cuda_get_max_block_size( m_policy.space().impl_internal_space_instance(), attr, - arg_functor, arg_policy.vector_length(), + arg_functor, arg_policy.impl_vector_length(), arg_policy.team_scratch_size(0), arg_policy.thread_scratch_size(0)) / - arg_policy.vector_length())) { + arg_policy.impl_vector_length())) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); } @@ -847,6 +856,7 @@ class ParallelReduce, ReducerType, using functor_type = FunctorType; using size_type = Kokkos::Cuda::size_type; using index_type = typename Policy::index_type; + using reducer_type = ReducerType; // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 @@ -873,6 +883,8 @@ class ParallelReduce, ReducerType, using DummySHMEMReductionType = int; public: + Policy const& get_policy() const { return m_policy; } + // Make the exec_range calls call to Reduce::DeviceIterateTile template __device__ inline @@ -949,36 +961,44 @@ class ParallelReduce, ReducerType, for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { global[i] = shared[i]; } - } else if (cuda_single_inter_block_reduce_scan( - ReducerConditional::select(m_functor, m_reducer), blockIdx.x, - gridDim.x, kokkos_impl_cuda_shared_memory(), - m_scratch_space, m_scratch_flags)) { - // This is the final block with the final result at the final threads' - // location + // return ; + } - size_type* const shared = kokkos_impl_cuda_shared_memory() + - 
(blockDim.y - 1) * word_count.value; - size_type* const global = - m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : (m_unified_space ? m_unified_space : m_scratch_space); + if (m_policy.begin() != m_policy.end()) { + { + if (cuda_single_inter_block_reduce_scan( + ReducerConditional::select(m_functor, m_reducer), blockIdx.x, + gridDim.x, kokkos_impl_cuda_shared_memory(), + m_scratch_space, m_scratch_flags)) { + // This is the final block with the final result at the final threads' + // location - if (threadIdx.y == 0) { - Kokkos::Impl::FunctorFinal::final( - ReducerConditional::select(m_functor, m_reducer), shared); - } + size_type* const shared = + kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : (m_unified_space ? m_unified_space : m_scratch_space); - if (CudaTraits::WarpSize < word_count.value) { - __syncthreads(); - } + if (threadIdx.y == 0) { + Kokkos::Impl::FunctorFinal::final( + ReducerConditional::select(m_functor, m_reducer), shared); + } - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; + if (CudaTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; + i += blockDim.y) { + global[i] = shared[i]; + } + } } } } - /* __device__ inline void run(const DummyShflReductionType&) const { @@ -1055,6 +1075,9 @@ class ParallelReduce, ReducerType, const bool need_device_set = ReduceFunctorHasInit::value || ReduceFunctorHasFinal::value || !m_result_ptr_host_accessible || +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + Policy::is_graph_kernel::value || +#endif !std::is_same::value; if ((nwork > 0) || need_device_set) { const int block_size = local_block_size(m_functor); @@ -1077,6 +1100,7 @@ class ParallelReduce, ReducerType, dim3 grid(std::min(int(block.y), int((nwork + block.y - 1) / block.y)), 1, 1); + // TODO @graph We need 
to effectively insert this in to the graph const int shmem = UseShflReduction ? 0 @@ -1117,6 +1141,7 @@ class ParallelReduce, ReducerType, } } else { if (m_result_ptr) { + // TODO @graph We need to effectively insert this in to the graph ValueInit::init(ReducerConditional::select(m_functor, m_reducer), m_result_ptr); } @@ -1195,6 +1220,7 @@ class ParallelReduce, ReducerType, using reference_type = typename ValueTraits::reference_type; using functor_type = FunctorType; using size_type = Cuda::size_type; + using reducer_type = ReducerType; // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 @@ -1214,16 +1240,16 @@ class ParallelReduce, ReducerType, // Shall we use the shfl based reduction or not (only use it for static sized // types of more than 128bit - enum { - UseShflReduction = ((sizeof(value_type) > 2 * sizeof(double)) && - (ValueTraits::StaticValueSize != 0)) - }; + static constexpr bool UseShflReduction = false; + //((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) // Some crutch to do function overloading private: using DummyShflReductionType = double; using DummySHMEMReductionType = int; public: + Policy const& get_policy() const { return m_policy; } + inline __device__ void exec_range(reference_type update) const { Kokkos::Impl::Reduce::DeviceIterateTile, ReducerType, // Required grid.x <= block.y const dim3 grid(std::min(int(block.y), int(nwork)), 1, 1); + // TODO @graph We need to effectively insert this in to the graph const int shmem = UseShflReduction ? 
0 @@ -1403,7 +1430,7 @@ class ParallelReduce, ReducerType, false); // copy to device and execute if (!m_result_ptr_device_accessible) { - Cuda().fence(); + m_policy.space().fence(); if (m_result_ptr) { if (m_unified_space) { @@ -1421,6 +1448,7 @@ class ParallelReduce, ReducerType, } } else { if (m_result_ptr) { + // TODO @graph We need to effectively insert this in to the graph ValueInit::init(ReducerConditional::select(m_functor, m_reducer), m_result_ptr); } @@ -1464,7 +1492,7 @@ template class ParallelReduce, ReducerType, Kokkos::Cuda> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; private: using Member = typename Policy::member_type; @@ -1491,8 +1519,11 @@ class ParallelReduce, public: using functor_type = FunctorType; using size_type = Cuda::size_type; + using reducer_type = ReducerType; - enum { UseShflReduction = (true && (ValueTraits::StaticValueSize != 0)) }; + enum : bool { + UseShflReduction = (true && (ValueTraits::StaticValueSize != 0)) + }; private: using DummyShflReductionType = double; @@ -1539,6 +1570,8 @@ class ParallelReduce, } public: + Policy const& get_policy() const { return m_policy; } + __device__ inline void operator()() const { int64_t threadid = 0; if (m_scratch_size[1] > 0) { @@ -1631,31 +1664,35 @@ class ParallelReduce, for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { global[i] = shared[i]; } - } else if (cuda_single_inter_block_reduce_scan( - ReducerConditional::select(m_functor, m_reducer), blockIdx.x, - gridDim.x, kokkos_impl_cuda_shared_memory(), - m_scratch_space, m_scratch_flags)) { - // This is the final block with the final result at the final threads' - // location + } - size_type* const shared = kokkos_impl_cuda_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = - m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : (m_unified_space ? 
m_unified_space : m_scratch_space); + if (m_league_size != 0) { + if (cuda_single_inter_block_reduce_scan( + ReducerConditional::select(m_functor, m_reducer), blockIdx.x, + gridDim.x, kokkos_impl_cuda_shared_memory(), + m_scratch_space, m_scratch_flags)) { + // This is the final block with the final result at the final threads' + // location - if (threadIdx.y == 0) { - Kokkos::Impl::FunctorFinal::final( - ReducerConditional::select(m_functor, m_reducer), shared); - } + size_type* const shared = kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : (m_unified_space ? m_unified_space : m_scratch_space); - if (CudaTraits::WarpSize < word_count.value) { - __syncthreads(); - } + if (threadIdx.y == 0) { + Kokkos::Impl::FunctorFinal::final( + ReducerConditional::select(m_functor, m_reducer), shared); + } - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; + if (CudaTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } } } } @@ -1717,6 +1754,9 @@ class ParallelReduce, const bool need_device_set = ReduceFunctorHasInit::value || ReduceFunctorHasFinal::value || !m_result_ptr_host_accessible || +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + Policy::is_graph_kernel::value || +#endif !std::is_same::value; if ((nwork > 0) || need_device_set) { const int block_count = @@ -1770,6 +1810,7 @@ class ParallelReduce, } } else { if (m_result_ptr) { + // TODO @graph We need to effectively insert this in to the graph ValueInit::init(ReducerConditional::select(m_functor, m_reducer), m_result_ptr); } @@ -1800,7 +1841,7 @@ class ParallelReduce, m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + 
m_vector_size(arg_policy.impl_vector_length()) { cudaFuncAttributes attr = CudaParallelLaunch::get_cuda_func_attributes(); @@ -1838,7 +1879,7 @@ class ParallelReduce, // The global parallel_reduce does not support vector_length other than 1 at // the moment - if ((arg_policy.vector_length() > 1) && !UseShflReduction) + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " "greater than 1 is not currently supported for CUDA for dynamic " @@ -1899,7 +1940,7 @@ class ParallelReduce, m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) { cudaFuncAttributes attr = CudaParallelLaunch::get_cuda_func_attributes(); @@ -1936,7 +1977,7 @@ class ParallelReduce, // The global parallel_reduce does not support vector_length other than 1 at // the moment - if ((arg_policy.vector_length() > 1) && !UseShflReduction) + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " "greater than 1 is not currently supported for CUDA for dynamic " @@ -2150,6 +2191,8 @@ class ParallelScan, Kokkos::Cuda> { } public: + Policy const& get_policy() const { return m_policy; } + //---------------------------------------- __device__ inline void operator()(void) const { @@ -2440,6 +2483,8 @@ class ParallelScanWithTotal, } public: + Policy const& get_policy() const { return m_policy; } + //---------------------------------------- __device__ inline void operator()(void) const { @@ -2799,5 +2844,5 @@ struct ParallelReduceFunctorType { } // namespace Kokkos -#endif /* defined( __CUDACC__ ) */ +#endif /* defined(KOKKOS_ENABLE_CUDA) */ #endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */ diff --git 
a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 6989431907..fc9fc3770b 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -46,7 +46,7 @@ #define KOKKOS_CUDA_REDUCESCAN_HPP #include -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include @@ -983,5 +983,5 @@ inline unsigned cuda_single_inter_block_reduce_scan_shmem( //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#endif /* #if defined( __CUDACC__ ) */ +#endif /* #if defined(KOKKOS_ENABLE_CUDA) */ #endif /* KOKKOS_CUDA_REDUCESCAN_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 6ead5197ee..2004edbeac 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -390,7 +390,7 @@ class TaskQueueSpecializationConstrained< ((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32); ((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32); -#if defined(KOKKOS_DEBUG) +#if defined(KOKKOS_ENABLE_DEBUG) KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr"); #endif @@ -799,7 +799,6 @@ namespace Kokkos { * i=0..N-1. * * The range i=0..N-1 is mapped to all threads of the the calling thread team. - * This functionality requires C++11 support. 
*/ template KOKKOS_INLINE_FUNCTION void parallel_for( diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index 1160336519..4b472f5d4f 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -50,7 +50,7 @@ #include /* only compile this file if CUDA is enabled for Kokkos */ -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include #include @@ -290,7 +290,7 @@ class CudaTeamMember { */ template KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const { - return this->template team_scan(value, 0); + return this->template team_scan(value, nullptr); } //---------------------------------------- @@ -935,6 +935,54 @@ KOKKOS_INLINE_FUNCTION //---------------------------------------------------------------------------- +/** \brief Inter-thread parallel exclusive prefix sum. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to each rank in the team (whose global rank is + * less than N) and a scan operation is performed. The last call to closure has + * final == true. 
+ */ +// This is the same code as in HIP and largely the same as in OpenMPTarget +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct& + loop_bounds, + const FunctorType& lambda) { + // Extract value_type from lambda + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, + FunctorType>::value_type; + + const auto start = loop_bounds.start; + const auto end = loop_bounds.end; + auto& member = loop_bounds.member; + const auto team_size = member.team_size(); + const auto team_rank = member.team_rank(); + const auto nchunk = (end - start + team_size - 1) / team_size; + value_type accum = 0; + // each team has to process one or more chunks of the prefix scan + for (iType i = 0; i < nchunk; ++i) { + auto ii = start + i * team_size + team_rank; + // local accumulation for this chunk + value_type local_accum = 0; + // user updates value with prefix value + if (ii < loop_bounds.end) lambda(ii, local_accum, false); + // perform team scan + local_accum = member.team_scan(local_accum); + // add this blocks accum to total accumulation + auto val = accum + local_accum; + // user updates their data with total accumulation + if (ii < loop_bounds.end) lambda(ii, val, true); + // the last value needs to be propogated to next chunk + if (team_rank == team_size - 1) accum = val; + // broadcast last value to rest of the team + member.team_broadcast(accum, team_size - 1); + } +} + +//---------------------------------------------------------------------------- + /** \brief Intra-thread vector parallel exclusive prefix sum. 
* * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) @@ -1089,6 +1137,6 @@ KOKKOS_INLINE_FUNCTION void single( } // namespace Kokkos -#endif /* defined( __CUDACC__ ) */ +#endif /* defined(KOKKOS_ENABLE_CUDA) */ #endif /* #ifndef KOKKOS_CUDA_TEAM_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index b7c81b92f8..05876a9f02 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -77,6 +77,8 @@ class ParallelFor, } public: + Policy const& get_policy() const { return m_policy; } + __device__ inline void operator()() const noexcept { if (0 == (threadIdx.y % 16)) { // Spin until COMPLETED_TOKEN. diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp index f3cf25efef..c0daa274f8 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp @@ -48,7 +48,7 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- #include -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include @@ -97,5 +97,5 @@ __device__ inline void cuda_abort(const char *const message) { } // namespace Kokkos #else void KOKKOS_CORE_SRC_CUDA_ABORT_PREVENT_LINK_ERROR() {} -#endif /* #if defined(__CUDACC__) && defined( KOKKOS_ENABLE_CUDA ) */ +#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */ diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp index fea5a55f64..263ba97d73 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp @@ -45,6 +45,10 @@ #ifndef KOKKOS_HIP_ATOMIC_HPP #define KOKKOS_HIP_ATOMIC_HPP +#include +#include 
+#include + #if defined(KOKKOS_ENABLE_HIP_ATOMICS) namespace Kokkos { // HIP can do: @@ -103,19 +107,16 @@ atomic_exchange(volatile T *const dest, typename std::enable_if::type &val) { - // FIXME_HIP - Kokkos::abort("atomic_exchange not implemented for large types.\n"); T return_val; int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; while (active != done_active) { if (!done) { - // if (Impl::lock_address_hip_space((void*)dest)) - { + if (Impl::lock_address_hip_space((void *)dest)) { return_val = *dest; *dest = val; - // Impl::unlock_address_hip_space((void*)dest); + Impl::unlock_address_hip_space((void *)dest); done = 1; } } @@ -215,19 +216,16 @@ __inline__ __device__ T atomic_compare_exchange( typename std::enable_if::type &val) { - // FIXME_HIP - Kokkos::abort("atomic_compare_exchange not implemented for large types.\n"); T return_val; int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; while (active != done_active) { if (!done) { - // if (Impl::lock_address_hip_space((void*)dest)) - { + if (Impl::lock_address_hip_space((void *)dest)) { return_val = *dest; if (return_val == compare) *dest = val; - // Impl::unlock_address_hip_space((void*)dest); + Impl::unlock_address_hip_space((void *)dest); done = 1; } } @@ -350,19 +348,16 @@ atomic_fetch_add(volatile T *dest, typename std::enable_if::type val) { - // FIXME_HIP - Kokkos::abort("atomic_fetch_add not implemented for large types.\n"); T return_val; int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; while (active != done_active) { if (!done) { - // if(Kokkos::Impl::lock_address_hip_space((void *)dest)) - { + if (Kokkos::Impl::lock_address_hip_space((void *)dest)) { return_val = *dest; *dest = return_val + val; - // Kokkos::Impl::unlock_address_hip_space((void *)dest); + Kokkos::Impl::unlock_address_hip_space((void *)dest); done = 1; } } @@ -513,19 +508,16 @@ atomic_fetch_sub(volatile T *const dest, typename std::enable_if::type &val) 
{ - // FIXME_HIP - Kokkos::abort("atomic_fetch_sub not implemented for large types.\n"); T return_val; int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; while (active != done_active) { if (!done) { - /*if (Impl::lock_address_hip_space((void*)dest)) */ - { + if (Impl::lock_address_hip_space((void *)dest)) { return_val = *dest; *dest = return_val - val; - // Impl::unlock_address_hip_space((void*)dest); + Impl::unlock_address_hip_space((void *)dest); done = 1; } } @@ -569,6 +561,62 @@ __inline__ __device__ unsigned long long int atomic_fetch_and( unsigned long long int const val) { return atomicAnd(const_cast(dest), val); } + +namespace Impl { + +template +__inline__ __device__ void _atomic_store(T *ptr, T val, + memory_order_relaxed_t) { + (void)atomic_exchange(ptr, val); +} + +template +__inline__ __device__ void _atomic_store(T *ptr, T val, + memory_order_seq_cst_t) { + memory_fence(); + atomic_store(ptr, val, memory_order_relaxed); + memory_fence(); +} + +template +__inline__ __device__ void _atomic_store(T *ptr, T val, + memory_order_release_t) { + memory_fence(); + atomic_store(ptr, val, memory_order_relaxed); +} + +template +__inline__ __device__ void _atomic_store(T *ptr, T val) { + atomic_store(ptr, val, memory_order_relaxed); +} + +template +__inline__ __device__ T _atomic_load(T *ptr, memory_order_relaxed_t) { + T dummy{}; + return atomic_compare_exchange(ptr, dummy, dummy); +} + +template +__inline__ __device__ T _atomic_load(T *ptr, memory_order_seq_cst_t) { + memory_fence(); + T rv = atomic_load(ptr, memory_order_relaxed); + memory_fence(); + return rv; +} + +template +__inline__ __device__ T _atomic_load(T *ptr, memory_order_acquire_t) { + T rv = atomic_load(ptr, memory_order_relaxed); + memory_fence(); + return rv; +} + +template +__inline__ __device__ T _atomic_load(T *ptr) { + return atomic_load(ptr, memory_order_relaxed); +} + +} // namespace Impl } // namespace Kokkos #endif diff --git 
a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index fc4716d2a8..89135b6c45 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -55,6 +55,26 @@ namespace Kokkos { namespace Experimental { namespace Impl { + +template +void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { + // FIXME_HIP - currently the "constant" path is unimplemented. + // we should look at whether it's functional, and + // perform some simple scaling studies to see when / + // if the constant launcher outperforms the current + // pass by pointer shared launcher + HIP_SAFE_CALL(hipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, + hip_parallel_launch_local_memory, + blockSize, sharedmem)); +} + +template +void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { + hipOccupancy( + numBlocks, blockSize, sharedmem); +} template struct HIPGetMaxBlockSize; @@ -78,31 +98,26 @@ int hip_internal_get_block_size(const F &condition_check, const int min_blocks_per_sm = LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM; const int max_threads_per_block = LaunchBounds::maxTperB == 0 - ? hip_instance->m_maxThreadsPerBlock + ? 
HIPTraits::MaxThreadsPerBlock : LaunchBounds::maxTperB; - const int regs_per_wavefront = attr.numRegs; + const int regs_per_wavefront = std::max(attr.numRegs, 1); const int regs_per_sm = hip_instance->m_regsPerSM; const int shmem_per_sm = hip_instance->m_shmemPerSM; const int max_shmem_per_block = hip_instance->m_maxShmemPerBlock; const int max_blocks_per_sm = hip_instance->m_maxBlocksPerSM; const int max_threads_per_sm = hip_instance->m_maxThreadsPerSM; -// FIXME_HIP this is broken in 3.5, but should be in 3.6 -#if (HIP_VERSION_MAJOR > 3 || HIP_VERSION_MINOR > 5 || \ - HIP_VERSION_PATCH >= 20226) - int block_size = std::min(attr.maxThreadsPerBlock, max_threads_per_block); -#else int block_size = max_threads_per_block; -#endif KOKKOS_ASSERT(block_size > 0); + const int blocks_per_warp = + (block_size + HIPTraits::WarpSize - 1) / HIPTraits::WarpSize; int functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize::value( f, block_size / vector_length); int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) + functor_shmem + attr.sharedSizeBytes; - int max_blocks_regs = - regs_per_sm / (regs_per_wavefront * (block_size / HIPTraits::WarpSize)); + int max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp); int max_blocks_shmem = (total_shmem < max_shmem_per_block) ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs) @@ -113,7 +128,8 @@ int hip_internal_get_block_size(const F &condition_check, blocks_per_sm = max_threads_per_sm / block_size; threads_per_sm = blocks_per_sm * block_size; } - int opt_block_size = (blocks_per_sm >= min_blocks_per_sm) ? block_size : 0; + int opt_block_size = + (blocks_per_sm >= min_blocks_per_sm) ? 
block_size : min_blocks_per_sm; int opt_threads_per_sm = threads_per_sm; // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i // Achieved: %i %i Opt: %i %i\n",block_size, @@ -126,8 +142,7 @@ int hip_internal_get_block_size(const F &condition_check, f, block_size / vector_length); total_shmem = shmem_block + shmem_thread * (block_size / vector_length) + functor_shmem + attr.sharedSizeBytes; - max_blocks_regs = - regs_per_sm / (regs_per_wavefront * (block_size / HIPTraits::WarpSize)); + max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp); max_blocks_shmem = (total_shmem < max_shmem_per_block) ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs) @@ -163,28 +178,21 @@ int hip_get_max_block_size(const HIPInternal *hip_instance, [](int x) { return x == 0; }, hip_instance, attr, f, vector_length, shmem_block, shmem_thread); } -template -struct HIPGetMaxBlockSize, true> { +template +struct HIPGetMaxBlockSize { static int get_block_size(typename DriverType::functor_type const &f, size_t const vector_length, size_t const shmem_extra_block, size_t const shmem_extra_thread) { -// FIXME_HIP -- remove this once the API change becomes mature -#if !defined(__HIP__) - using blocktype = unsigned int; -#else - using blocktype = int; -#endif - blocktype numBlocks = 0; - int blockSize = 1024; + int numBlocks = 0; + int blockSize = LaunchBounds::maxTperB == 0 ? 
1024 : LaunchBounds::maxTperB; int sharedmem = shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + ::Kokkos::Impl::FunctorTeamShmemSize< typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, hip_parallel_launch_constant_memory, blockSize, - sharedmem); + + hipOccupancy(&numBlocks, blockSize, sharedmem); if (numBlocks > 0) return blockSize; while (blockSize > HIPTraits::WarpSize && numBlocks == 0) { @@ -195,9 +203,7 @@ struct HIPGetMaxBlockSize, true> { typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, hip_parallel_launch_constant_memory, - blockSize, sharedmem); + hipOccupancy(&numBlocks, blockSize, sharedmem); } int blockSizeUpperBound = blockSize * 2; while (blockSize < blockSizeUpperBound && numBlocks > 0) { @@ -208,9 +214,7 @@ struct HIPGetMaxBlockSize, true> { typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, hip_parallel_launch_constant_memory, - blockSize, sharedmem); + hipOccupancy(&numBlocks, blockSize, sharedmem); } return blockSize - HIPTraits::WarpSize; } @@ -255,7 +259,7 @@ struct HIPGetOptBlockSize, true> { int maxOccupancy = 0; int bestBlockSize = 0; - while (blockSize < 1024) { + while (blockSize < HIPTraits::MaxThreadsPerBlock) { blockSize *= 2; // calculate the occupancy with that optBlockSize and check whether its @@ -265,9 +269,7 @@ struct HIPGetOptBlockSize, true> { ::Kokkos::Impl::FunctorTeamShmemSize< typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, hip_parallel_launch_constant_memory, - blockSize, sharedmem); + hipOccupancy(&numBlocks, blockSize, sharedmem); if (maxOccupancy < numBlocks * blockSize) { maxOccupancy = numBlocks * blockSize; bestBlockSize = blockSize; @@ -289,7 +291,7 @@ struct 
HIPGetOptBlockSize, false> { int maxOccupancy = 0; int bestBlockSize = 0; - while (blockSize < 1024) { + while (blockSize < HIPTraits::MaxThreadsPerBlock) { blockSize *= 2; sharedmem = shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + @@ -297,9 +299,7 @@ struct HIPGetOptBlockSize, false> { typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, hip_parallel_launch_local_memory, blockSize, - sharedmem); + hipOccupancy(&numBlocks, blockSize, sharedmem); if (maxOccupancy < numBlocks * blockSize) { maxOccupancy = numBlocks * blockSize; @@ -340,11 +340,8 @@ struct HIPGetOptBlockSize< ::Kokkos::Impl::FunctorTeamShmemSize< typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, - hip_parallel_launch_constant_memory, - blockSize, sharedmem); + hipOccupancy( + &numBlocks, blockSize, sharedmem); if (numBlocks >= static_cast(MinBlocksPerSM) && blockSize <= static_cast(MaxThreadsPerBlock)) { if (maxOccupancy < numBlocks * blockSize) { @@ -384,11 +381,8 @@ struct HIPGetOptBlockSize< typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, - hip_parallel_launch_local_memory, - blockSize, sharedmem); + hipOccupancy( + &numBlocks, blockSize, sharedmem); if (numBlocks >= int(MinBlocksPerSM) && blockSize <= int(MaxThreadsPerBlock)) { if (maxOccupancy < numBlocks * blockSize) { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp index 2abded0e99..b3480bcad0 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp @@ -56,10 +56,10 @@ namespace Kokkos { namespace Impl { void hip_internal_error_throw(hipError_t e, const char* name, - const char* file = NULL, const int line = 0); + const char* file = nullptr, const int line = 0); 
inline void hip_internal_safe_call(hipError_t e, const char* name, - const char* file = NULL, + const char* file = nullptr, const int line = 0) { if (hipSuccess != e) { hip_internal_error_throw(e, name, file, line); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 20af48bf6f..45512038ac 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -114,7 +114,7 @@ void HIPInternal::print_configuration(std::ostream &s) const { << (dev_info.m_hipProp[i].major) << "." << dev_info.m_hipProp[i].minor << ", Total Global Memory: " << ::Kokkos::Impl::human_memory_size(dev_info.m_hipProp[i].totalGlobalMem) - << ", Shared Memory per Wavefront: " + << ", Shared Memory per Block: " << ::Kokkos::Impl::human_memory_size( dev_info.m_hipProp[i].sharedMemPerBlock); if (m_hipDev == i) s << " : Selected"; @@ -140,10 +140,10 @@ HIPInternal::~HIPInternal() { m_maxShmemPerBlock = 0; m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; - m_scratchSpace = 0; - m_scratchFlags = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; m_scratchConcurrentBitset = nullptr; - m_stream = 0; + m_stream = nullptr; } int HIPInternal::verify_is_initialized(const char *const label) const { @@ -183,7 +183,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { const HIPInternalDevices &dev_info = HIPInternalDevices::singleton(); - const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags; + const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags; // Need at least a GPU device const bool ok_id = @@ -195,9 +195,11 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { m_hipDev = hip_device_id; m_deviceProp = hipProp; - hipSetDevice(m_hipDev); + HIP_SAFE_CALL(hipSetDevice(m_hipDev)); - m_stream = stream; + m_stream = stream; + m_team_scratch_current_size = 0; + m_team_scratch_ptr = nullptr; // number of multiprocessors 
m_multiProcCount = hipProp.multiProcessorCount; @@ -216,14 +218,19 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { m_maxBlock = hipProp.maxGridSize[0]; // theoretically, we can get 40 WF's / CU, but only can sustain 32 + // see + // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 m_maxBlocksPerSM = 32; // FIXME_HIP - Nick to implement this upstream - m_regsPerSM = 262144 / 32; - m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; - m_maxShmemPerBlock = hipProp.sharedMemPerBlock; - m_maxThreadsPerSM = m_maxBlocksPerSM * HIPTraits::WarpSize; - m_maxThreadsPerBlock = hipProp.maxThreadsPerBlock; - + // Register count comes from Sec. 2.2. "Data Sharing" of the + // Vega 7nm ISA document (see the diagram) + // https://developer.amd.com/wp-content/resources/Vega_7nm_Shader_ISA.pdf + // VGPRS = 4 (SIMD/CU) * 256 VGPR/SIMD * 64 registers / VGPR = + // 65536 VGPR/CU + m_regsPerSM = 65536; + m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; + m_maxShmemPerBlock = hipProp.sharedMemPerBlock; + m_maxThreadsPerSM = m_maxBlocksPerSM * HIPTraits::WarpSize; //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. 
@@ -277,8 +284,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { } // Init the array for used for arbitrarily sized atomics - // FIXME_HIP uncomment this when global variable works - // if (m_stream == 0) ::Kokkos::Impl::initialize_host_hip_lock_arrays(); + if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays(); } //---------------------------------------------------------------------------- @@ -327,18 +333,35 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags( m_scratchFlags = reinterpret_cast(r->data()); - hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain); + HIP_SAFE_CALL( + hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain)); } return m_scratchFlags; } +void *HIPInternal::resize_team_scratch_space(std::int64_t bytes, + bool force_shrink) { + if (m_team_scratch_current_size == 0) { + m_team_scratch_current_size = bytes; + m_team_scratch_ptr = Kokkos::kokkos_malloc( + "HIPSpace::ScratchMemory", m_team_scratch_current_size); + } + if ((bytes > m_team_scratch_current_size) || + ((bytes < m_team_scratch_current_size) && (force_shrink))) { + m_team_scratch_current_size = bytes; + m_team_scratch_ptr = Kokkos::kokkos_realloc( + m_team_scratch_ptr, m_team_scratch_current_size); + } + return m_team_scratch_ptr; +} + //---------------------------------------------------------------------------- void HIPInternal::finalize() { - HIP().fence(); + this->fence(); was_finalized = true; - if (0 != m_scratchSpace || 0 != m_scratchFlags) { + if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { using RecordHIP = Kokkos::Impl::SharedAllocationRecord; @@ -346,19 +369,24 @@ void HIPInternal::finalize() { RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace)); RecordHIP::decrement(RecordHIP::get_record(m_scratchConcurrentBitset)); - m_hipDev = -1; - m_hipArch = -1; - m_multiProcCount = 0; - m_maxWarpCount = 0; - m_maxBlock = 0; - m_maxSharedWords = 0; - m_maxShmemPerBlock = 
0; - m_scratchSpaceCount = 0; - m_scratchFlagsCount = 0; - m_scratchSpace = 0; - m_scratchFlags = 0; - m_scratchConcurrentBitset = nullptr; - m_stream = 0; + if (m_team_scratch_current_size > 0) + Kokkos::kokkos_free(m_team_scratch_ptr); + + m_hipDev = -1; + m_hipArch = -1; + m_multiProcCount = 0; + m_maxWarpCount = 0; + m_maxBlock = 0; + m_maxSharedWords = 0; + m_maxShmemPerBlock = 0; + m_scratchSpaceCount = 0; + m_scratchFlagsCount = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; + m_scratchConcurrentBitset = nullptr; + m_stream = nullptr; + m_team_scratch_current_size = 0; + m_team_scratch_ptr = nullptr; } } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 9688aef350..07ec8625e6 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -57,6 +57,8 @@ struct HIPTraits { static int constexpr WarpSize = 64; static int constexpr WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static int constexpr WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ + static int constexpr MaxThreadsPerBlock = + 1024; // FIXME_HIP -- assumed constant for now static int constexpr ConstantMemoryUsage = 0x008000; /* 32k bytes */ static int constexpr ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */ @@ -92,9 +94,11 @@ class HIPInternal { int m_shmemPerSM; int m_maxShmemPerBlock; int m_maxThreadsPerSM; - int m_maxThreadsPerBlock; + + // Scratch Spaces for Reductions size_type m_scratchSpaceCount; size_type m_scratchFlagsCount; + size_type *m_scratchSpace; size_type *m_scratchFlags; uint32_t *m_scratchConcurrentBitset = nullptr; @@ -103,6 +107,10 @@ class HIPInternal { hipStream_t m_stream; + // Team Scratch Level 1 Space + mutable int64_t m_team_scratch_current_size; + mutable void *m_team_scratch_ptr; + bool was_finalized = false; static HIPInternal &singleton(); @@ -113,7 +121,7 @@ class HIPInternal { return m_hipDev >= 0; } // 0 != 
m_scratchSpace && 0 != m_scratchFlags ; } - void initialize(int hip_device_id, hipStream_t stream = 0); + void initialize(int hip_device_id, hipStream_t stream = nullptr); void finalize(); void print_configuration(std::ostream &) const; @@ -132,15 +140,21 @@ class HIPInternal { m_shmemPerSM(0), m_maxShmemPerBlock(0), m_maxThreadsPerSM(0), - m_maxThreadsPerBlock(0), m_scratchSpaceCount(0), m_scratchFlagsCount(0), - m_scratchSpace(0), - m_scratchFlags(0), - m_stream(0) {} + m_scratchSpace(nullptr), + m_scratchFlags(nullptr), + m_stream(nullptr), + m_team_scratch_current_size(0), + m_team_scratch_ptr(nullptr) {} + // Resizing of reduction related scratch spaces size_type *scratch_space(const size_type size); size_type *scratch_flags(const size_type size); + + // Resizing of team level 1 scratch + void *resize_team_scratch_space(std::int64_t bytes, + bool force_shrink = false); }; } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 34ccd899c3..3e972c7346 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -64,7 +64,7 @@ namespace Kokkos { namespace Experimental { template inline __device__ T *kokkos_impl_hip_shared_memory() { - extern __shared__ HIPSpace::size_type sh[]; + HIP_DYNAMIC_SHARED(HIPSpace::size_type, sh); return (T *)sh; } } // namespace Experimental @@ -74,18 +74,17 @@ namespace Kokkos { namespace Experimental { namespace Impl { -void *hip_resize_scratch_space(std::int64_t bytes, bool force_shrink = false); - template __global__ static void hip_parallel_launch_constant_memory() { -// cannot use global constants in HCC -#ifdef __HCC__ - __device__ __constant__ unsigned long kokkos_impl_hip_constant_memory_buffer - [Kokkos::Experimental::Impl::HIPTraits::ConstantMemoryUsage / - sizeof(unsigned long)]; -#endif + const DriverType &driver = *(reinterpret_cast( + 
kokkos_impl_hip_constant_memory_buffer)); + driver(); +} - const DriverType *const driver = (reinterpret_cast( +template +__global__ __launch_bounds__( + maxTperB, minBperSM) static void hip_parallel_launch_constant_memory() { + const DriverType &driver = *(reinterpret_cast( kokkos_impl_hip_constant_memory_buffer)); driver->operator()(); @@ -147,6 +146,8 @@ struct HIPParallelLaunch< "HIPParallelLaunch FAILED: shared memory request is too large"); } + KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); + // FIXME_HIP -- there is currently an error copying (some) structs // by value to the device in HIP-Clang / VDI // As a workaround, we can malloc the DriverType and explictly copy over. @@ -169,12 +170,15 @@ struct HIPParallelLaunch< } static hipFuncAttributes get_hip_func_attributes() { - hipFuncAttributes attr; - hipFuncGetAttributes( - &attr, - reinterpret_cast( - hip_parallel_launch_local_memory)); + static hipFuncAttributes attr = []() { + hipFuncAttributes attr; + HIP_SAFE_CALL(hipFuncGetAttributes( + &attr, + reinterpret_cast( + hip_parallel_launch_local_memory))); + return attr; + }(); return attr; } }; @@ -192,6 +196,8 @@ struct HIPParallelLaunch, "HIPParallelLaunch FAILED: shared memory request is too large")); } + KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); + // Invoke the driver function on the device // FIXME_HIP -- see note about struct copy by value above @@ -212,10 +218,13 @@ struct HIPParallelLaunch, } static hipFuncAttributes get_hip_func_attributes() { - hipFuncAttributes attr; - hipFuncGetAttributes( - &attr, reinterpret_cast( - &hip_parallel_launch_local_memory)); + static hipFuncAttributes attr = []() { + hipFuncAttributes attr; + HIP_SAFE_CALL(hipFuncGetAttributes( + &attr, reinterpret_cast( + hip_parallel_launch_local_memory))); + return attr; + }(); return attr; } }; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp index 3426caafda..4f5271b6f6 100644 --- 
a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp @@ -52,26 +52,28 @@ #include +namespace Kokkos { + #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE +namespace Impl { __device__ __constant__ HIPLockArrays g_device_hip_lock_arrays = {nullptr, nullptr, 0}; +} #endif -namespace Kokkos { - namespace { __global__ void init_lock_array_kernel_atomic() { unsigned i = blockIdx.x * blockDim.x + threadIdx.x; if (i < KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1) { - g_device_hip_lock_arrays.atomic[i] = 0; + Kokkos::Impl::g_device_hip_lock_arrays.atomic[i] = 0; } } __global__ void init_lock_array_kernel_threadid(int N) { unsigned i = blockIdx.x * blockDim.x + threadIdx.x; if (i < static_cast(N)) { - g_device_hip_lock_arrays.scratch[i] = 0; + Kokkos::Impl::g_device_hip_lock_arrays.scratch[i] = 0; } } @@ -94,17 +96,17 @@ void initialize_host_hip_lock_arrays() { KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE(); init_lock_array_kernel_atomic<<< - (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256, 0, 0>>>(); + (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256, 0, nullptr>>>(); init_lock_array_kernel_threadid<<< - (::Kokkos::Experimental::HIP::concurrency() + 255) / 256, 256, 0, 0>>>( - ::Kokkos::Experimental::HIP::concurrency()); + (::Kokkos::Experimental::HIP::concurrency() + 255) / 256, 256, 0, + nullptr>>>(::Kokkos::Experimental::HIP::concurrency()); } void finalize_host_hip_lock_arrays() { if (g_host_hip_lock_arrays.atomic == nullptr) return; - hipFree(g_host_hip_lock_arrays.atomic); + HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic)); g_host_hip_lock_arrays.atomic = nullptr; - hipFree(g_host_hip_lock_arrays.scratch); + HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch)); g_host_hip_lock_arrays.scratch = nullptr; g_host_hip_lock_arrays.n = 0; #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp index fb6728ea14..f34f85f43b 
100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp @@ -51,7 +51,8 @@ #include -// FIXME_HIP We cannot use global variables defined in a namespace +namespace Kokkos { +namespace Impl { struct HIPLockArrays { std::int32_t* atomic; @@ -63,9 +64,6 @@ struct HIPLockArrays { /// of these arrays. extern HIPLockArrays g_host_hip_lock_arrays; -namespace Kokkos { -namespace Impl { - /// \brief After this call, the g_host_hip_lock_arrays variable has /// valid, initialized arrays. /// @@ -78,9 +76,6 @@ void initialize_host_hip_lock_arrays(); /// This call is idempotent. void finalize_host_hip_lock_arrays(); -} // namespace Impl -} // namespace Kokkos - #if defined(__HIPCC__) /// \brief This global variable in HIP space is what kernels use @@ -108,9 +103,6 @@ __device__ #define KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK 0x1FFFF -namespace Kokkos { -namespace Impl { - /// \brief Acquire a lock for the address /// /// This function tries to acquire the lock for the hash value derived @@ -152,14 +144,15 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } /* Dan Ibanez: it is critical that this code be a macro, so that it will capture the right address for g_device_hip_lock_arrays! putting this in an inline function will NOT do the right thing! 
*/ -#define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ - { \ - if (::Kokkos::Impl::lock_array_copied == 0) { \ - HIP_SAFE_CALL(hipMemcpyToSymbol(HIP_SYMBOL(g_device_hip_lock_arrays), \ - &g_host_hip_lock_arrays, \ - sizeof(HIPLockArrays))); \ - } \ - lock_array_copied = 1; \ +#define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ + { \ + if (::Kokkos::Impl::lock_array_copied == 0) { \ + HIP_SAFE_CALL(hipMemcpyToSymbol( \ + HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays), \ + &::Kokkos::Impl::g_host_hip_lock_arrays, \ + sizeof(::Kokkos::Impl::HIPLockArrays))); \ + } \ + ::Kokkos::Impl::lock_array_copied = 1; \ } #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index c3acc0622d..6b831ff7a3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -118,9 +118,9 @@ class ParallelFor, dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], m_policy.m_tile[2], m_policy.m_tile[3]); dim3 const grid( - std::min(static_cast(m_policy.m_tile_end[0] * - m_policy.m_tile_end[1]), - static_cast(maxblocks)), + std::min(static_cast(m_policy.m_tile_end[0] * + m_policy.m_tile_end[1]), + static_cast(maxblocks)), std::min((m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / block.y, maxblocks), @@ -168,8 +168,7 @@ class ParallelFor, *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else { - printf("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); - Kokkos::abort("Aborting"); + Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); } } // end execute @@ -227,17 +226,6 @@ class ParallelReduce, ReducerType, using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile< Policy::rank, Policy, FunctorType, WorkTag, reference_type>; - // Shall we use the shfl based reduction or not (only use it for static sized - 
// types of more than 128bit - enum { - UseShflReduction = ((sizeof(value_type) > 2 * sizeof(double)) && - (ValueTraits::StaticValueSize != 0)) - }; - // Some crutch to do function overloading - private: - using DummyShflReductionType = double; - using DummySHMEMReductionType = int; - public: inline __device__ void exec_range(reference_type update) const { DeviceIteratePattern(m_policy, m_functor, update).exec_range(); @@ -299,7 +287,8 @@ class ParallelReduce, ReducerType, // Determine block size constrained by shared memory: // This is copy/paste from Kokkos_HIP_Parallel_Range inline unsigned local_block_size(const FunctorType& f) { - unsigned n = Experimental::Impl::HIPTraits::WarpSize * 8; + unsigned int n = + ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< false, FunctorType, WorkTag>(f, n); while ( @@ -343,13 +332,13 @@ class ParallelReduce, ReducerType, // REQUIRED ( 1 , N , 1 ) const dim3 block(1, block_size, 1); // Required grid.x <= block.y - const dim3 grid(std::min(int(block.y), int(nwork)), 1, 1); + const dim3 grid(std::min(static_cast(block.y), + static_cast(nwork)), + 1, 1); const int shmem = - UseShflReduction - ? 
0 - : ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< - false, FunctorType, WorkTag>(m_functor, block.y); + ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< + false, FunctorType, WorkTag>(m_functor, block.y); Kokkos::Experimental::Impl::HIPParallelLaunch( @@ -358,7 +347,7 @@ class ParallelReduce, ReducerType, false); // copy to device and execute if (!m_result_ptr_device_accessible) { - Experimental::HIP().fence(); + m_policy.space().fence(); if (m_result_ptr) { const int size = ValueTraits::value_size( @@ -379,7 +368,7 @@ class ParallelReduce, ReducerType, ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ViewType& arg_result, typename std::enable_if::value, - void*>::type = NULL) + void*>::type = nullptr) : m_functor(arg_functor), m_policy(arg_policy), m_reducer(InvalidType()), @@ -387,8 +376,8 @@ class ParallelReduce, ReducerType, m_result_ptr_device_accessible( MemorySpaceAccess::accessible), - m_scratch_space(0), - m_scratch_flags(0) {} + m_scratch_space(nullptr), + m_scratch_flags(nullptr) {} ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ReducerType& reducer) @@ -400,8 +389,8 @@ class ParallelReduce, ReducerType, MemorySpaceAccess::accessible), - m_scratch_space(0), - m_scratch_flags(0) {} + m_scratch_space(nullptr), + m_scratch_flags(nullptr) {} }; } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 6e75e1857f..5607f1c91a 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -108,7 +108,11 @@ class ParallelFor, inline void execute() const { const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); - const int block_size = 256; // FIXME_HIP Choose block_size better + const int block_size = + LaunchBounds::maxTperB + ? 
LaunchBounds::maxTperB + : ::Kokkos::Experimental::Impl::HIPTraits:: + MaxThreadsPerBlock; // FIXME_HIP Choose block_size better const dim3 block(1, block_size, 1); const dim3 grid( typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); @@ -321,8 +325,8 @@ class ParallelReduce, ReducerType, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { - // FIXME_HIP I don't know where 8 comes from - unsigned int n = ::Kokkos::Experimental::Impl::HIPTraits::WarpSize * 8; + unsigned int n = + ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; int shmem_size = hip_single_inter_block_reduce_scan_shmem( f, n); @@ -406,7 +410,7 @@ class ParallelReduce, ReducerType, ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ViewType& arg_result, typename std::enable_if::value, - void*>::type = NULL) + void*>::type = nullptr) : m_functor(arg_functor), m_policy(arg_policy), m_reducer(InvalidType()), diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 56b07f6710..5da83d289e 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -77,6 +77,8 @@ class TeamPolicyInternal int m_team_scratch_size[2]; int m_thread_scratch_size[2]; int m_chunk_size; + bool m_tune_team_size; + bool m_tune_vector_length; public: using execution_space = Kokkos::Experimental::HIP; @@ -92,6 +94,8 @@ class TeamPolicyInternal m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; m_chunk_size = p.m_chunk_size; m_space = p.m_space; + m_tune_team_size = p.m_tune_team_size; + m_tune_vector_length = p.m_tune_vector_length; } template @@ -104,10 +108,10 @@ class TeamPolicyInternal int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size< FunctorType, typename traits::launch_bounds>( space().impl_internal_space_instance(), attr, f, - 
static_cast(vector_length()), + static_cast(impl_vector_length()), static_cast(team_scratch_size(0)) + 2 * sizeof(double), static_cast(thread_scratch_size(0)) + sizeof(double)); - return block_size / vector_length(); + return block_size / impl_vector_length(); } template @@ -144,10 +148,10 @@ class TeamPolicyInternal int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size< FunctorType, typename traits::launch_bounds>( space().impl_internal_space_instance(), attr, f, - static_cast(vector_length()), + static_cast(impl_vector_length()), static_cast(team_scratch_size(0)) + 2 * sizeof(double), static_cast(thread_scratch_size(0)) + sizeof(double)); - return block_size / vector_length(); + return block_size / impl_vector_length(); } template @@ -173,7 +177,8 @@ class TeamPolicyInternal ReducerType>; return internal_team_size_recommended(f); } - + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline bool impl_auto_team_size() const { return m_tune_team_size; } static int vector_length_max() { return ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; } @@ -203,8 +208,10 @@ class TeamPolicyInternal level == 0 ? 
1024 * 40 : // FIXME_HIP arbitrarily setting this to 48kB 20 * 1024 * 1024); // FIXME_HIP arbitrarily setting this to 20MB } - - int vector_length() const { return m_vector_length; } + inline void impl_set_vector_length(size_t size) { m_vector_length = size; } + inline void impl_set_team_size(size_t size) { m_team_size = size; } + int impl_vector_length() const { return m_vector_length; } + KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); } int team_size() const { return m_team_size; } @@ -231,7 +238,9 @@ class TeamPolicyInternal m_vector_length(0), m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, - m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize) {} + m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize), + m_tune_team_size(false), + m_tune_vector_length(false) {} /** \brief Specify league size, request team size */ TeamPolicyInternal(const execution_space space_, int league_size_, @@ -239,11 +248,16 @@ class TeamPolicyInternal : m_space(space_), m_league_size(league_size_), m_team_size(team_size_request), - m_vector_length(verify_requested_vector_length(vector_length_request)), + m_vector_length( + (vector_length_request > 0) + ? 
verify_requested_vector_length(vector_length_request) + : (verify_requested_vector_length(1))), m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, - m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize) { - // Make sure league size is permissable + m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize), + m_tune_team_size(bool(team_size_request <= 0)), + m_tune_vector_length(bool(vector_length_request <= 0)) { + // Make sure league size is permissible if (league_size_ >= static_cast( ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count())) @@ -251,7 +265,7 @@ class TeamPolicyInternal "Requested too large league_size for TeamPolicy on HIP execution " "space."); - // Make sure total block size is permissable + // Make sure total block size is permissible if (m_team_size * m_vector_length > 1024) { Impl::throw_runtime_exception( std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " @@ -263,65 +277,56 @@ class TeamPolicyInternal TeamPolicyInternal(const execution_space space_, int league_size_, const Kokkos::AUTO_t& /* team_size_request */, int vector_length_request = 1) - : m_space(space_), - m_league_size(league_size_), - m_team_size(-1), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize) { - // Make sure league size is permissable - if (league_size_ >= - static_cast( - ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - } + : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} + // FLAG + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + ) + : 
TeamPolicyInternal(space_, league_size_, team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(space_, league_size_, -1, -1) + + {} TeamPolicyInternal(int league_size_, int team_size_request, int vector_length_request = 1) - : m_space(typename traits::execution_space()), - m_league_size(league_size_), - m_team_size(team_size_request), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize) { - // Make sure league size is permissable - if (league_size_ >= - static_cast( - ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - - // Make sure total block size is permissable - if (m_team_size * m_vector_length > 1024) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< HIP > the team size is too large. 
" - "Team size x vector length must be smaller than 1024.")); - } - } + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& /* team_size_request */, int vector_length_request = 1) - : m_space(typename traits::execution_space()), - m_league_size(league_size_), - m_team_size(-1), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize) { - // Make sure league size is permissable - if (league_size_ >= - static_cast( - ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - } + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + vector_length_request) {} + + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(int league_size_, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + -1) {} int chunk_size() const { return m_chunk_size; } @@ -370,7 +375,7 @@ class TeamPolicyInternal typename traits::launch_bounds>::get_hip_func_attributes(); const int block_size = std::forward(block_size_callable)( space().impl_internal_space_instance(), attr, f, - static_cast(vector_length()), + static_cast(impl_vector_length()), static_cast(team_scratch_size(0)) + 2 * sizeof(double), 
static_cast(thread_scratch_size(0)) + sizeof(double) + ((functor_value_traits::StaticValueSize != 0) @@ -382,7 +387,7 @@ class TeamPolicyInternal int p2 = 1; while (p2 <= block_size) p2 *= 2; p2 /= 2; - return p2 / vector_length(); + return p2 / impl_vector_length(); } template @@ -400,12 +405,6 @@ class TeamPolicyInternal } }; -struct HIPLockArrays { - std::int32_t* atomic = nullptr; - std::int32_t* scratch = nullptr; - std::int32_t n = 0; -}; - template class ParallelFor, Kokkos::Experimental::HIP> { @@ -434,7 +433,6 @@ class ParallelFor, int m_shmem_size; void* m_scratch_ptr[2]; int m_scratch_size[2]; - mutable HIPLockArrays hip_lock_arrays; template __device__ inline @@ -458,15 +456,19 @@ class ParallelFor, __shared__ int64_t base_thread_id; if (threadIdx.x == 0 && threadIdx.y == 0) { threadid = (blockIdx.x * blockDim.z + threadIdx.z) % - (hip_lock_arrays.n / (blockDim.x * blockDim.y)); + (Kokkos::Impl::g_device_hip_lock_arrays.n / + (blockDim.x * blockDim.y)); threadid *= blockDim.x * blockDim.y; int done = 0; while (!done) { - done = (0 == atomicCAS(&hip_lock_arrays.scratch[threadid], 0, 1)); + done = (0 == + atomicCAS( + &Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid], + 0, 1)); if (!done) { threadid += blockDim.x * blockDim.y; if (int64_t(threadid + blockDim.x * blockDim.y) >= - int64_t(hip_lock_arrays.n)) + int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n)) threadid = 0; } } @@ -490,22 +492,11 @@ class ParallelFor, if (m_scratch_size[1] > 0) { __syncthreads(); if (threadIdx.x == 0 && threadIdx.y == 0) - hip_lock_arrays.scratch[threadid] = 0; + Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid] = 0; } } inline void execute() const { - HIP_SAFE_CALL(hipMalloc( - &hip_lock_arrays.atomic, - sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1))); - HIP_SAFE_CALL(hipMalloc( - &hip_lock_arrays.scratch, - sizeof(std::int32_t) * (::Kokkos::Experimental::HIP::concurrency()))); - HIP_SAFE_CALL(hipMemset( - hip_lock_arrays.scratch, 0, - 
sizeof(std::int32_t) * (::Kokkos::Experimental::HIP::concurrency()))); - hip_lock_arrays.n = ::Kokkos::Experimental::HIP::concurrency(); - int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; dim3 const grid(static_cast(m_league_size), 1, 1); dim3 const block(static_cast(m_vector_size), @@ -515,16 +506,6 @@ class ParallelFor, *this, grid, block, shmem_size_total, m_policy.space().impl_internal_space_instance(), true); // copy to device and execute - - if (hip_lock_arrays.atomic) { - HIP_SAFE_CALL(hipFree(hip_lock_arrays.atomic)); - hip_lock_arrays.atomic = nullptr; - } - if (hip_lock_arrays.scratch) { - HIP_SAFE_CALL(hipFree(hip_lock_arrays.scratch)); - hip_lock_arrays.scratch = nullptr; - } - hip_lock_arrays.n = 0; } ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -532,7 +513,7 @@ class ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) { hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelFor, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -558,11 +539,13 @@ class ParallelFor, m_scratch_ptr[1] = m_team_size <= 0 ? 
nullptr - : ::Kokkos::Experimental::Impl::hip_resize_scratch_space( - static_cast(m_scratch_size[1]) * - static_cast( - ::Kokkos::Experimental::HIP::concurrency() / - (m_team_size * m_vector_size))); + : m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + static_cast( + ::Kokkos::Experimental::HIP::concurrency() / + (m_team_size * m_vector_size))); int const shmem_size_total = m_shmem_begin + m_shmem_size; if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < @@ -580,10 +563,10 @@ class ParallelFor, ::Kokkos::Experimental::Impl::hip_get_max_block_size( m_policy.space().impl_internal_space_instance(), attr, - arg_functor, arg_policy.vector_length(), + arg_functor, arg_policy.impl_vector_length(), arg_policy.team_scratch_size(0), arg_policy.thread_scratch_size(0)) / - arg_policy.vector_length())) { + arg_policy.impl_vector_length())) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); } @@ -630,8 +613,8 @@ class ParallelReduce, static int constexpr UseShflReduction = (value_traits::StaticValueSize != 0); private: - using DummyShflReductionType = double; - using DummySHMEMReductionType = int; + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; // Algorithmic constraints: blockDim.y is a power of two AND // blockDim.y == blockDim.z == 1 shared memory utilization: @@ -672,60 +655,8 @@ class ParallelReduce, m_functor(TagType(), member, update); } - public: - __device__ inline void operator()() const { - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - __shared__ int64_t base_thread_id; - // FIXME_HIP This uses g_device_hip_lock_arrays which is not working - if (threadIdx.x == 0 && threadIdx.y == 0) { - Impl::hip_abort("Error should not be here (not implemented yet)\n"); - threadid = (blockIdx.x * blockDim.z + threadIdx.z) % - (g_device_hip_lock_arrays.n / (blockDim.x * blockDim.y)); - threadid *= 
blockDim.x * blockDim.y; - int done = 0; - while (!done) { - done = (0 == - atomicCAS(&g_device_hip_lock_arrays.scratch[threadid], 0, 1)); - if (!done) { - threadid += blockDim.x * blockDim.y; - if (static_cast(threadid + blockDim.x * blockDim.y) >= - static_cast(g_device_hip_lock_arrays.n)) - threadid = 0; - } - } - base_thread_id = threadid; - } - __syncthreads(); - threadid = base_thread_id; - } - - run(Kokkos::Impl::if_c::select(1, 1.0), - threadid); - if (m_scratch_size[1] > 0) { - __syncthreads(); - if (threadIdx.x == 0 && threadIdx.y == 0) { - Impl::hip_abort("Error should not be here (not implemented yet)\n"); - g_device_hip_lock_arrays.scratch[threadid] = 0; - } - } - } - - __device__ inline void run(DummySHMEMReductionType const&, - int const& threadid) const { - integral_nonzero_constant const - word_count(value_traits::value_size( - reducer_conditional::select(m_functor, m_reducer)) / - sizeof(size_type)); - - reference_type value = value_init::init( - reducer_conditional::select(m_functor, m_reducer), - Kokkos::Experimental::kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); - - // Iterate this block through the league + __device__ inline void iterate_through_league(int const threadid, + reference_type value) const { int const int_league_size = static_cast(m_league_size); for (int league_rank = blockIdx.x; league_rank < int_league_size; league_rank += gridDim.x) { @@ -741,6 +672,63 @@ class ParallelReduce, m_scratch_size[1], league_rank, m_league_size), value); } + } + + public: + __device__ inline void operator()() const { + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + __shared__ int64_t base_thread_id; + if (threadIdx.x == 0 && threadIdx.y == 0) { + threadid = (blockIdx.x * blockDim.z + threadIdx.z) % + (Kokkos::Impl::g_device_hip_lock_arrays.n / + (blockDim.x * blockDim.y)); + threadid *= blockDim.x * blockDim.y; + int done = 0; + while (!done) { + done = (0 == + atomicCAS( + 
&Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid], + 0, 1)); + if (!done) { + threadid += blockDim.x * blockDim.y; + if (static_cast(threadid + blockDim.x * blockDim.y) >= + static_cast(Kokkos::Impl::g_device_hip_lock_arrays.n)) + threadid = 0; + } + } + base_thread_id = threadid; + } + __syncthreads(); + threadid = base_thread_id; + } + + using ReductionTag = std::conditional_t; + run(ReductionTag{}, threadid); + + if (m_scratch_size[1] > 0) { + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid] = 0; + } + } + } + + __device__ inline void run(SHMEMReductionTag, int const threadid) const { + integral_nonzero_constant const + word_count(value_traits::value_size( + reducer_conditional::select(m_functor, m_reducer)) / + sizeof(size_type)); + + reference_type value = value_init::init( + reducer_conditional::select(m_functor, m_reducer), + Kokkos::Experimental::kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value); + + // Iterate this block through the league + iterate_through_league(threadid, value); // Reduce with final value at blockDim.y - 1 location. 
bool do_final_reduce = (m_league_size == 0); @@ -777,28 +765,12 @@ class ParallelReduce, } } - __device__ inline void run(DummyShflReductionType const&, - int const& threadid) const { - // FIXME_HIP implementation close to the function above + __device__ inline void run(ShflReductionTag, int const threadid) const { value_type value; value_init::init(reducer_conditional::select(m_functor, m_reducer), &value); // Iterate this block through the league - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team( - member_type( - Kokkos::Experimental::kokkos_impl_hip_shared_memory() + - m_team_begin, - m_shmem_begin, m_shmem_size, - reinterpret_cast( - reinterpret_cast(m_scratch_ptr[1]) + - static_cast(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size), - value); - } + iterate_through_league(threadid, value); pointer_type const result = m_result_ptr_device_accessible @@ -807,7 +779,7 @@ class ParallelReduce, value_type init; value_init::init(reducer_conditional::select(m_functor, m_reducer), &init); - if (int_league_size == 0) { + if (m_league_size == 0) { Kokkos::Impl::FunctorFinal::final( reducer_conditional::select(m_functor, m_reducer), reinterpret_cast(&value)); @@ -897,15 +869,15 @@ class ParallelReduce, m_result_ptr_host_accessible( MemorySpaceAccess::accessible), - m_scratch_space(0), - m_scratch_flags(0), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), m_team_begin(0), m_shmem_begin(0), m_shmem_size(0), m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) { hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelReduce, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -918,17 +890,6 @@ 
class ParallelReduce, m_policy.thread_scratch_size(0)) / m_vector_size; - // We can't early exit here because the result place might not be accessible - // or the functor/reducer init not callable on the host. But I am not sure - // all the other code below is kosher with zero work length ... - // - // Return Init value if the number of worksets is zero - // if (m_league_size * m_team_size == 0) { - // value_init::init(reducer_conditional::select(m_functor, m_reducer), - // arg_result.data()); - // return; - //} - m_team_begin = UseShflReduction ? 0 @@ -942,16 +903,19 @@ class ParallelReduce, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); m_scratch_ptr[1] = - m_team_size <= 0 ? nullptr - : Kokkos::Experimental::Impl::hip_resize_scratch_space( - static_cast(m_scratch_size[1]) * - (static_cast( - Kokkos::Experimental::HIP::concurrency() / - (m_team_size * m_vector_size)))); + m_team_size <= 0 + ? nullptr + : m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + (static_cast( + Kokkos::Experimental::HIP::concurrency() / + (m_team_size * m_vector_size)))); // The global parallel_reduce does not support vector_length other than 1 at // the moment - if ((arg_policy.vector_length() > 1) && !UseShflReduction) + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " "greater than 1 is not currently supported for HIP for dynamic " @@ -1004,15 +968,15 @@ class ParallelReduce, MemorySpaceAccess::accessible), - m_scratch_space(0), - m_scratch_flags(0), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), m_team_begin(0), m_shmem_begin(0), m_shmem_size(0), m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) 
{ hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelReduce, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -1025,17 +989,6 @@ class ParallelReduce, m_policy.thread_scratch_size(0)) / m_vector_size; - // We can't early exit here because the result place might not be accessible - // or the functor/reducer init not callable on the host. But I am not sure - // all the other code below is kosher with zero work length ... - // - // Return Init value if the number of worksets is zero - // if (arg_policy.league_size() == 0) { - // value_init::init(reducer_conditional::select(m_functor, m_reducer), - // m_result_ptr); - // return; - //} - m_team_begin = UseShflReduction ? 0 @@ -1049,16 +1002,19 @@ class ParallelReduce, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); m_scratch_ptr[1] = - m_team_size <= 0 ? nullptr - : Kokkos::Experimental::Impl::hip_resize_scratch_space( - static_cast(m_scratch_size[1]) * - static_cast( - Kokkos::Experimental::HIP::concurrency() / - (m_team_size * m_vector_size))); + m_team_size <= 0 + ? 
nullptr + : m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + static_cast( + Kokkos::Experimental::HIP::concurrency() / + (m_team_size * m_vector_size))); // The global parallel_reduce does not support vector_length other than 1 at // the moment - if ((arg_policy.vector_length() > 1) && !UseShflReduction) + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " "greater than 1 is not currently supported for HIP for dynamic " diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp index cdf9cac30d..fe7c34bb80 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -81,11 +81,6 @@ __device__ inline void hip_intra_warp_shuffle_reduction( join(result, tmp); } shift *= 2; - // Not sure why there is a race condition here but we need to wait for the - // join operation to be finished to perform the next shuffle. 
Note that the - // problem was also found in the CUDA backend with CUDA clang - // (https://github.com/kokkos/kokkos/issues/941) - __syncthreads(); } // Broadcast the result to all the threads in the warp @@ -204,7 +199,6 @@ __device__ inline bool hip_inter_block_shuffle_reduction( value_type tmp = Kokkos::Experimental::shfl_down(value, i, warp_size); if (id + i < gridDim.x) join(value, tmp); } - __syncthreads(); } } } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index a97fb2f7cc..00cef28f82 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -64,8 +64,8 @@ namespace Impl { namespace { hipStream_t get_deep_copy_stream() { - static hipStream_t s = 0; - if (s == 0) { + static hipStream_t s = nullptr; + if (s == nullptr) { HIP_SAFE_CALL(hipStreamCreate(&s)); } return s; @@ -161,7 +161,7 @@ DeepCopy 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, ptr, - reported_size); + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; @@ -233,13 +238,19 @@ void* HIPHostPinnedSpace::allocate(const size_t arg_alloc_size) const { void* HIPHostPinnedSpace::allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void* HIPHostPinnedSpace::impl_allocate( + const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { void* ptr = nullptr; auto const error_code = hipHostMalloc(&ptr, arg_alloc_size); if (error_code != hipSuccess) { - hipGetLastError(); // This is the only way to clear the last error, which - // we should do here since we're turning it into an - // exception here + // This is the only way to clear the last error, which we should do here + // 
since we're turning it into an exception here + (void)hipGetLastError(); throw HIPRawMemoryAllocationFailure( arg_alloc_size, error_code, RawMemoryAllocationFailure::AllocationMechanism::HIPHostMalloc); @@ -247,9 +258,7 @@ void* HIPHostPinnedSpace::allocate(const char* arg_label, if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, ptr, - reported_size); + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; @@ -261,12 +270,17 @@ void HIPSpace::deallocate(void* const arg_alloc_ptr, void HIPSpace::deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void HIPSpace::impl_deallocate( + const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr, - reported_size); + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); } HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); } @@ -280,12 +294,17 @@ void HIPHostPinnedSpace::deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void HIPHostPinnedSpace::impl_deallocate( + const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr, - reported_size); + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); } HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr)); } @@ -299,7 +318,7 @@ void HIPHostPinnedSpace::deallocate(const char* arg_label, namespace Kokkos { namespace Impl { -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG SharedAllocationRecord SharedAllocationRecord::s_root_record; @@ -375,7 +394,7 @@ SharedAllocationRecord:: // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function : SharedAllocationRecord( -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -407,7 +426,7 @@ SharedAllocationRecord:: // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function : SharedAllocationRecord( -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -445,7 +464,7 @@ void* SharedAllocationRecord:: 
void SharedAllocationRecord::deallocate_tracked(void* const arg_alloc_ptr) { - if (arg_alloc_ptr != 0) { + if (arg_alloc_ptr != nullptr) { SharedAllocationRecord* const r = get_record(arg_alloc_ptr); RecordBase::decrement(r); @@ -521,7 +540,7 @@ SharedAllocationRecord::get_record( Header head; Header const* const head_hip = - alloc_ptr ? Header::get_header(alloc_ptr) : (Header*)0; + alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; if (alloc_ptr) { Kokkos::Impl::DeepCopy( @@ -529,7 +548,7 @@ SharedAllocationRecord::get_record( } RecordHIP* const record = - alloc_ptr ? static_cast(head.m_record) : (RecordHIP*)0; + alloc_ptr ? static_cast(head.m_record) : nullptr; if (!alloc_ptr || record->m_alloc_ptr != head_hip) { Kokkos::Impl::throw_runtime_exception(std::string( @@ -561,9 +580,9 @@ SharedAllocationRecord:: - print_records(std::ostream& s, const Kokkos::Experimental::HIPSpace& space, + print_records(std::ostream& s, const Kokkos::Experimental::HIPSpace&, bool detail) { -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG SharedAllocationRecord* r = &s_root_record; char buffer[256]; @@ -598,7 +617,7 @@ void SharedAllocationRecord:: reinterpret_cast(r->m_alloc_ptr), r->m_alloc_size, r->m_count, reinterpret_cast(r->m_dealloc), head.m_label); - std::cout << buffer; + s << buffer; r = r->m_next; } while (r != &s_root_record); } else { @@ -622,51 +641,28 @@ void SharedAllocationRecord:: } else { snprintf(buffer, 256, "HIP [ 0 + 0 ]\n"); } - std::cout << buffer; + s << buffer; r = r->m_next; } while (r != &s_root_record); } #else (void)s; - (void)space; (void)detail; throw_runtime_exception( "Kokkos::Impl::SharedAllocationRecord::print_records" - " only works with KOKKOS_DEBUG enabled"); + " only works with KOKKOS_ENABLE_DEBUG enabled"); #endif } -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -void* hip_resize_scratch_space(size_t bytes, bool force_shrink) { 
- static void* ptr = NULL; - static size_t current_size = 0; - if (current_size == 0) { - current_size = bytes; - ptr = Kokkos::kokkos_malloc( - "HIPSpace::ScratchMemory", current_size); - } - if (bytes > current_size) { - current_size = bytes; - ptr = Kokkos::kokkos_realloc(ptr, - current_size); - } - if ((bytes < current_size) && (force_shrink)) { - current_size = bytes; - Kokkos::kokkos_free(ptr); - ptr = Kokkos::kokkos_malloc( - "HIPSpace::ScratchMemory", current_size); - } - return ptr; -} - } // namespace Impl } // namespace Kokkos /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ namespace Kokkos { +namespace Impl { +int get_gpu(const InitArguments& args); +} namespace Experimental { int HIP::concurrency() { @@ -760,4 +756,57 @@ hipDeviceProp_t const& HIP::hip_device_prop() { const char* HIP::name() { return "HIP"; } } // namespace Experimental + +namespace Impl { + +int g_hip_space_factory_initialized = + initialize_space_factory("150_HIP"); + +void HIPSpaceInitializer::initialize(const InitArguments& args) { + int use_gpu = Impl::get_gpu(args); + + if (std::is_same::value || + 0 < use_gpu) { + if (use_gpu > -1) { + Kokkos::Experimental::HIP::impl_initialize( + Kokkos::Experimental::HIP::SelectDevice(use_gpu)); + } else { + Kokkos::Experimental::HIP::impl_initialize(); + } + } +} + +void HIPSpaceInitializer::finalize(const bool all_spaces) { + if (std::is_same::value || + all_spaces) { + if (Kokkos::Experimental::HIP::impl_is_initialized()) + Kokkos::Experimental::HIP::impl_finalize(); + } +} + +void HIPSpaceInitializer::fence() { + Kokkos::Experimental::HIP::impl_static_fence(); +} + +void HIPSpaceInitializer::print_configuration(std::ostream& msg, + const bool detail) { + msg << "Devices:" << std::endl; + msg << " KOKKOS_ENABLE_HIP: "; + msg << "yes" << std::endl; + + msg << "HIP Options:" << std::endl; + msg << " 
KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE: "; +#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + + msg << "\nRuntime Configuration:" << std::endl; + Experimental::HIP::print_configuration(msg, detail); +} + +} // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index 577c392a0a..7571510c31 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -270,7 +270,7 @@ class HIPTeamMember { */ template KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const { - return this->template team_scan(value, 0); + return this->template team_scan(value, nullptr); } //---------------------------------------- @@ -755,6 +755,52 @@ KOKKOS_INLINE_FUNCTION #endif } +/** \brief Inter-thread parallel exclusive prefix sum. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to each rank in the team (whose global rank is + * less than N) and a scan operation is performed. The last call to closure has + * final == true. 
+ */ +// This is the same code as in CUDA and largely the same as in OpenMPTarget +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct& + loop_bounds, + const FunctorType& lambda) { + // Extract value_type from lambda + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, + FunctorType>::value_type; + + const auto start = loop_bounds.start; + const auto end = loop_bounds.end; + auto& member = loop_bounds.member; + const auto team_size = member.team_size(); + const auto team_rank = member.team_rank(); + const auto nchunk = (end - start + team_size - 1) / team_size; + value_type accum = 0; + // each team has to process one or more chunks of the prefix scan + for (iType i = 0; i < nchunk; ++i) { + auto ii = start + i * team_size + team_rank; + // local accumulation for this chunk + value_type local_accum = 0; + // user updates value with prefix value + if (ii < loop_bounds.end) lambda(ii, local_accum, false); + // perform team scan + local_accum = member.team_scan(local_accum); + // add this blocks accum to total accumulation + auto val = accum + local_accum; + // user updates their data with total accumulation + if (ii < loop_bounds.end) lambda(ii, val, true); + // the last value needs to be propogated to next chunk + if (team_rank == team_size - 1) accum = val; + // broadcast last value to rest of the team + member.team_broadcast(accum, team_size - 1); + } +} + template KOKKOS_INLINE_FUNCTION void parallel_for( const Impl::TeamVectorRangeBoundariesStruct& diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp index 045892bb99..c5ca89a9fd 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp @@ -128,7 +128,13 @@ struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(T& val, int 
lane, int width) const noexcept { - return __shfl(val, lane, width); + // FIXME_HIP Not sure why there is a race condition here. Note that the + // problem was also found in the CUDA backend with CUDA clang + // (https://github.com/kokkos/kokkos/issues/941) but it seems more limited + // in CUDA clang. + auto return_val = __shfl(val, lane, width); + __threadfence(); + return return_val; } }; @@ -141,7 +147,13 @@ struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(T& val, int lane, int width) const noexcept { - return __shfl_up(val, lane, width); + // FIXME_HIP Not sure why there is a race condition here. Note that the + // problem was also found in the CUDA backend with CUDA clang + // (https://github.com/kokkos/kokkos/issues/941) but it seems more limited + // in CUDA clang. + auto return_val = __shfl_up(val, lane, width); + __threadfence(); + return return_val; } }; @@ -155,7 +167,13 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(T& val, int lane, int width) const noexcept { - return __shfl_down(val, lane, width); + // FIXME_HIP Not sure why there is a race condition here. Note that the + // problem was also found in the CUDA backend with CUDA clang + // (https://github.com/kokkos/kokkos/issues/941) but it seems more limited + // in CUDA clang. 
+ auto return_val = __shfl_down(val, lane, width); + __threadfence(); + return return_val; } }; diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp index c7512ff35b..910d5e52e6 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp @@ -42,7 +42,7 @@ //@HEADER */ -#include +#include #ifdef KOKKOS_ENABLE_HPX #include @@ -79,7 +79,7 @@ void HPX::impl_initialize(int thread_count) { if (rt == nullptr) { std::vector config = { "hpx.os_threads=" + std::to_string(thread_count), -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG "--hpx:attach-debugger=exception", #endif }; @@ -110,7 +110,7 @@ void HPX::impl_initialize() { hpx::runtime *rt = hpx::get_runtime_ptr(); if (rt == nullptr) { std::vector config = { -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG "--hpx:attach-debugger=exception", #endif }; @@ -153,6 +153,56 @@ void HPX::impl_finalize() { } } // namespace Experimental + +namespace Impl { + +int g_hpx_space_factory_initialized = + initialize_space_factory("060_HPX"); + +void HPXSpaceInitializer::initialize(const InitArguments &args) { + const int num_threads = args.num_threads; + + if (std::is_same::value || + std::is_same::value) { + if (num_threads > 0) { + Kokkos::Experimental::HPX::impl_initialize(num_threads); + } else { + Kokkos::Experimental::HPX::impl_initialize(); + } + // std::cout << "Kokkos::initialize() fyi: HPX enabled and initialized" << + // std::endl ; + } else { + // std::cout << "Kokkos::initialize() fyi: HPX enabled but not initialized" + // << std::endl ; + } +} + +void HPXSpaceInitializer::finalize(const bool all_spaces) { + if (std::is_same::value || + std::is_same::value || + all_spaces) { + if (Kokkos::Experimental::HPX::impl_is_initialized()) + Kokkos::Experimental::HPX::impl_finalize(); + } +} + +void HPXSpaceInitializer::fence() { Kokkos::Experimental::HPX().fence(); } + +void HPXSpaceInitializer::print_configuration(std::ostream &msg, + const bool detail) { 
+ msg << "HPX Execution Space:" << std::endl; + msg << " KOKKOS_ENABLE_HPX: "; + msg << "yes" << std::endl; + + msg << "\nHPX Runtime Configuration:" << std::endl; + Kokkos::Experimental::HPX::print_configuration(msg, detail); +} + +} // namespace Impl } // namespace Kokkos #else diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index d3ec64368f..140376425c 100644 --- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -52,19 +52,11 @@ #include #include #include +#include -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) -#include -#include -#endif - -#if defined(__HCC__) && defined(KOKKOS_ENABLE_ROCM) -//#include -#include -#endif - -#if defined(__HIPCC__) && defined(KOKKOS_ENABLE_HIP) -#include +#if defined(KOKKOS_ENABLE_CUDA) || \ + (defined(__HIPCC__) && defined(KOKKOS_ENABLE_HIP)) +#include #endif namespace Kokkos { @@ -83,8 +75,7 @@ enum class Iterate template struct default_outer_direction { using type = Iterate; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_ROCM) || \ - defined(KOKKOS_ENABLE_HIP) +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) static constexpr Iterate value = Iterate::Left; #else static constexpr Iterate value = Iterate::Right; @@ -94,8 +85,7 @@ struct default_outer_direction { template struct default_inner_direction { using type = Iterate; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_ROCM) || \ - defined(KOKKOS_ENABLE_HIP) +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) static constexpr Iterate value = Iterate::Left; #else static constexpr Iterate value = Iterate::Right; @@ -118,6 +108,79 @@ struct Rank { static constexpr Iterate inner_direction = InnerDir; }; +namespace Impl { +// NOTE the comparison below is encapsulated to silent warnings about pointless +// comparison of unsigned integer with zero +template +constexpr std::enable_if_t::value, bool> 
+is_less_than_value_initialized_variable(T) { + return false; +} + +template +constexpr std::enable_if_t::value, bool> +is_less_than_value_initialized_variable(T arg) { + return arg < T{}; +} + +// Checked narrowing conversion that calls abort if the cast changes the value +template +constexpr To checked_narrow_cast(From arg) { + constexpr const bool is_different_signedness = + (std::is_signed::value != std::is_signed::value); + auto const ret = static_cast(arg); + if (static_cast(ret) != arg || + (is_different_signedness && + is_less_than_value_initialized_variable(arg) != + is_less_than_value_initialized_variable(ret))) { + Kokkos::abort("unsafe narrowing conversion"); + } + return ret; +} +// NOTE prefer C array U[M] to std::initalizer_list so that the number of +// elements can be deduced (https://stackoverflow.com/q/40241370) +// NOTE for some unfortunate reason the policy bounds are stored as signed +// integer arrays (point_type which is Kokkos::Array) so we +// specify the index type (actual policy index_type from the traits) and check +// ahead of time that narrowing conversions will be safe. +template +constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { + using T = typename Array::value_type; + Array a{}; + constexpr std::size_t N = a.size(); + static_assert(M <= N, ""); + auto* ptr = a.data(); + // NOTE equivalent to + // std::transform(std::begin(init), std::end(init), a.data(), + // [](U x) { return static_cast(x); }); + // except that std::transform is not constexpr. + for (auto x : init) { + *ptr++ = checked_narrow_cast(x); + (void)checked_narrow_cast(x); // see note above + } + return a; +} + +// NOTE Making a copy even when std::is_same>::value +// is true to reduce code complexity. You may change this if you have a good +// reason to. Intentionally not enabling std::array at this time but this may +// change too. 
+template +constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( + Kokkos::Array const& other) { + using T = typename NVCC_WONT_LET_ME_CALL_YOU_Array::value_type; + NVCC_WONT_LET_ME_CALL_YOU_Array a{}; + constexpr std::size_t N = a.size(); + static_assert(M <= N, ""); + for (std::size_t i = 0; i < M; ++i) { + a[i] = checked_narrow_cast(other[i]); + (void)checked_narrow_cast(other[i]); // see note above + } + return a; +} +} // namespace Impl + // multi-dimensional iteration pattern template struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { @@ -148,7 +211,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { enum { rank = static_cast(iteration_pattern::rank) }; using index_type = typename traits::index_type; - using array_index_type = long; + using array_index_type = std::int64_t; using point_type = Kokkos::Array; // was index_type using tile_type = Kokkos::Array; // If point_type or tile_type is not templated on a signed integral type (if @@ -162,12 +225,12 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { // as template parameter to the MDRangePolicy or static_cast the individual // values - point_type m_lower; - point_type m_upper; - tile_type m_tile; - point_type m_tile_end; - index_type m_num_tiles; - index_type m_prod_tile_dims; + point_type m_lower = {}; + point_type m_upper = {}; + tile_type m_tile = {}; + point_type m_tile_end = {}; + index_type m_num_tiles = 1; + index_type m_prod_tile_dims = 1; /* // NDE enum impl definition alternative - replace static constexpr int ? 
@@ -203,49 +266,89 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const { return m_space; } - template - MDRangePolicy(std::initializer_list const& lower, - std::initializer_list const& upper, - std::initializer_list const& tile = {}) - : m_space() { - init(lower, upper, tile); + + MDRangePolicy() = default; + + template ::value && + std::is_integral::value && + std::is_integral::value>> + MDRangePolicy(const LT (&lower)[LN], const UT (&upper)[UN], + const TT (&tile)[TN] = {}) + : MDRangePolicy( + Impl::to_array_potentially_narrowing( + lower), + Impl::to_array_potentially_narrowing( + upper), + Impl::to_array_potentially_narrowing( + tile)) { + static_assert( + LN == rank && UN == rank && TN <= rank, + "MDRangePolicy: Constructor initializer lists have wrong size"); } - template + template ::value && + std::is_integral::value && + std::is_integral::value>> MDRangePolicy(const typename traits::execution_space& work_space, - std::initializer_list const& lower, - std::initializer_list const& upper, - std::initializer_list const& tile = {}) - : m_space(work_space) { - init(lower, upper, tile); + const LT (&lower)[LN], const UT (&upper)[UN], + const TT (&tile)[TN] = {}) + : MDRangePolicy( + work_space, + Impl::to_array_potentially_narrowing( + lower), + Impl::to_array_potentially_narrowing( + upper), + Impl::to_array_potentially_narrowing( + tile)) { + static_assert( + LN == rank && UN == rank && TN <= rank, + "MDRangePolicy: Constructor initializer lists have wrong size"); } + // NOTE: Keeping these two constructor despite the templated constructors + // from Kokkos arrays for backwards compability to allow construction from + // double-braced initializer lists. 
MDRangePolicy(point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{}) - : m_space(), - m_lower(lower), - m_upper(upper), - m_tile(tile), - m_num_tiles(1), - m_prod_tile_dims(1) { - init(); - } + : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} MDRangePolicy(const typename traits::execution_space& work_space, point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{}) - : m_space(work_space), - m_lower(lower), - m_upper(upper), - m_tile(tile), - m_num_tiles(1), - m_prod_tile_dims(1) { + : m_space(work_space), m_lower(lower), m_upper(upper), m_tile(tile) { init(); } + template ::value>> + MDRangePolicy(Kokkos::Array const& lower, + Kokkos::Array const& upper, + Kokkos::Array const& tile = Kokkos::Array{}) + : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} + + template ::value>> + MDRangePolicy(const typename traits::execution_space& work_space, + Kokkos::Array const& lower, + Kokkos::Array const& upper, + Kokkos::Array const& tile = Kokkos::Array{}) + : MDRangePolicy( + work_space, + Impl::to_array_potentially_narrowing( + lower), + Impl::to_array_potentially_narrowing( + upper), + Impl::to_array_potentially_narrowing( + tile)) {} + template MDRangePolicy(const MDRangePolicy p) - : m_space(p.m_space), + : traits(p), // base class may contain data such as desired occupancy + m_space(p.m_space), m_lower(p.m_lower), m_upper(p.m_upper), m_tile(p.m_tile), @@ -260,165 +363,6 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { #if defined(KOKKOS_ENABLE_CUDA) && !std::is_same::value #endif -#if defined(KOKKOS_ENABLE_ROCM) - && !std::is_same::value -#endif -#if defined(KOKKOS_ENABLE_HIP) - && !std::is_same::value -#endif - ) { - index_type span; - for (int i = 0; i < rank; ++i) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - 
m_tile[i] = 2; - } else { - m_tile[i] = (span == 0 ? 1 : span); - } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - } -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - else // Cuda - { - index_type span; - int increment = 1; - int rank_start = 0; - int rank_end = rank; - if ((int)inner_direction == (int)Right) { - increment = -1; - rank_start = rank - 1; - rank_end = -1; - } - bool is_cuda_exec_space = -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same::value; -#else - false; -#endif - for (int i = rank_start; i != rank_end; i += increment) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for cuda and HIP - // may be rank dependent - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - if (m_prod_tile_dims < 256) { - m_tile[i] = (is_cuda_exec_space) ? 2 : 4; - } else { - m_tile[i] = 1; - } - } else { - m_tile[i] = 16; - } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - if (m_prod_tile_dims > - 1024) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 - // max per dim (Kepler), but product num_threads < 1024 - if (is_cuda_exec_space) { - printf(" Tile dimensions exceed Cuda limits\n"); - Kokkos::abort( - " Cuda ExecSpace Error: MDRange tile dims exceed maximum number " - "of " - "threads per block - choose smaller tile dims"); - } else { - printf(" Tile dimensions exceed HIP limits\n"); - Kokkos::abort( - "HIP ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); - } - } - } -#endif -#if defined(KOKKOS_ENABLE_ROCM) - else // ROCm - { - index_type span; - int increment = 1; - int rank_start = 0; - int rank_end = rank; - if ((int)inner_direction == (int)Right) { - increment = 
-1; - rank_start = rank - 1; - rank_end = -1; - } - for (int i = rank_start; i != rank_end; i += increment) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for rocm - // may be rank dependent - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - if (m_prod_tile_dims < 256) { - m_tile[i] = 4; - } else { - m_tile[i] = 1; - } - } else { - m_tile[i] = 16; - } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - if (m_prod_tile_dims > 1024) { // but product num_threads < 1024 - printf(" Tile dimensions exceed ROCm limits\n"); - Kokkos::abort( - " ROCm ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); - // Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: - // MDRange tile dims exceed maximum number of threads per block - choose - // smaller tile dims"); - } - } -#endif - } - - template - void init(std::initializer_list const& lower, - std::initializer_list const& upper, - std::initializer_list const& tile = {}) { - if (static_cast(m_lower.size()) != rank || - static_cast(m_upper.size()) != rank) - Kokkos::abort( - "MDRangePolicy: Constructor initializer lists have wrong size"); - - for (auto i = 0; i < rank; ++i) { - m_lower[i] = static_cast(lower.begin()[i]); - m_upper[i] = static_cast(upper.begin()[i]); - if (static_cast(tile.size()) == rank) - m_tile[i] = static_cast(tile.begin()[i]); - else - m_tile[i] = 0; - } - - m_num_tiles = 1; - m_prod_tile_dims = 1; - - // Host - if (true -#if defined(KOKKOS_ENABLE_CUDA) - && !std::is_same::value -#endif -#if defined(KOKKOS_ENABLE_ROCM) - && !std::is_same::value -#endif #if defined(KOKKOS_ENABLE_HIP) && !std::is_same::value @@ -453,15 +397,21 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { rank_start = rank - 1; rank_end 
= -1; } + bool is_cuda_exec_space = +#if defined(KOKKOS_ENABLE_CUDA) + std::is_same::value; +#else + false; +#endif for (int i = rank_start; i != rank_end; i += increment) { span = m_upper[i] - m_lower[i]; if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for cuda + // TODO: determine what is a good default tile size for Cuda and HIP // may be rank dependent if (((int)inner_direction == (int)Right && (i < rank - 1)) || ((int)inner_direction == (int)Left && (i > 0))) { if (m_prod_tile_dims < 256) { - m_tile[i] = 2; + m_tile[i] = (is_cuda_exec_space) ? 2 : 4; } else { m_tile[i] = 1; } @@ -477,63 +427,17 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { if (m_prod_tile_dims > 1024) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 // max per dim (Kepler), but product num_threads < 1024 -#if defined(KOKKOS_ENABLE_CUDA) - printf(" Tile dimensions exceed Cuda limits\n"); - Kokkos::abort( - " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); -#else - printf(" Tile dimensions exceed HIP limits\n"); - Kokkos::abort( - " HIP ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); -#endif - } - } -#endif -#if defined(KOKKOS_ENABLE_ROCM) - else // ROCm - { - index_type span; - int increment = 1; - int rank_start = 0; - int rank_end = rank; - if ((int)inner_direction == (int)Right) { - increment = -1; - rank_start = rank - 1; - rank_end = -1; - } - for (int i = rank_start; i != rank_end; i += increment) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for cuda - // may be rank dependent - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - if (m_prod_tile_dims < 256) { - m_tile[i] = 2; - } else { - m_tile[i] = 1; - } - } else { - m_tile[i] = 16; - } + if (is_cuda_exec_space) { + 
printf(" Tile dimensions exceed Cuda limits\n"); + Kokkos::abort( + "Cuda ExecSpace Error: MDRange tile dims exceed maximum number " + "of threads per block - choose smaller tile dims"); + } else { + printf(" Tile dimensions exceed HIP limits\n"); + Kokkos::abort( + "HIP ExecSpace Error: MDRange tile dims exceed maximum number of " + "threads per block - choose smaller tile dims"); } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - if (m_prod_tile_dims > - 1024) { // Match ROCm restriction for ParallelReduce; 1024,1024,1024 - // max per dim , but product num_threads < 1024 - printf(" Tile dimensions exceed ROCm limits\n"); - Kokkos::abort( - " ROCm ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); - // Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: - // MDRange tile dims exceed maximum number of threads per block - choose - // smaller tile dims"); } } #endif @@ -550,28 +454,5 @@ using Kokkos::MDRangePolicy; using Kokkos::Rank; } // namespace Experimental } // namespace Kokkos -// ------------------------------------------------------------------ // - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -template -struct PolicyPropertyAdaptor, - MDRangePolicy> { - using policy_in_t = MDRangePolicy; - using policy_out_t = - MDRangePolicy>; -}; - -} // namespace Impl -} // namespace Experimental -} // namespace Kokkos #endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP diff --git a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp index d4632596c8..8e226a078d 100644 --- a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -85,23 +85,23 @@ namespace Impl { template struct MemorySpaceAccess { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; + enum : bool { 
assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; }; template struct MemorySpaceAccess { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; }; template <> struct MemorySpaceAccess { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; }; template diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp index 89f84ae7ce..8cd60fa6ba 100644 --- a/lib/kokkos/core/src/Kokkos_Atomic.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp @@ -73,22 +73,25 @@ #include //---------------------------------------------------------------------------- + +// Need to fix this for pure clang on windows #if defined(_WIN32) #define KOKKOS_ENABLE_WINDOWS_ATOMICS + #if defined(KOKKOS_ENABLE_CUDA) #define KOKKOS_ENABLE_CUDA_ATOMICS +#if defined(KOKKOS_COMPILER_CLANG) +#define KOKKOS_ENABLE_GNU_ATOMICS #endif -#else +#endif + +#else // _WIN32 #if defined(KOKKOS_ENABLE_CUDA) // Compiling NVIDIA device code, must use Cuda atomics: #define KOKKOS_ENABLE_CUDA_ATOMICS -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU) - -#define KOKKOS_ENABLE_ROCM_ATOMICS - #elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU) || \ defined(KOKKOS_IMPL_ENABLE_OVERLOAD_HOST_DEVICE) @@ -111,7 +114,7 @@ #define KOKKOS_ENABLE_SERIAL_ATOMICS #elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ - (defined(KOKKOS_COMPILER_NVCC)) + (defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_IBM)) #define KOKKOS_ENABLE_GNU_ATOMICS @@ -176,21 +179,11 @@ inline const char* atomic_query_version() { // Implements Strongly-typed analogs of C++ standard memory orders #include "impl/Kokkos_Atomic_Memory_Order.hpp" -#if 
defined(KOKKOS_ENABLE_ROCM) -namespace Kokkos { -namespace Impl { -extern KOKKOS_INLINE_FUNCTION bool lock_address_rocm_space(void* ptr); - -extern KOKKOS_INLINE_FUNCTION void unlock_address_rocm_space(void* ptr); -} // namespace Impl -} // namespace Kokkos -#include -#endif #if defined(KOKKOS_ENABLE_HIP) #include #endif -#ifdef _WIN32 +#if defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) #include "impl/Kokkos_Atomic_Windows.hpp" #endif //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 5303b85beb..fb2925a066 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -49,6 +49,10 @@ #include #include +#ifdef KOKKOS_ENABLE_SYCL +#include +#endif + namespace Kokkos { /// \class complex @@ -692,27 +696,60 @@ KOKKOS_INLINE_FUNCTION RealType real(const complex& x) noexcept { //! Absolute value (magnitude) of a complex number. template KOKKOS_INLINE_FUNCTION RealType abs(const complex& x) { - return std::hypot(x.real(), x.imag()); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::hypot; +#else + using std::hypot; +#endif + return hypot(x.real(), x.imag()); } //! Power of a complex number template KOKKOS_INLINE_FUNCTION Kokkos::complex pow(const complex& x, const RealType& e) { - RealType r = abs(x); - RealType phi = std::atan(x.imag() / x.real()); - return std::pow(r, e) * - Kokkos::complex(std::cos(phi * e), std::sin(phi * e)); + RealType r = abs(x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::atan; + using cl::sycl::cos; + using cl::sycl::pow; + using cl::sycl::sin; +#else + using std::atan; + using std::cos; + using std::pow; + using std::sin; +#endif + RealType phi = atan(x.imag() / x.real()); + return pow(r, e) * Kokkos::complex(cos(phi * e), sin(phi * e)); } -//! Square root of a complex number. +//! Square root of a complex number. 
This is intended to match the stdc++ +//! implementation, which returns sqrt(z*z) = z; where z is complex number. template KOKKOS_INLINE_FUNCTION Kokkos::complex sqrt( const complex& x) { - RealType r = abs(x); - RealType phi = std::atan(x.imag() / x.real()); - return std::sqrt(r) * - Kokkos::complex(std::cos(phi * 0.5), std::sin(phi * 0.5)); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::abs; + using cl::sycl::sqrt; +#else + using std::abs; + using std::sqrt; +#endif + + RealType r = x.real(); + RealType i = x.imag(); + + if (r == RealType()) { + RealType t = sqrt(abs(i) / 2); + return Kokkos::complex(t, i < RealType() ? -t : t); + } else { + RealType t = sqrt(2 * (abs(x) + abs(r))); + RealType u = t / 2; + return r > RealType() + ? Kokkos::complex(u, i / t) + : Kokkos::complex(abs(i) / t, i < RealType() ? -u : u); + } } //! Conjugate of a complex number. @@ -725,8 +762,211 @@ KOKKOS_INLINE_FUNCTION complex conj( //! Exponential of a complex number. template KOKKOS_INLINE_FUNCTION complex exp(const complex& x) { - return std::exp(x.real()) * - complex(std::cos(x.imag()), std::sin(x.imag())); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::cos; + using cl::sycl::exp; + using cl::sycl::sin; +#else + using std::cos; + using std::exp; + using std::sin; +#endif + return exp(x.real()) * complex(cos(x.imag()), sin(x.imag())); +} + +//! natural log of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex log( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::atan; + using cl::sycl::log; +#else + using std::atan; + using std::log; +#endif + RealType phi = atan(x.imag() / x.real()); + return Kokkos::complex(log(abs(x)), phi); +} + +//! sine of a complex number. 
+template +KOKKOS_INLINE_FUNCTION Kokkos::complex sin( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::cos; + using cl::sycl::cosh; + using cl::sycl::sin; + using cl::sycl::sinh; +#else + using std::cos; + using std::cosh; + using std::sin; + using std::sinh; +#endif + return Kokkos::complex(sin(x.real()) * cosh(x.imag()), + cos(x.real()) * sinh(x.imag())); +} + +//! cosine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex cos( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::cos; + using cl::sycl::cosh; + using cl::sycl::sin; + using cl::sycl::sinh; +#else + using std::cos; + using std::cosh; + using std::sin; + using std::sinh; +#endif + return Kokkos::complex(cos(x.real()) * cosh(x.imag()), + -sin(x.real()) * sinh(x.imag())); +} + +//! tangent of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex tan( + const complex& x) { + return sin(x) / cos(x); +} + +//! hyperbolic sine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex sinh( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::cos; + using cl::sycl::cosh; + using cl::sycl::sin; + using cl::sycl::sinh; +#else + using std::cos; + using std::cosh; + using std::sin; + using std::sinh; +#endif + return Kokkos::complex(sinh(x.real()) * cos(x.imag()), + cosh(x.real()) * sin(x.imag())); +} + +//! hyperbolic cosine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex cosh( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::cos; + using cl::sycl::cosh; + using cl::sycl::sin; + using cl::sycl::sinh; +#else + using std::cos; + using std::cosh; + using std::sin; + using std::sinh; +#endif + return Kokkos::complex(cosh(x.real()) * cos(x.imag()), + sinh(x.real()) * sin(x.imag())); +} + +//! hyperbolic tangent of a complex number. 
+template +KOKKOS_INLINE_FUNCTION Kokkos::complex tanh( + const complex& x) { + return sinh(x) / cosh(x); +} + +//! inverse hyperbolic sine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex asinh( + const complex& x) { + return log(x + sqrt(x * x + RealType(1.0))); +} + +//! inverse hyperbolic cosine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex acosh( + const complex& x) { + return RealType(2.0) * log(sqrt(RealType(0.5) * (x + RealType(1.0))) + + sqrt(RealType(0.5) * (x - RealType(1.0)))); +} + +//! inverse hyperbolic tangent of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex atanh( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::atan2; + using cl::sycl::log; +#else + using std::atan2; + using std::log; +#endif + + const RealType i2 = x.imag() * x.imag(); + const RealType r = RealType(1.0) - i2 - x.real() * x.real(); + + RealType p = RealType(1.0) + x.real(); + RealType m = RealType(1.0) - x.real(); + + p = i2 + p * p; + m = i2 + m * m; + + RealType phi = atan2(RealType(2.0) * x.imag(), r); + return Kokkos::complex(RealType(0.25) * (log(p) - log(m)), + RealType(0.5) * phi); +} + +//! inverse sine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex asin( + const complex& x) { + Kokkos::complex t = + asinh(Kokkos::complex(-x.imag(), x.real())); + return Kokkos::complex(t.imag(), -t.real()); +} + +//! inverse cosine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex acos( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::acos; + +#else + using std::acos; +#endif + Kokkos::complex t = asin(x); + RealType pi_2 = acos(RealType(0.0)); + return Kokkos::complex(pi_2 - t.real(), -t.imag()); +} + +//! inverse tangent of a complex number. 
+template +KOKKOS_INLINE_FUNCTION Kokkos::complex atan( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::atan2; + using cl::sycl::log; +#else + using std::atan2; + using std::log; +#endif + const RealType r2 = x.real() * x.real(); + const RealType i = RealType(1.0) - r2 - x.imag() * x.imag(); + + RealType p = x.imag() + RealType(1.0); + RealType m = x.imag() - RealType(1.0); + + p = r2 + p * p; + m = r2 + m * m; + + return Kokkos::complex( + RealType(0.5) * atan2(RealType(2.0) * x.real(), i), + RealType(0.25) * log(p / m)); } /// This function cannot be called in a CUDA device function, diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp index 4989f2701c..2aba189487 100644 --- a/lib/kokkos/core/src/Kokkos_Concepts.hpp +++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp @@ -196,6 +196,7 @@ KOKKOS_IMPL_IS_CONCEPT(index_type) KOKKOS_IMPL_IS_CONCEPT(launch_bounds) KOKKOS_IMPL_IS_CONCEPT(thread_team_member) KOKKOS_IMPL_IS_CONCEPT(host_thread_team_member) +KOKKOS_IMPL_IS_CONCEPT(graph_kernel) } // namespace Impl diff --git a/lib/kokkos/core/src/Kokkos_CopyViews.hpp b/lib/kokkos/core/src/Kokkos_CopyViews.hpp index 78538dc7df..a27d5f0e47 100644 --- a/lib/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/lib/kokkos/core/src/Kokkos_CopyViews.hpp @@ -292,6 +292,7 @@ struct ViewCopy { ViewTypeB b; using policy_type = Kokkos::RangePolicy>; + using value_type = typename ViewTypeA::value_type; ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) @@ -301,7 +302,9 @@ struct ViewCopy { } KOKKOS_INLINE_FUNCTION - void operator()(const iType& i0) const { a(i0) = b(i0); }; + void operator()(const iType& i0) const { + a(i0) = static_cast(b(i0)); + }; }; template { Kokkos::Rank<2, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = Kokkos::MDRangePolicy>; + using value_type = typename ViewTypeA::value_type; ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const 
ExecSpace space = ExecSpace()) @@ -328,7 +332,7 @@ struct ViewCopy { KOKKOS_INLINE_FUNCTION void operator()(const iType& i0, const iType& i1) const { - a(i0, i1) = b(i0, i1); + a(i0, i1) = static_cast(b(i0, i1)); }; }; @@ -346,6 +350,7 @@ struct ViewCopy { Kokkos::Rank<3, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = Kokkos::MDRangePolicy>; + using value_type = typename ViewTypeA::value_type; ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) @@ -358,7 +363,7 @@ struct ViewCopy { KOKKOS_INLINE_FUNCTION void operator()(const iType& i0, const iType& i1, const iType& i2) const { - a(i0, i1, i2) = b(i0, i1, i2); + a(i0, i1, i2) = static_cast(b(i0, i1, i2)); }; }; @@ -1262,7 +1267,7 @@ inline void deep_copy( using ViewType = View; using exec_space_type = typename ViewType::execution_space; - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { Kokkos::Profiling::beginDeepCopy( Kokkos::Profiling::make_space_handle(ViewType::memory_space::name()), dst.label(), dst.data(), @@ -1272,7 +1277,7 @@ inline void deep_copy( if (dst.data() == nullptr) { Kokkos::fence(); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } return; @@ -1303,7 +1308,7 @@ inline void deep_copy( ViewTypeFlat::Rank, int64_t>(dst_flat, value, exec_space_type()); Kokkos::fence(); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } return; @@ -1359,7 +1364,7 @@ inline void deep_copy( } Kokkos::fence(); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } } @@ -1378,7 +1383,7 @@ inline void deep_copy( static_assert(src_traits::rank == 0, 
"ERROR: Non-rank-zero view in deep_copy( value , View )"); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { Kokkos::Profiling::beginDeepCopy( Kokkos::Profiling::make_space_handle(Kokkos::HostSpace::name()), "Scalar", &dst, @@ -1389,7 +1394,7 @@ inline void deep_copy( if (src.data() == nullptr) { Kokkos::fence(); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } return; @@ -1397,7 +1402,7 @@ inline void deep_copy( Kokkos::Impl::DeepCopy(&dst, src.data(), sizeof(ST)); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } } @@ -1424,7 +1429,7 @@ inline void deep_copy( typename src_type::non_const_value_type>::value, "deep_copy requires matching non-const destination type"); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { Kokkos::Profiling::beginDeepCopy( Kokkos::Profiling::make_space_handle(dst_memory_space::name()), dst.label(), dst.data(), @@ -1435,7 +1440,7 @@ inline void deep_copy( if (dst.data() == nullptr && src.data() == nullptr) { Kokkos::fence(); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } return; @@ -1447,7 +1452,7 @@ inline void deep_copy( dst.data(), src.data(), sizeof(value_type)); Kokkos::fence(); } - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } } @@ -1480,7 +1485,7 @@ inline void deep_copy( static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), "deep_copy requires Views of equal rank"); - if 
(Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { Kokkos::Profiling::beginDeepCopy( Kokkos::Profiling::make_space_handle(dst_memory_space::name()), dst.label(), dst.data(), @@ -1518,7 +1523,7 @@ inline void deep_copy( Kokkos::Impl::throw_runtime_exception(message); } Kokkos::fence(); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } return; @@ -1545,7 +1550,7 @@ inline void deep_copy( ((std::ptrdiff_t)dst_end == (std::ptrdiff_t)src_end) && (dst.span_is_contiguous() && src.span_is_contiguous())) { Kokkos::fence(); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } return; @@ -1626,7 +1631,7 @@ inline void deep_copy( Impl::view_copy(dst, src); Kokkos::fence(); } - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } } @@ -2424,7 +2429,7 @@ inline void deep_copy( typename dst_traits::value_type>::value, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { Kokkos::Profiling::beginDeepCopy( Kokkos::Profiling::make_space_handle(dst_memory_space::name()), dst.label(), dst.data(), @@ -2441,7 +2446,7 @@ inline void deep_copy( Kokkos::Impl::ViewFill(dst, value, space); } - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } } @@ -2464,7 +2469,7 @@ inline void deep_copy( typename dst_traits::value_type>::value, "deep_copy requires non-const type"); using dst_memory_space = typename 
dst_traits::memory_space; - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { Kokkos::Profiling::beginDeepCopy( Kokkos::Profiling::make_space_handle(dst_memory_space::name()), dst.label(), dst.data(), @@ -2484,7 +2489,7 @@ inline void deep_copy( fill_exec_space>(dst, value, fill_exec_space()); fill_exec_space().fence(); } - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } } @@ -2498,12 +2503,12 @@ inline void deep_copy( typename std::enable_if< Kokkos::Impl::is_execution_space::value && std::is_same::specialize, - void>::value>::type* = 0) { + void>::value>::type* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; static_assert(src_traits::rank == 0, "ERROR: Non-rank-zero view in deep_copy( value , View )"); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { Kokkos::Profiling::beginDeepCopy( Kokkos::Profiling::make_space_handle(Kokkos::HostSpace::name()), "(none)", &dst, @@ -2513,7 +2518,7 @@ inline void deep_copy( if (src.data() == nullptr) { exec_space.fence(); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } return; @@ -2521,7 +2526,7 @@ inline void deep_copy( Kokkos::Impl::DeepCopy( exec_space, &dst, src.data(), sizeof(ST)); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } } @@ -2548,7 +2553,7 @@ inline void deep_copy( typename src_traits::non_const_value_type>::value, "deep_copy requires matching non-const destination type"); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if 
(Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { Kokkos::Profiling::beginDeepCopy( Kokkos::Profiling::make_space_handle(dst_memory_space::name()), dst.label(), dst.data(), @@ -2558,7 +2563,7 @@ inline void deep_copy( if (dst.data() == nullptr && src.data() == nullptr) { exec_space.fence(); - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } return; @@ -2569,7 +2574,7 @@ inline void deep_copy( exec_space, dst.data(), src.data(), sizeof(typename dst_traits::value_type)); } - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } } @@ -2605,7 +2610,7 @@ inline void deep_copy( using dst_value_type = typename dst_type::value_type; using src_value_type = typename src_type::value_type; - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { Kokkos::Profiling::beginDeepCopy( Kokkos::Profiling::make_space_handle(dst_memory_space::name()), dst.label(), dst.data(), @@ -2649,7 +2654,7 @@ inline void deep_copy( Kokkos::Impl::throw_runtime_exception(message); } - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } return; @@ -2760,7 +2765,7 @@ inline void deep_copy( "deep_copy given views that would require a temporary allocation"); } } - if (Kokkos::Profiling::profileLibraryLoaded()) { + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { Kokkos::Profiling::endDeepCopy(); } } @@ -3221,7 +3226,7 @@ create_mirror_view_and_copy( using Mirror = typename Impl::MirrorViewType::view_type; std::string label = name.empty() ? 
src.label() : name; auto mirror = typename Mirror::non_const_type{ - ViewAllocateWithoutInitializing(label), src.layout()}; + view_alloc(WithoutInitializing, label), src.layout()}; deep_copy(mirror, src); return mirror; } @@ -3248,8 +3253,7 @@ typename Impl::MirrorViewType::view_type create_mirror_view( !Impl::MirrorViewType::is_same_memspace>::type* = nullptr) { using Mirror = typename Impl::MirrorViewType::view_type; - return Mirror(Kokkos::ViewAllocateWithoutInitializing(src.label()), - src.layout()); + return Mirror(view_alloc(WithoutInitializing, src.label()), src.layout()); } } /* namespace Kokkos */ diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index a1669addd6..4dac463a66 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -50,39 +50,13 @@ #include -#if defined(KOKKOS_ENABLE_SERIAL) -#include -#endif - -#if defined(KOKKOS_ENABLE_OPENMP) -#include -#endif - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) -#include -#include -#endif - -#if defined(KOKKOS_ENABLE_HPX) -#include -#endif - -#if defined(KOKKOS_ENABLE_THREADS) -#include -#endif - -#if defined(KOKKOS_ENABLE_CUDA) -#include -#endif - -#if defined(KOKKOS_ENABLE_ROCM) -#include -#endif -#if defined(KOKKOS_ENABLE_HIP) -#include -#endif +// Fundamental type description for half precision +// Should not rely on other backend infrastructure +#include +#include #include +#include #include #include #include @@ -91,11 +65,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include //---------------------------------------------------------------------------- @@ -108,16 +85,50 @@ struct InitArguments { int ndevices; int skip_device; bool disable_warnings; - - InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false) + bool tune_internals; + InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, + bool ti = false) : num_threads{nt}, num_numa{nn}, device_id{dv}, 
ndevices{-1}, skip_device{9999}, - disable_warnings{dw} {} + disable_warnings{dw}, + tune_internals{ti} {} }; +namespace Impl { + +/* ExecSpaceManager - Responsible for initializing all of the registered + * backends. Backends are registered using the register_space_initializer() + * function which should be called from a global context so that it is called + * prior to initialize_spaces() which is called from Kokkos::initialize() + */ +class ExecSpaceManager { + std::map> + exec_space_factory_list; + + public: + ExecSpaceManager() = default; + + void register_space_factory(std::string name, + std::unique_ptr ptr); + void initialize_spaces(const Kokkos::InitArguments& args); + void finalize_spaces(const bool all_spaces); + void static_fence(); + void print_configuration(std::ostream& msg, const bool detail); + static ExecSpaceManager& get_instance(); +}; + +template +int initialize_space_factory(std::string name) { + auto space_ptr = std::make_unique(); + ExecSpaceManager::get_instance().register_space_factory(name, + std::move(space_ptr)); + return 1; +} + +} // namespace Impl void initialize(int& narg, char* arg[]); void initialize(InitArguments args = InitArguments()); @@ -133,6 +144,7 @@ void post_initialize(const InitArguments& args); bool is_initialized() noexcept; bool show_warnings() noexcept; +bool tune_internals() noexcept; /** \brief Finalize the spaces that were initialized via Kokkos::initialize */ void finalize(); @@ -264,6 +276,8 @@ class ScopeGuard { // implementation of the RAII wrapper is using Kokkos::single. 
#include +// Specializations requires after core definitions +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index 7667dde4e6..7502719c73 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -87,58 +87,16 @@ namespace Kokkos { class HostSpace; ///< Memory space for main process and CPU execution spaces class AnonymousSpace; -#ifdef KOKKOS_ENABLE_HBWSPACE -namespace Experimental { -class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL - /// processor) -} -#endif - -#if defined(KOKKOS_ENABLE_SERIAL) -class Serial; ///< Execution space main process on CPU. -#endif - -#if defined(KOKKOS_ENABLE_HPX) -namespace Experimental { -class HPX; ///< Execution space with HPX back-end. -} -#endif - -#if defined(KOKKOS_ENABLE_THREADS) -class Threads; ///< Execution space with pthreads back-end. -#endif - -#if defined(KOKKOS_ENABLE_OPENMP) -class OpenMP; ///< OpenMP execution space. -#endif - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) -namespace Experimental { -class OpenMPTarget; ///< OpenMPTarget execution space. -class OpenMPTargetSpace; -} // namespace Experimental -#endif - -#if defined(KOKKOS_ENABLE_ROCM) -namespace Experimental { -class ROCmSpace; ///< Memory space on ROCm GPU -class ROCm; ///< Execution space for ROCm GPU -} // namespace Experimental -#endif - -#if defined(KOKKOS_ENABLE_HIP) -namespace Experimental { -class HIPSpace; ///< Memory space on HIP GPU -class HIP; ///< Execution space for HIP GPU -} // namespace Experimental -#endif - template struct Device; +// forward declare here so that backend initializer calls can use it. 
+struct InitArguments; + } // namespace Kokkos -#include "Cuda/Kokkos_Cuda_fwd.hpp" +// Include backend forward statements as determined by build options +#include //---------------------------------------------------------------------------- // Set the default execution space. @@ -168,9 +126,9 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Experimental::HIP; -#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM) +#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = - Experimental::ROCm; + Experimental::SYCL; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = OpenMP; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS) @@ -182,7 +140,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial; #else #error \ - "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::HIP, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." + "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::HIP, Kokkos::Experimental::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." 
#endif #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) @@ -228,8 +186,8 @@ namespace Impl { #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) && \ defined(KOKKOS_ENABLE_CUDA) using ActiveExecutionMemorySpace = Kokkos::CudaSpace; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU) -using ActiveExecutionMemorySpace = Kokkos::HostSpace; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) +using ActiveExecutionMemorySpace = Kokkos::Experimental::SYCLDeviceUSMSpace; #elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU) using ActiveExecutionMemorySpace = Kokkos::Experimental::HIPSpace; #elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) @@ -249,8 +207,17 @@ struct VerifyExecutionCanAccessMemorySpace { KOKKOS_INLINE_FUNCTION static void verify(void) {} KOKKOS_INLINE_FUNCTION static void verify(const void *) {} }; + +// Base class for exec space initializer factories +class ExecSpaceInitializerBase; + } // namespace Impl +namespace Experimental { +template +class LogicalMemorySpace; +} + } // namespace Kokkos #define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR) \ diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index dfb884e514..4a573d82c0 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -287,7 +287,7 @@ void get_crs_transpose_counts( template typename OutRowMap::value_type get_crs_row_map_from_counts( OutRowMap& out, InCounts const& in, std::string const& name) { - out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1); + out = OutRowMap(view_alloc(WithoutInitializing, name), in.size() + 1); Kokkos::Impl::CrsRowMapFromCounts functor(in, out); return functor.execute(); } diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp index a5b2182469..81e11f3f12 100644 --- a/lib/kokkos/core/src/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp @@ -62,6 +62,7 @@ #include #include #include +#include 
/*--------------------------------------------------------------------------*/ @@ -270,6 +271,20 @@ struct DeviceTypeTraits { }; } // namespace Experimental } // namespace Tools + +namespace Impl { + +class CudaSpaceInitializer : public ExecSpaceInitializerBase { + public: + CudaSpaceInitializer() = default; + ~CudaSpaceInitializer() = default; + void initialize(const InitArguments& args) final; + void finalize(const bool all_spaces) final; + void fence() final; + void print_configuration(std::ostream& msg, const bool detail) final; +}; + +} // namespace Impl } // namespace Kokkos /*--------------------------------------------------------------------------*/ @@ -281,9 +296,9 @@ namespace Impl { template <> struct MemorySpaceAccess { - enum { assignable = false }; - enum { accessible = true }; - enum { deepcopy = false }; + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = false }; }; #if defined(KOKKOS_ENABLE_CUDA_UVM) @@ -297,9 +312,9 @@ struct MemorySpaceAccess struct MemorySpaceAccess { - enum { assignable = false }; - enum { accessible = true }; - enum { deepcopy = false }; + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = false }; }; #endif @@ -307,7 +322,7 @@ struct MemorySpaceAccess struct VerifyExecutionCanAccessMemorySpace { - enum { value = true }; + enum : bool { value = true }; KOKKOS_INLINE_FUNCTION static void verify(void) {} KOKKOS_INLINE_FUNCTION static void verify(const void*) {} }; @@ -315,7 +330,7 @@ struct VerifyExecutionCanAccessMemorySpace struct VerifyExecutionCanAccessMemorySpace { - enum { value = false }; + enum : bool { value = false }; inline static void verify(void) { CudaSpace::access_error(); } inline static void verify(const void* p) { CudaSpace::access_error(p); } }; diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp index 0fb7841889..fc1c0e2f8a 100644 --- 
a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp @@ -100,6 +100,20 @@ class CudaSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + private: + template + friend class Kokkos::Experimental::LogicalMemorySpace; + void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + + public: /**\brief Return Name of the MemorySpace */ static constexpr const char* name() { return m_name; } @@ -197,6 +211,20 @@ class CudaUVMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + private: + template + friend class Kokkos::Experimental::LogicalMemorySpace; + void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + + public: /**\brief Return Name of the MemorySpace */ static constexpr const char* name() { return m_name; } @@ -254,6 +282,20 @@ class CudaHostPinnedSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + private: + template + friend class Kokkos::Experimental::LogicalMemorySpace; + void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char* arg_label, void* 
const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + + public: /**\brief Return Name of the MemorySpace */ static constexpr const char* name() { return m_name; } @@ -286,50 +328,50 @@ static_assert( template <> struct MemorySpaceAccess { - enum { assignable = false }; - enum { accessible = false }; - enum { deepcopy = true }; + enum : bool { assignable = false }; + enum : bool { accessible = false }; + enum : bool { deepcopy = true }; }; template <> struct MemorySpaceAccess { // HostSpace::execution_space != CudaUVMSpace::execution_space - enum { assignable = false }; - enum { accessible = true }; - enum { deepcopy = true }; + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; }; template <> struct MemorySpaceAccess { // HostSpace::execution_space == CudaHostPinnedSpace::execution_space - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; }; //---------------------------------------- template <> struct MemorySpaceAccess { - enum { assignable = false }; - enum { accessible = false }; - enum { deepcopy = true }; + enum : bool { assignable = false }; + enum : bool { accessible = false }; + enum : bool { deepcopy = true }; }; template <> struct MemorySpaceAccess { // CudaSpace::execution_space == CudaUVMSpace::execution_space - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; }; template <> struct MemorySpaceAccess { // CudaSpace::execution_space != CudaHostPinnedSpace::execution_space - enum { assignable = false }; - enum { accessible = true }; // CudaSpace::execution_space - enum { deepcopy = true }; + 
enum : bool { assignable = false }; + enum : bool { accessible = true }; // CudaSpace::execution_space + enum : bool { deepcopy = true }; }; //---------------------------------------- @@ -338,28 +380,28 @@ struct MemorySpaceAccess { template <> struct MemorySpaceAccess { - enum { assignable = false }; - enum { accessible = false }; // Cuda cannot access HostSpace - enum { deepcopy = true }; + enum : bool { assignable = false }; + enum : bool { accessible = false }; // Cuda cannot access HostSpace + enum : bool { deepcopy = true }; }; template <> struct MemorySpaceAccess { // CudaUVMSpace::execution_space == CudaSpace::execution_space // Can access CudaUVMSpace from Host but cannot access CudaSpace from Host - enum { assignable = false }; + enum : bool { assignable = false }; // CudaUVMSpace::execution_space can access CudaSpace - enum { accessible = true }; - enum { deepcopy = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; }; template <> struct MemorySpaceAccess { // CudaUVMSpace::execution_space != CudaHostPinnedSpace::execution_space - enum { assignable = false }; - enum { accessible = true }; // CudaUVMSpace::execution_space - enum { deepcopy = true }; + enum : bool { assignable = false }; + enum : bool { accessible = true }; // CudaUVMSpace::execution_space + enum : bool { deepcopy = true }; }; //---------------------------------------- @@ -368,23 +410,23 @@ struct MemorySpaceAccess { template <> struct MemorySpaceAccess { - enum { assignable = false }; // Cannot access from Cuda - enum { accessible = true }; // CudaHostPinnedSpace::execution_space - enum { deepcopy = true }; + enum : bool { assignable = false }; // Cannot access from Cuda + enum : bool { accessible = true }; // CudaHostPinnedSpace::execution_space + enum : bool { deepcopy = true }; }; template <> struct MemorySpaceAccess { - enum { assignable = false }; // Cannot access from Host - enum { accessible = false }; - enum { deepcopy = true }; + enum : bool { 
assignable = false }; // Cannot access from Host + enum : bool { accessible = false }; + enum : bool { deepcopy = true }; }; template <> struct MemorySpaceAccess { - enum { assignable = false }; // different execution_space - enum { accessible = true }; // same accessibility - enum { deepcopy = true }; + enum : bool { assignable = false }; // different execution_space + enum : bool { accessible = true }; // same accessibility + enum : bool { deepcopy = true }; }; //---------------------------------------- @@ -746,7 +788,7 @@ namespace Impl { template <> struct VerifyExecutionCanAccessMemorySpace { - enum { value = false }; + enum : bool { value = false }; KOKKOS_INLINE_FUNCTION static void verify(void) { Kokkos::abort("Cuda code attempted to access HostSpace memory"); } @@ -760,7 +802,7 @@ struct VerifyExecutionCanAccessMemorySpace struct VerifyExecutionCanAccessMemorySpace { - enum { value = true }; + enum : bool { value = true }; KOKKOS_INLINE_FUNCTION static void verify(void) {} KOKKOS_INLINE_FUNCTION static void verify(const void*) {} }; @@ -769,7 +811,7 @@ struct VerifyExecutionCanAccessMemorySpace struct VerifyExecutionCanAccessMemorySpace { - enum { value = true }; + enum : bool { value = true }; KOKKOS_INLINE_FUNCTION static void verify(void) {} KOKKOS_INLINE_FUNCTION static void verify(const void*) {} }; @@ -780,7 +822,7 @@ struct VerifyExecutionCanAccessMemorySpace< typename std::enable_if::value, Kokkos::CudaSpace>::type, OtherSpace> { - enum { value = false }; + enum : bool { value = false }; KOKKOS_INLINE_FUNCTION static void verify(void) { Kokkos::abort("Cuda code attempted to access unknown Space memory"); } @@ -795,7 +837,7 @@ struct VerifyExecutionCanAccessMemorySpace< template <> struct VerifyExecutionCanAccessMemorySpace { - enum { value = false }; + enum : bool { value = false }; inline static void verify(void) { CudaSpace::access_error(); } inline static void verify(const void* p) { CudaSpace::access_error(p); } }; @@ -804,7 +846,7 @@ struct 
VerifyExecutionCanAccessMemorySpace struct VerifyExecutionCanAccessMemorySpace { - enum { value = true }; + enum : bool { value = true }; inline static void verify(void) {} inline static void verify(const void*) {} }; @@ -813,7 +855,7 @@ struct VerifyExecutionCanAccessMemorySpace struct VerifyExecutionCanAccessMemorySpace { - enum { value = true }; + enum : bool { value = true }; KOKKOS_INLINE_FUNCTION static void verify(void) {} KOKKOS_INLINE_FUNCTION static void verify(const void*) {} }; @@ -844,7 +886,7 @@ class SharedAllocationRecord const unsigned sizeof_alias, void* const alloc_ptr, const size_t alloc_size); -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG static RecordBase s_root_record; #endif diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index 17eef76038..3afe081701 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -123,13 +123,19 @@ class RangePolicy : public Impl::PolicyTraits { template RangePolicy(const RangePolicy& p) - : m_space(p.m_space), + : traits(p), // base class may contain data such as desired occupancy + m_space(p.m_space), m_begin(p.m_begin), m_end(p.m_end), m_granularity(p.m_granularity), m_granularity_mask(p.m_granularity_mask) {} - inline RangePolicy() : m_space(), m_begin(0), m_end(0) {} + inline RangePolicy() + : m_space(), + m_begin(0), + m_end(0), + m_granularity(0), + m_granularity_mask(0) {} /** \brief Total range */ inline RangePolicy(const typename traits::execution_space& work_space, @@ -358,6 +364,17 @@ class TeamPolicyInternal : public Impl::PolicyTraits { */ KOKKOS_INLINE_FUNCTION int team_size() const; + /** \brief Whether the policy has an automatically determined team size + */ + inline bool impl_auto_team_size() const; + /** \brief Whether the policy has an automatically determined vector length + */ + inline bool impl_auto_vector_length() const; + + static int vector_length_max(); + + KOKKOS_INLINE_FUNCTION int 
impl_vector_length() const; + inline typename traits::index_type chunk_size() const; inline TeamPolicyInternal& set_chunk_size(int chunk_size); @@ -554,6 +571,16 @@ class TeamPolicy : internal_policy(space_, league_size_request, Kokkos::AUTO(), vector_length_request) {} + TeamPolicy(const typename traits::execution_space& space_, + int league_size_request, const Kokkos::AUTO_t&, + const Kokkos::AUTO_t&) + : internal_policy(space_, league_size_request, Kokkos::AUTO(), + Kokkos::AUTO()) {} + TeamPolicy(const typename traits::execution_space& space_, + int league_size_request, const int team_size_request, + const Kokkos::AUTO_t&) + : internal_policy(space_, league_size_request, team_size_request, + Kokkos::AUTO()) {} /** \brief Construct policy with the default instance of the execution space */ TeamPolicy(int league_size_request, int team_size_request, @@ -566,8 +593,20 @@ class TeamPolicy : internal_policy(league_size_request, Kokkos::AUTO(), vector_length_request) {} + TeamPolicy(int league_size_request, const Kokkos::AUTO_t&, + const Kokkos::AUTO_t&) + : internal_policy(league_size_request, Kokkos::AUTO(), Kokkos::AUTO()) {} + TeamPolicy(int league_size_request, const int team_size_request, + const Kokkos::AUTO_t&) + : internal_policy(league_size_request, team_size_request, + Kokkos::AUTO()) {} + template - TeamPolicy(const TeamPolicy p) : internal_policy(p) {} + TeamPolicy(const TeamPolicy p) : internal_policy(p) { + // Cannot call converting constructor in the member initializer list because + // it is not a direct base. + internal_policy::traits::operator=(p); + } private: TeamPolicy(const internal_policy& p) : internal_policy(p) {} @@ -869,32 +908,50 @@ namespace Impl { template struct PolicyPropertyAdaptor; -template +template class Policy, + class... 
Properties> struct PolicyPropertyAdaptor, - RangePolicy> { - using policy_in_t = RangePolicy; - using policy_out_t = - RangePolicy>; + Policy> { + using policy_in_t = Policy; + static_assert(is_execution_policy::value, ""); + using policy_out_t = Policy, + typename policy_in_t::traits::occupancy_control>; }; -template -struct PolicyPropertyAdaptor, - TeamPolicy> { - using policy_in_t = TeamPolicy; - using policy_out_t = - TeamPolicy>; +template