Merge remote-tracking branch 'github/develop' into refactor-testing

2022-10-14 07:29:38 -04:00
parent c312bf97ae 77740c4f07
commit 7d9076de4d
824 changed files with 49362 additions and 30151 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -105,7 +105,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
    if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.3 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.4)
      set(CMAKE_TUNE_DEFAULT "-xCOMMON-AVX512")
    else()
-      set(CMAKE_TUNE_DEFAULT "-xHost")
+      set(CMAKE_TUNE_DEFAULT "-xHost -fp-model fast=2 -no-prec-div -qoverride-limits -diag-disable=10441 -diag-disable=2196")
    endif()
  endif()
 endif()
@ -854,8 +854,11 @@ if(BUILD_SHARED_LIBS OR PKG_PYTHON)
    find_package(Python COMPONENTS Interpreter)
  endif()
  if(Python_EXECUTABLE)
-    file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/python)
-    install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} setup.py build -b ${CMAKE_BINARY_DIR}/python install --prefix=${CMAKE_INSTALL_PREFIX} --root=\$ENV{DESTDIR}/ WORKING_DIRECTORY ${LAMMPS_PYTHON_DIR})")
+    file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/python/lib)
+    file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/python/src)
+    file(COPY ${LAMMPS_SOURCE_DIR}/version.h  DESTINATION ${CMAKE_BINARY_DIR}/python/src)
+    file(COPY ${LAMMPS_PYTHON_DIR}/README ${LAMMPS_PYTHON_DIR}/pyproject.toml ${LAMMPS_PYTHON_DIR}/setup.py ${LAMMPS_PYTHON_DIR}/lammps  DESTINATION ${CMAKE_BINARY_DIR}/python/lib)
+    install(CODE "if(\"\$ENV{DESTDIR}\" STREQUAL \"\")\n execute_process(COMMAND ${Python_EXECUTABLE} -m pip install -v ${CMAKE_BINARY_DIR}/python/lib --prefix=${CMAKE_INSTALL_PREFIX})\n else()\n execute_process(COMMAND ${Python_EXECUTABLE} -m pip install -v ${CMAKE_BINARY_DIR}/python/lib --prefix=${CMAKE_INSTALL_PREFIX} --root=\$ENV{DESTDIR})\n endif()")
  endif()
 endif()

--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@ -47,8 +47,8 @@ if(DOWNLOAD_KOKKOS)
  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
  include(ExternalProject)
-  set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.6.01.tar.gz" CACHE STRING "URL for KOKKOS tarball")
-  set(KOKKOS_MD5 "0ec97fc0c356dd65bd2487defe81a7bf" CACHE STRING "MD5 checksum of KOKKOS tarball")
+  set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.00.tar.gz" CACHE STRING "URL for KOKKOS tarball")
+  set(KOKKOS_MD5 "84991eca9f066383abe119a5bc7a11c4" CACHE STRING "MD5 checksum of KOKKOS tarball")
  mark_as_advanced(KOKKOS_URL)
  mark_as_advanced(KOKKOS_MD5)
  ExternalProject_Add(kokkos_build
@ -72,7 +72,7 @@ if(DOWNLOAD_KOKKOS)
  add_dependencies(LAMMPS::KOKKOSCORE kokkos_build)
  add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build)
 elseif(EXTERNAL_KOKKOS)
-  find_package(Kokkos 3.6.01 REQUIRED CONFIG)
+  find_package(Kokkos 3.7.00 REQUIRED CONFIG)
  target_link_libraries(lammps PRIVATE Kokkos::kokkos)
  target_link_libraries(lmp PRIVATE Kokkos::kokkos)
 else()
--- a/cmake/Modules/Packages/ML-PACE.cmake
+++ b/cmake/Modules/Packages/ML-PACE.cmake
@ -1,6 +1,6 @@
-set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2021.10.25.fix2.tar.gz" CACHE STRING "URL for PACE evaluator library sources")
+set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2022.09.27.fix10Oct.tar.gz" CACHE STRING "URL for PACE evaluator library sources")

-set(PACELIB_MD5 "32394d799bc282bb57696c78c456e64f" CACHE STRING "MD5 checksum of PACE evaluator library tarball")
+set(PACELIB_MD5 "766cebcc0e5c4b8430c2f3cd202d9905" CACHE STRING "MD5 checksum of PACE evaluator library tarball")
 mark_as_advanced(PACELIB_URL)
 mark_as_advanced(PACELIB_MD5)

@ -15,23 +15,9 @@ execute_process(
 )
 get_newest_file(${CMAKE_BINARY_DIR}/lammps-user-pace-* lib-pace)

-# enforce building libyaml-cpp as static library and turn off optional features
-set(YAML_BUILD_SHARED_LIBS OFF)
-set(YAML_CPP_BUILD_CONTRIB OFF)
-set(YAML_CPP_BUILD_TOOLS OFF)
-add_subdirectory(${lib-pace}/yaml-cpp build-yaml-cpp)
-set(YAML_CPP_INCLUDE_DIR ${lib-pace}/yaml-cpp/include)
-
-file(GLOB PACE_EVALUATOR_INCLUDE_DIR ${lib-pace}/ML-PACE)
-file(GLOB PACE_EVALUATOR_SOURCES ${lib-pace}/ML-PACE/*.cpp)
-list(FILTER PACE_EVALUATOR_SOURCES EXCLUDE REGEX pair_pace.cpp)
-
-add_library(pace STATIC ${PACE_EVALUATOR_SOURCES})
+add_subdirectory(${lib-pace} build-pace)
 set_target_properties(pace PROPERTIES CXX_EXTENSIONS ON OUTPUT_NAME lammps_pace${LAMMPS_MACHINE})
-target_include_directories(pace PUBLIC ${PACE_EVALUATOR_INCLUDE_DIR} ${YAML_CPP_INCLUDE_DIR})

-
-target_link_libraries(pace PRIVATE yaml-cpp-pace)
 if(CMAKE_PROJECT_NAME STREQUAL "lammps")
  target_link_libraries(lammps PRIVATE pace)
 endif()
--- a/cmake/presets/intel.cmake
+++ b/cmake/presets/intel.cmake
@ -1,4 +1,4 @@
-# preset that will enable Intel compilers with support for MPI and OpenMP (on Linux boxes)
+# preset that will enable the classic Intel compilers with support for MPI and OpenMP (on Linux boxes)

 set(CMAKE_CXX_COMPILER "icpc" CACHE STRING "" FORCE)
 set(CMAKE_C_COMPILER "icc" CACHE STRING "" FORCE)
@ -18,11 +18,11 @@ set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE)

 unset(HAVE_OMP_H_INCLUDE CACHE)
 set(OpenMP_C "icc" CACHE STRING "" FORCE)
-set(OpenMP_C_FLAGS "-qopenmp" CACHE STRING "" FORCE)
+set(OpenMP_C_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
 set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE)
 set(OpenMP_CXX "icpc" CACHE STRING "" FORCE)
-set(OpenMP_CXX_FLAGS "-qopenmp" CACHE STRING "" FORCE)
+set(OpenMP_CXX_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
 set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
-set(OpenMP_Fortran_FLAGS "-qopenmp" CACHE STRING "" FORCE)
+set(OpenMP_Fortran_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
 set(OpenMP_omp_LIBRARY "libiomp5.so" CACHE PATH "" FORCE)

--- a/cmake/presets/oneapi.cmake
+++ b/cmake/presets/oneapi.cmake
@ -18,11 +18,11 @@ set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE)

 unset(HAVE_OMP_H_INCLUDE CACHE)
 set(OpenMP_C "icx" CACHE STRING "" FORCE)
-set(OpenMP_C_FLAGS "-qopenmp" CACHE STRING "" FORCE)
+set(OpenMP_C_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
 set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE)
 set(OpenMP_CXX "icpx" CACHE STRING "" FORCE)
-set(OpenMP_CXX_FLAGS "-qopenmp" CACHE STRING "" FORCE)
+set(OpenMP_CXX_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
 set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
-set(OpenMP_Fortran_FLAGS "-qopenmp" CACHE STRING "" FORCE)
+set(OpenMP_Fortran_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
 set(OpenMP_omp_LIBRARY "libiomp5.so" CACHE PATH "" FORCE)

--- a/doc/src/Build_extras.rst
+++ b/doc/src/Build_extras.rst
@ -483,6 +483,9 @@ They must be specified in uppercase.
   *  - **Arch-ID**
      - **HOST or GPU**
      - **Description**
+   *  - NATIVE
+      - HOST
+      - Local machine
   *  - AMDAVX
      - HOST
      - AMD 64-bit x86 CPU (AVX 1)
@ -522,9 +525,21 @@ They must be specified in uppercase.
   *  - BDW
      - HOST
      - Intel Broadwell Xeon E-class CPU (AVX 2 + transactional mem)
+   *  - SKL
+      - HOST
+      - Intel Skylake Client CPU
   *  - SKX
      - HOST
-      - Intel Sky Lake Xeon E-class HPC CPU (AVX512 + transactional mem)
+      - Intel Skylake Xeon Server CPU (AVX512)
+   *  - ICL
+      - HOST
+      - Intel Ice Lake Client CPU (AVX512)
+   *  - ICX
+      - HOST
+      - Intel Ice Lake Xeon Server CPU (AVX512)
+   *  - SPR
+      - HOST
+      - Intel Sapphire Rapids Xeon Server CPU (AVX512)
   *  - KNC
      - HOST
      - Intel Knights Corner Xeon Phi
@ -596,7 +611,10 @@ They must be specified in uppercase.
      - AMD GPU MI100 GFX908
   *  - VEGA90A
      - GPU
-      - AMD GPU
+      - AMD GPU MI200 GFX90A
+   *  - INTEL_GEN
+      - GPU
+      - SPIR64-based devices, e.g. Intel GPUs, using JIT
   *  - INTEL_DG1
      - GPU
      - Intel Iris XeMAX GPU
@ -611,9 +629,12 @@ They must be specified in uppercase.
      - Intel GPU Gen12LP
   *  - INTEL_XEHP
      - GPU
-      - Intel GPUs Xe-HP
+      - Intel GPU Xe-HP
+   *  - INTEL_PVC
+      - GPU
+      - Intel GPU Ponte Vecchio

-This list was last updated for version 3.5.0 of the Kokkos library.
+This list was last updated for version 3.7.0 of the Kokkos library.

 .. tabs::

--- a/doc/src/Build_manual.rst
+++ b/doc/src/Build_manual.rst
@ -216,7 +216,7 @@ be multiple tests run automatically:
 - A test that only standard, printable ASCII text characters are used.
  This runs the command ``env LC_ALL=C grep -n '[^ -~]' src/*.rst`` and
  thus prints all offending lines with filename and line number
-  prepended to the screen.  Special characters like greek letters
+  prepended to the screen.  Special characters like Greek letters
  (:math:`\alpha~~\sigma~~\epsilon`), super- or subscripts
  (:math:`x^2~~\mathrm{U}_{LJ}`), mathematical expressions
  (:math:`\frac{1}{2}\mathrm{N}~~x\to\infty`), or the Angstrom symbol
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@ -236,6 +236,7 @@ OPT.
   * :doc:`oxrna2/xstk <pair_oxrna2>`
   * :doc:`oxrna2/coaxstk <pair_oxrna2>`
   * :doc:`pace (k) <pair_pace>`
+   * :doc:`pace/extrapolation <pair_pace>`
   * :doc:`peri/eps <pair_peri>`
   * :doc:`peri/lps (o) <pair_peri>`
   * :doc:`peri/pmb (o) <pair_peri>`
--- a/doc/src/Fortran.rst
+++ b/doc/src/Fortran.rst
--- a/doc/src/Library_objects.rst
+++ b/doc/src/Library_objects.rst
@ -6,6 +6,7 @@ fixes, or variables in LAMMPS using the following functions:

 - :cpp:func:`lammps_extract_compute`
 - :cpp:func:`lammps_extract_fix`
+- :cpp:func:`lammps_extract_variable_datatype`
 - :cpp:func:`lammps_extract_variable`
 - :cpp:func:`lammps_set_variable`

@ -21,6 +22,11 @@ fixes, or variables in LAMMPS using the following functions:

 -----------------------

+.. doxygenfunction:: lammps_extract_variable_datatype
+   :project: progguide
+
+-----------------------
+
 .. doxygenfunction:: lammps_extract_variable
   :project: progguide

@ -36,3 +42,5 @@ fixes, or variables in LAMMPS using the following functions:
 .. doxygenenum:: _LMP_STYLE_CONST

 .. doxygenenum:: _LMP_TYPE_CONST
+
+.. doxygenenum:: _LMP_VAR_CONST
--- a/doc/src/Library_properties.rst
+++ b/doc/src/Library_properties.rst
@ -16,8 +16,8 @@ This section documents the following functions:
 --------------------

 The library interface allows the extraction of different kinds of
-information about the active simulation instance and also - in some
-cases - to apply modifications to it.  This enables combining of a
+information about the active simulation instance and also---in some
+cases---to apply modifications to it.  This enables combining of a
 LAMMPS simulation with other processing and simulation methods computed
 by the calling code, or by another code that is coupled to LAMMPS via
 the library interface.  In some cases the data returned is direct
@ -25,9 +25,9 @@ reference to the original data inside LAMMPS, cast to a void pointer.
 In that case the data needs to be cast to a suitable pointer for the
 calling program to access it, and you may need to know the correct
 dimensions and lengths.  This also means you can directly change those
-value(s) from the calling program, e.g. to modify atom positions.  Of
-course, this should be done with care.  When accessing per-atom data,
-please note that this data is the per-processor **local** data and is
+value(s) from the calling program (e.g., to modify atom positions).  Of
+course, changing values should be done with care.  When accessing per-atom
+data, please note that these data are the per-processor **local** data and are
 indexed accordingly. Per-atom data can change sizes and ordering at
 every neighbor list rebuild or atom sort event as atoms migrate between
 sub-domains and processors.
--- a/doc/src/Run_basics.rst
+++ b/doc/src/Run_basics.rst
@ -30,12 +30,13 @@ executable itself can be placed elsewhere.

 .. note::

-   The redirection operator "<" will not always work when running
-   in parallel with mpirun or mpiexec; for those systems the -in form is required.
+   The redirection operator "<" will not always work when running in
+   parallel with ``mpirun`` or ``mpiexec``; for those systems the ``-in``
+   form is required.

 As LAMMPS runs it prints info to the screen and a logfile named
-*log.lammps*\ .  More info about output is given on the
-:doc:`screen and logfile output <Run_output>` page.
+*log.lammps*\ .  More info about output is given on the :doc:`screen and
+logfile output <Run_output>` page.

 If LAMMPS encounters errors in the input script or while running a
 simulation it will print an ERROR message and stop or a WARNING
--- a/doc/src/Run_options.rst
+++ b/doc/src/Run_options.rst
@ -93,13 +93,13 @@ switch is not set (the default), LAMMPS will operate as if the KOKKOS
 package were not installed; i.e. you can run standard LAMMPS or with
 the GPU or OPENMP packages, for testing or benchmarking purposes.

-Additional optional keyword/value pairs can be specified which
-determine how Kokkos will use the underlying hardware on your
-platform.  These settings apply to each MPI task you launch via the
-"mpirun" or "mpiexec" command.  You may choose to run one or more MPI
-tasks per physical node.  Note that if you are running on a desktop
-machine, you typically have one physical node.  On a cluster or
-supercomputer there may be dozens or 1000s of physical nodes.
+Additional optional keyword/value pairs can be specified which determine
+how Kokkos will use the underlying hardware on your platform.  These
+settings apply to each MPI task you launch via the ``mpirun`` or
+``mpiexec`` command.  You may choose to run one or more MPI tasks per
+physical node.  Note that if you are running on a desktop machine, you
+typically have one physical node.  On a cluster or supercomputer there
+may be dozens or 1000s of physical nodes.

 Either the full word or an abbreviation can be used for the keywords.
 Note that the keywords do not use a leading minus sign.  I.e. the
@ -148,9 +148,9 @@ one of these 4 environment variables
   MV2_COMM_WORLD_LOCAL_RANK (Mvapich)
   OMPI_COMM_WORLD_LOCAL_RANK (OpenMPI)

-which are initialized by the "srun", "mpirun" or "mpiexec" commands.
-The environment variable setting for each MPI rank is used to assign a
-unique GPU ID to the MPI task.
+which are initialized by the ``srun``, ``mpirun``, or ``mpiexec``
+commands.  The environment variable setting for each MPI rank is used to
+assign a unique GPU ID to the MPI task.

 .. parsed-literal::

--- a/doc/src/Speed_gpu.rst
+++ b/doc/src/Speed_gpu.rst
@ -76,10 +76,11 @@ instructions.

 **Run with the GPU package from the command line:**

-The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command in MPICH does this via
-its -np and -ppn switches.  Ditto for OpenMPI via -np and -npernode.
+The ``mpirun`` or ``mpiexec`` command sets the total number of MPI tasks
+used by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the ``mpirun`` command in MPICH does this via
+its ``-np`` and ``-ppn`` switches.  Ditto for OpenMPI via ``-np`` and
+``-npernode``.

 When using the GPU package, you cannot assign more than one GPU to a
 single MPI task.  However multiple MPI tasks can share the same GPU,
@ -129,8 +130,8 @@ GPU package pair styles.

 **Or run with the GPU package by editing an input script:**

-The discussion above for the mpirun/mpiexec command, MPI tasks/node,
-and use of multiple MPI tasks/GPU is the same.
+The discussion above for the ``mpirun`` or ``mpiexec`` command, MPI
+tasks/node, and use of multiple MPI tasks/GPU is the same.

 Use the :doc:`suffix gpu <suffix>` command, or you can explicitly add an
 "gpu" suffix to individual styles in your input script, e.g.
--- a/doc/src/Speed_kokkos.rst
+++ b/doc/src/Speed_kokkos.rst
@ -72,12 +72,12 @@ See the :ref:`Build extras <kokkos>` page for instructions.
 Running LAMMPS with the KOKKOS package
 """"""""""""""""""""""""""""""""""""""

-All Kokkos operations occur within the context of an individual MPI
-task running on a single node of the machine. The total number of MPI
-tasks used by LAMMPS (one or multiple per compute node) is set in the
-usual manner via the mpirun or mpiexec commands, and is independent of
-Kokkos. E.g. the mpirun command in OpenMPI does this via its -np and
-npernode switches. Ditto for MPICH via -np and -ppn.
+All Kokkos operations occur within the context of an individual MPI task
+running on a single node of the machine. The total number of MPI tasks
+used by LAMMPS (one or multiple per compute node) is set in the usual
+manner via the ``mpirun`` or ``mpiexec`` commands, and is independent of
+Kokkos. E.g. the ``mpirun`` command in OpenMPI does this via its ``-np`` and
+``-npernode`` switches. Ditto for MPICH via ``-np`` and ``-ppn``.

 Running on a multi-core CPU
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -168,7 +168,7 @@ for your MPI installation), binding can be forced with these flags:

 .. parsed-literal::

-   OpenMPI 1.8: mpirun -np 2 --bind-to socket --map-by socket ./lmp_openmpi ...
+   OpenMPI 1.8:  mpirun -np 2 --bind-to socket --map-by socket ./lmp_openmpi ...
   Mvapich2 2.0: mpiexec -np 2 --bind-to socket --map-by socket ./lmp_mvapich ...

 For binding threads with KOKKOS OpenMP, use thread affinity environment
@ -310,7 +310,8 @@ Alternatively the effect of the "-sf" or "-pk" switches can be
 duplicated by adding the :doc:`package kokkos <package>` or :doc:`suffix kk <suffix>` commands to your input script.

 The discussion above for building LAMMPS with the KOKKOS package, the
-mpirun/mpiexec command, and setting appropriate thread are the same.
+``mpirun`` or ``mpiexec`` command, and setting appropriate thread
+properties are the same.

 You must still use the "-k on" :doc:`command-line switch <Run_options>`
 to enable the KOKKOS package, and specify its additional arguments for
--- a/doc/src/Speed_omp.rst
+++ b/doc/src/Speed_omp.rst
@ -33,8 +33,8 @@ These examples assume one or more 16-core nodes.
   mpirun -np 4 lmp_omp -sf omp -pk omp 4 -in in.script           # 4 MPI tasks, 4 threads/task
   mpirun -np 32 -ppn 4 lmp_omp -sf omp -pk omp 4 -in in.script   # 8 nodes, 4 MPI tasks/node, 4 threads/task

-The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
+The ``mpirun`` or ``mpiexec`` command sets the total number of MPI tasks
+used by LAMMPS (one or multiple per compute node) and the number of MPI
 tasks used per node.  E.g. the mpirun command in MPICH does this via
 its -np and -ppn switches.  Ditto for OpenMPI via -np and -npernode.

@ -58,8 +58,8 @@ OMP_NUM_THREADS environment variable.
 Or run with the OPENMP package by editing an input script
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""

-The discussion above for the mpirun/mpiexec command, MPI tasks/node,
-and threads/MPI task is the same.
+The discussion above for the ``mpirun`` or ``mpiexec`` command, MPI
+tasks/node, and threads/MPI task is the same.

 Use the :doc:`suffix omp <suffix>` command, or you can explicitly add an
 "omp" suffix to individual styles in your input script, e.g.
--- a/doc/src/atom_style.rst
+++ b/doc/src/atom_style.rst
@ -91,7 +91,7 @@ quantities.
 +--------------+-----------------------------------------------------+--------------------------------------+
 | *charge*     | charge                                              | atomic system with charges           |
 +--------------+-----------------------------------------------------+--------------------------------------+
-| *dielectric* | dipole, area, curvature                             | system with surface polarization     |
+| *dielectric* | normx normy normz area/patch ed em epsilon curv     | system with surface polarization     |
 +--------------+-----------------------------------------------------+--------------------------------------+
 | *dipole*     | charge and dipole moment                            | system with dipolar particles        |
 +--------------+-----------------------------------------------------+--------------------------------------+
@ -180,16 +180,21 @@ vector with the 3 diameters of the ellipsoid and a quaternion 4-vector
 with its orientation.

 For the *dielectric* style, each particle can be either a physical
-particle (e.g. an ion), or an interface particle representing a
-boundary element. For physical particles, the per-particle properties
-are the same as atom_style full.  For interface particles, in addition
-to these properties, each particle also has an area, a normal unit
-vector, a mean local curvature, the mean and difference of the
-dielectric constants of two sides of the interface, and the local
-dielectric constant at the boundary element.  The distinction between
-the physical and interface particles is only meaningful when :doc:`fix
-polarize <fix_polarize>` commands are applied to the interface
-particles.
+particle (e.g. an ion), or an interface particle representing a boundary
+element between two regions of different dielectric constant. For
+interface particles, in addition to the properties associated with
+atom_style full, each particle also should be assigned a normal unit
+vector (defined by normx, normy, normz), an area (area/patch), the
+difference and mean of the dielectric constants of two sides of the
+interface along the direction of the normal vector (ed and em), the
+local dielectric constant at the boundary element (epsilon), and a mean
+local curvature (curv).  Physical particles must be assigned these
+values, as well, but only their local dielectric constants will be used;
+see documentation for associated :doc:`pair styles <pair_dielectric>`
+and :doc:`fixes <fix_polarize>`.  The distinction between the physical
+and interface particles is only meaningful when :doc:`fix polarize
+<fix_polarize>` commands are applied to the interface particles. This
+style is part of the DIELECTRIC package.

 For the *dipole* style, a point dipole is defined for each point
 particle.  Note that if you wish the particles to be finite-size
--- a/doc/src/fix_polarize.rst
+++ b/doc/src/fix_polarize.rst
@ -16,11 +16,11 @@ Syntax

 .. parsed-literal::

-   fix ID group-ID style nevery tolerance ...
+   fix ID group-ID style nevery tolerance

 * ID, group-ID are documented in :doc:`fix <fix>` command
 * style = *polarize/bem/gmres* or *polarize/bem/icc* or *polarize/functional*
-* Nevery = this fixed is invoked every this many timesteps
+* nevery = this fix is invoked every this many timesteps
 * tolerance = the relative tolerance for the iterative solver to stop


@ -46,44 +46,53 @@ Description

 These fixes compute induced charges at the interface between two
 impermeable media with different dielectric constants. The interfaces
-need to be discretized into vertices, each representing a boundary element.
-The vertices are treated as if they were regular atoms or particles.
-:doc:`atom_style dielectric <atom_style>` should be used since it defines
-the additional properties of each interface particle such as
-interface normal vectors, element areas, and local dielectric mismatch.
-These fixes also require the use of :doc:`pair_style <pair_style>` and
-:doc:`kspace_style <kspace_style>` with the *dielectric* suffix.
-At every time step, given a configuration of the physical charges in the system
-(such as atoms and charged particles) these fixes compute and update
-the charge of the interface particles. The interfaces are allowed to move
-during the simulation with appropriate time integrators (for example,
-with :doc:`fix_rigid <fix_rigid>`).
+need to be discretized into vertices, each representing a boundary
+element.  The vertices are treated as if they were regular atoms or
+particles.  :doc:`atom_style dielectric <atom_style>` should be used
+since it defines the additional properties of each interface particle
+such as interface normal vectors, element areas, and local dielectric
+mismatch.  These fixes also require the use of :doc:`pair_style
+<pair_style>` and :doc:`kspace_style <kspace_style>` with the
+*dielectric* suffix.  At every time step, given a configuration of the
+physical charges in the system (such as atoms and charged particles)
+these fixes compute and update the charge of the interface
+particles. The interfaces are allowed to move during the simulation if
+the appropriate time integrators are also set (for example, with
+:doc:`fix_rigid <fix_rigid>`).

-Consider an interface between two media: one with dielectric constant
-of 78 (water), the other of 4 (silica). The interface is discretized
-into 2000 boundary elements, each represented by an interface particle. Suppose that
-each interface particle has a normal unit vector pointing from the silica medium to water.
-The dielectric difference along the normal vector is then 78 - 4 = 74,
-the mean dielectric value is (78 + 4) / 2 = 41. Each boundary element
-also has its area and the local mean curvature (which is used by these fixes
-for computing a correction term in the local electric field).
-To model charged interfaces, the interface particle will have a non-zero charge value,
+Consider an interface between two media: one with dielectric constant of
+78 (water), the other of 4 (silica). The interface is discretized into
+2000 boundary elements, each represented by an interface
+particle. Suppose that each interface particle has a normal unit vector
+pointing from the silica medium to water.  The dielectric difference
+along the normal vector is then 78 - 4 = 74, the mean dielectric value
+is (78 + 4) / 2 = 41. Each boundary element also has its area and the
+local mean curvature, which is used by these fixes for computing a
+correction term in the local electric field.  To model charged
+interfaces, the interface particle will have a non-zero charge value,
 coming from its area and surface charge density.

-For non-interface particles such as atoms and charged particles,
-the interface normal vectors, element area, and dielectric mismatch are
-irrelevant. Their local dielectric value is used to rescale their actual charge
-when computing the Coulombic interactions. For instance, for a cation carrying
-a charge of +2 (in charge unit) in an implicit solvent with dielectric constant of 40
-would have actual charge of +2, and a local dielectric constant value of 40.
-It is assumed that the particles cannot pass through the interface during the simulation
-so that its local dielectric constant value does not change.
+For non-interface particles such as atoms and charged particles, the
+interface normal vectors, element area, and dielectric mismatch are
+irrelevant and unused. Their local dielectric value is used internally
+to rescale their given charge when computing the Coulombic
+interactions. For instance, to simulate a cation carrying a charge of +2
+(in simulation charge units) in an implicit solvent with a dielectric
+constant of 40, the cation's charge should be set to +2 and its local
+dielectric constant property (defined in the :doc:`atom_style dielectric
+<atom_style>`) should be set to 40; there is no need to manually rescale
+charge. This will produce the proper force for any :doc:`pair_style
+<pair_style>` with the dielectric suffix.  It is assumed that the
+particles cannot pass through the interface during the simulation
+because the value of the local dielectric constant property does not
+change.

-There are some example scripts for using these fixes
-with LAMMPS in the ``examples/PACKAGES/dielectric`` directory. The README file
-therein contains specific details on the system setup. Note that the example data files
-show the additional fields (columns) needed for :doc:`atom_style dielectric <atom_style>`
-beyond the conventional fields *id*, *mol*, *type*, *q*, *x*, *y*, and *z*.
+There are some example scripts for using these fixes with LAMMPS in the
+``examples/PACKAGES/dielectric`` directory. The README file therein
+contains specific details on the system setup. Note that the example
+data files show the additional fields (columns) needed for
+:doc:`atom_style dielectric <atom_style>` beyond the conventional fields
+*id*, *mol*, *type*, *q*, *x*, *y*, and *z*.

 ----------

@ -104,22 +113,24 @@ the interface, are computed using the equation:
 * :math:`\mathbf{E}(\mathbf{s})` is the electrical field at the vertex
 * :math:`\mathbf{n}(\mathbf{s})` is the unit normal vector at the vertex pointing from medium with :math:`\epsilon_2` to that with :math:`\epsilon_1`

-Fix *polarize/bem/gmres* employs the Generalized Minimum Residual (GMRES)
-as described in :ref:`(Barros) <Barros>` to solve :math:`\sigma_b`.
+Fix *polarize/bem/gmres* employs the Generalized Minimum Residual
+(GMRES) as described in :ref:`(Barros) <Barros>` to solve
+:math:`\sigma_b`.

 Fix *polarize/bem/icc* employs the successive over-relaxation algorithm
 as described in :ref:`(Tyagi) <Tyagi>` to solve :math:`\sigma_b`.

-The iterative solvers would terminate either when the maximum relative change
-in the induced charges in consecutive iterations is below the set tolerance,
-or when the number of iterations reaches *iter_max* (see below).
+The iterative solvers would terminate either when the maximum relative
+change in the induced charges in consecutive iterations is below the set
+tolerance, or when the number of iterations reaches *itr_max* (see
+below).

-Fix *polarize/functional* employs the energy functional variation approach
-as described in :ref:`(Jadhao) <Jadhao>` to solve :math:`\sigma_b`.
+Fix *polarize/functional* employs the energy functional variation
+approach as described in :ref:`(Jadhao) <Jadhao>` to solve
+:math:`\sigma_b`.

-
-More details on the implementation of these fixes and their recommended use
-are described in :ref:`(NguyenTD) <NguyenTD>`.
+More details on the implementation of these fixes and their recommended
+use are described in :ref:`(NguyenTD) <NguyenTD>`.


 Restart, fix_modify, output, run start/stop, minimize info
@ -127,35 +138,78 @@ Restart, fix_modify, output, run start/stop, minimize info

 No information about this fix is written to :doc:`binary restart files <restart>`.

-The :doc:`fix_modify <fix_modify>` command provides certain options to
-control the induced charge solver and the initial values of the interface elements:
+The :doc:`fix_modify <fix_modify>` command provides the ability to modify certain
+settings:

  .. parsed-literal::
      *itr_max* arg
         arg = maximum number of iterations for convergence
      *dielectrics* ediff emean epsilon area charge
-         ediff = dielectric difference
-         emean = dielectric mean
-         epsilon = local dielectric value
-         aree = element area
-         charge = real interface charge
+         ediff = dielectric difference or NULL
+         emean = dielectric mean or NULL
+         epsilon = local dielectric value or NULL
+         area = element area or NULL
+         charge = real interface charge or NULL
+      *kspace* arg = yes or no
+      *rand* max seed
+         max = range of random induced charges to be generated
+         seed = random number seed to use when generating random charge
+      *mr* arg
+         arg = maximum number of q-vectors to use when solving (GMRES only)
+      *omega* arg
+         arg = relaxation parameter to use when iterating (ICC only)

-*polarize/bem/gmres* or *polarize/bem/icc* compute a global 2-element vector
-which can be accessed by various :doc:`output commands <Howto_output>`.
-The first element is the number of iterations when the solver terminates
-(of which the upper bound is set by *iter_max*). The second element is the RMS error.
+The *itr_max* keyword sets the max number of iterations to be used for
+solving each step.
+
+The *dielectrics* keyword allows properties of the atoms in group
+*group-ID* to be modified. Values passed to any of the arguments
+(*ediff*, *emean*, *epsilon*, *area*, *charge*) will override existing
+values for all atoms in the group *group-ID*. Passing NULL to any of
+these arguments will preserve the existing value. Note that setting the
+properties of the interface this way will change the properties of all
+atoms associated with the fix (all atoms in *group-ID*), so multiple fix
+and fix_modify commands would be needed to change the properties of two
+different interfaces to different values (one fix and fix_modify for
+each interface group).
+
+The *kspace* keyword turns on long range interactions.
+
+If the arguments of the *rand* keyword are set, then the atoms subject
+to this fix will be assigned a random initial charge in a uniform
+distribution from -*max*/2 to *max*/2, using random number seed *seed*.
+
+The *mr* keyword only applies to *style* = *polarize/bem/gmres*. It is
+the maximum number of q-vectors to use when solving for the surface
+charge.
+
+The *omega* keyword only applies when using *style* =
+*polarize/bem/icc*. It is a relaxation parameter defined in
+:ref:`(Tyagi) <Tyagi>` that should generally be set between 0 and 2.
+
+Note that the local dielectric constant (epsilon) can also be set
+independently using the :doc:`set <set>` command.
+
+----------
+
+*polarize/bem/gmres* or *polarize/bem/icc* compute a global 2-element
+vector which can be accessed by various :doc:`output commands
+<Howto_output>`.  The first element is the number of iterations when the
+solver terminates (of which the upper bound is set by *itr_max*). The
+second element is the RMS error.


 Restrictions
 """"""""""""

-These fixes are part of the DIELECTRIC package.  It is only enabled
+These fixes are part of the DIELECTRIC package.  They are only enabled
 if LAMMPS was built with that package, which requires that also the
 KSPACE package is installed.  See the :doc:`Build package
 <Build_package>` page for more info.

-Note that the *polarize/bem/gmres* and *polarize/bem/icc* fixes only support
-:doc:`units <units>` *lj*, *real*, *metal*, *si* and *nano* at the moment.
+Note that the *polarize/bem/gmres* and *polarize/bem/icc* fixes only
+support :doc:`units <units>` *lj*, *real*, *metal*, *si* and *nano* at
+the moment.


 Related commands
@ -171,6 +225,15 @@ Default

 *iter_max* = 20

+*kspace* = yes
+
+*omega* = 0.7 (ICC only)
+
+*mr* = \# atoms in group *group-ID* minus 1 (GMRES only)
+
+No random charge initialization happens by default.
+
+
 ----------

 .. _Barros:
--- a/doc/src/pair_dielectric.rst
+++ b/doc/src/pair_dielectric.rst
@ -76,16 +76,19 @@ Description
 """""""""""

 All these pair styles are derived from the corresponding pair styles
-without the *dielectric*\ suffix. In addition to computing atom forces
-and energies, these pair styles compute the electrical field vector
-at each atom, which are to be used in the :doc:`fix polarize <fix_polarize>` commands.
+without the *dielectric* suffix. In addition to computing atom forces
+and energies, these pair styles compute the electric field vector at
+each atom, which is intended to be used by the :doc:`fix polarize
+<fix_polarize>` commands to compute induced charges at interfaces
+between two regions of different dielectric constant.

-These pair styles should be used with :doc:`atom_style dielectric <atom_style>`,
-which uses atom charges rescaled by their local dielectric constant.
+These pair styles should be used with :doc:`atom_style dielectric
+<atom_style>`.

 The styles lj/cut/coul/long/dielectric, lj/cut/coul/msm/dielectric, and
-lj/long/coul/long/dielectric should be used with their kspace style counterparts,
-namely, pppm/dielectric, pppm/disp/dielectric, and msm/dielectric, respectively.
+lj/long/coul/long/dielectric should be used with their kspace style
+counterparts, namely, pppm/dielectric, pppm/disp/dielectric, and
+msm/dielectric, respectively.

 ----------

@ -97,24 +100,27 @@ Mixing, shift, table, tail correction, restart, rRESPA info
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""

 For atom type pairs I,J and I != J, the epsilon and sigma coefficients
-and cutoff distances for this pair style can be mixed.  The default
-mix value is *geometric*\ .  See the "pair_modify" command for details.
+and cutoff distances for this pair style can be mixed.  The default mix
+algorithm is *geometric*\ .  See the :doc:`pair_modify <pair_modify>`
+command for details.

 The :doc:`pair_modify <pair_modify>` table option is not relevant
 for this pair style.

-This pair style writes its information to :doc:`binary restart files <restart>`, so pair_style and pair_coeff commands do not need
-to be specified in an input script that reads a restart file.
+These pair styles write their information to :doc:`binary restart files
+<restart>`, so pair_style and pair_coeff commands do not need to be
+specified in an input script that reads a restart file.

-This pair style can only be used via the *pair* keyword of the
+These pair styles can only be used via the *pair* keyword of the
 :doc:`run_style respa <run_style>` command.  It does not support the
 *inner*, *middle*, *outer* keywords.

 Restrictions
 """"""""""""

-These styles are part of the DIELECTRIC package.  They are only enabled if
-LAMMPS was built with that package.  See the :doc:`Build package <Build_package>` page for more info.
+These styles are part of the DIELECTRIC package.  They are only enabled
+if LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.

 Related commands
 """"""""""""""""
--- a/doc/src/pair_pace.rst
+++ b/doc/src/pair_pace.rst
@ -1,11 +1,15 @@
 .. index:: pair_style pace
 .. index:: pair_style pace/kk
+.. index:: pair_style pace/extrapolation

 pair_style pace command
 =======================

 Accelerator Variants: *pace/kk*

+pair_style pace/extrapolation command
+=====================================
+
 Syntax
 """"""

@ -22,6 +26,10 @@ Syntax
       *recursive* = use recursive algorithm for basis functions
       *chunksize* value = number of atoms in each pass

+.. code-block:: LAMMPS
+
+   pair_style pace/extrapolation
+
 Examples
 """"""""

@ -31,50 +39,96 @@ Examples
   pair_style pace product chunksize 2048
   pair_coeff * * Cu-PBE-core-rep.ace Cu

+   pair_style pace/extrapolation
+   pair_coeff * * Cu.yaml Cu.asi Cu
+
 Description
 """""""""""

 Pair style *pace* computes interactions using the Atomic Cluster
 Expansion (ACE), which is a general expansion of the atomic energy in
-multi-body basis functions. :ref:`(Drautz) <Drautz20191>`.
-The *pace* pair style
-provides an efficient implementation that
-is described in this paper :ref:`(Lysogorskiy) <Lysogorskiy20211>`.
+multi-body basis functions. :ref:`(Drautz) <Drautz20191>`.  The *pace*
+pair style provides an efficient implementation that is described in
+this paper :ref:`(Lysogorskiy) <Lysogorskiy20211>`.

-In ACE, the total energy is decomposed into a sum over
-atomic energies. The energy of atom *i* is expressed as a
-linear or non-linear function of one or more density functions.
-By projecting the
-density onto a local atomic base, the lowest order contributions
-to the energy can be expressed as a set of scalar polynomials in
-basis function contributions summed over neighbor atoms.
+In ACE, the total energy is decomposed into a sum over atomic
+energies. The energy of atom *i* is expressed as a linear or non-linear
+function of one or more density functions.  By projecting the density
+onto a local atomic base, the lowest order contributions to the energy
+can be expressed as a set of scalar polynomials in basis function
+contributions summed over neighbor atoms.

 Only a single pair_coeff command is used with the *pace* style which
 specifies an ACE coefficient file followed by N additional arguments
-specifying the mapping of ACE elements to LAMMPS atom types,
-where N is the number of LAMMPS atom types:
+specifying the mapping of ACE elements to LAMMPS atom types, where N is
+the number of LAMMPS atom types:

 * ACE coefficient file
 * N element names = mapping of ACE elements to atom types

 Only a single pair_coeff command is used with the *pace* style which
-specifies an ACE file that fully defines the potential.
-Note that unlike for other potentials, cutoffs are
-not set in the pair_style or pair_coeff command; they are specified in
-the ACE file.
+specifies an ACE file that fully defines the potential.  Note that
+unlike for other potentials, cutoffs are not set in the pair_style or
+pair_coeff command; they are specified in the ACE file.

 The pair_style *pace* command may be followed by the optional keyword
-*product* or *recursive*, which determines which of two algorithms
-is used for the calculation of basis functions and derivatives.
-The default is *recursive*.
+*product* or *recursive*, which determines which of two algorithms is
+used for the calculation of basis functions and derivatives.  The
+default is *recursive*.

-The keyword *chunksize* is only applicable when
-using the pair style *pace* with the KOKKOS package on GPUs and is
-ignored otherwise.  This keyword controls the number of atoms
-in each pass used to compute the atomic cluster expansion and is used to
-avoid running out of memory.  For example if there are 8192 atoms in the
-simulation and the *chunksize* is set to 4096, the ACE
-calculation will be broken up into two passes (running on a single GPU).
+The keyword *chunksize* is only applicable when using the pair style
+*pace* with the KOKKOS package on GPUs and is ignored otherwise.  This
+keyword controls the number of atoms in each pass used to compute the
+atomic cluster expansion and is used to avoid running out of memory.
+For example if there are 8192 atoms in the simulation and the
+*chunksize* is set to 4096, the ACE calculation will be broken up into
+two passes (running on a single GPU).
+
+Extrapolation grade
+"""""""""""""""""""
+
+Calculation of extrapolation grade in PACE is implemented in `pair_style
+pace/extrapolation`.  It is based on the MaxVol algorithm similar to
+Moment Tensor Potential (MTP) by Shapeev et al.  and is described in
+:ref:`(Lysogorskiy2) <Lysogorskiy2022>`.  In order to compute
+extrapolation grade one needs to provide:
+
+#. ACE potential in B-basis form (`.yaml` format) and
+#. Active Set Inverted (ASI) file for corresponding potential (`.asi` format)
+
+Calculation of extrapolation grades requires matrix-vector
+multiplication for each atom and is slower than the usual `pair_style
+pace recursive`, therefore it is *not* computed by default.
+Extrapolation grade calculation is invoked by `fix pair`, which
+requests to compute `gamma`, as shown in example below:
+
+.. code-block:: LAMMPS
+
+    pair_style  pace/extrapolation
+    pair_coeff  * * Cu.yaml Cu.asi Cu
+
+    fix pace_gamma all pair 10 pace/extrapolation gamma 1
+
+    compute max_pace_gamma all reduce max f_pace_gamma
+    variable dump_skip equal "c_max_pace_gamma < 5"
+
+
+    dump pace_dump all custom 20 extrapolative_structures.dump id x y z f_pace_gamma
+    dump_modify pace_dump skip v_dump_skip
+
+    variable max_pace_gamma equal c_max_pace_gamma
+    fix extreme_extrapolation all halt 10 v_max_pace_gamma > 25
+
+Here extrapolation grade gamma is computed every 10 steps and is stored
+in `f_pace_gamma` per-atom variable.  The largest value of extrapolation
+grade among all atoms in a structure is reduced to `c_max_pace_gamma`
+variable.  Only if this value exceeds extrapolation threshold 5, then
+the structure will be dumped into `extrapolative_structures.dump` file,
+but not more often than every 20 steps.
+
+On all other steps `pair_style pace recursive` will be used.
+
+----------

 See the :doc:`pair_coeff <pair_coeff>` page for alternate ways
 to specify the path for the ACE coefficient file.
@ -90,9 +144,10 @@ specify a pair_coeff command with I != J arguments for this style.
 This pair style does not support the :doc:`pair_modify <pair_modify>`
 shift, table, and tail options.

-This pair style does not write its information to :doc:`binary restart files <restart>`, since it is stored in potential files.  Thus, you
-need to re-specify the pair_style and pair_coeff commands in an input
-script that reads a restart file.
+This pair style does not write its information to :doc:`binary restart
+files <restart>`, since it is stored in potential files.  Thus, you need
+to re-specify the pair_style and pair_coeff commands in an input script
+that reads a restart file.

 This pair style can only be used via the *pair* keyword of the
 :doc:`run_style respa <run_style>` command.  It does not support the
@ -107,19 +162,20 @@ This pair style can only be used via the *pair* keyword of the
 Restrictions
 """"""""""""

-This pair style is part of the ML-PACE package.  It is only enabled if LAMMPS
-was built with that package.
-See the :doc:`Build package <Build_package>` page for more info.
+This pair style is part of the ML-PACE package.  It is only enabled if
+LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.

 Related commands
 """"""""""""""""

-:doc:`pair_style snap  <pair_snap>`
+:doc:`pair_style snap  <pair_snap>`,
+:doc:`fix pair  <fix_pair>`

 Default
 """""""

-recursive, chunksize = 4096
+recursive, chunksize = 4096

 .. _Drautz20191:

@ -127,4 +183,8 @@ recursive, chunksize = 4096

 .. _Lysogorskiy20211:

-**(Lysogorskiy)** Lysogorskiy, van der Oord, Bochkarev, Menon, Rinaldi, Hammerschmidt, Mrovec, Thompson, Csanyi, Ortner, Drautz, TBD (2021).
+**(Lysogorskiy)** Lysogorskiy, van der Oord, Bochkarev, Menon, Rinaldi, Hammerschmidt, Mrovec, Thompson, Csanyi, Ortner, Drautz, npj Comp Mat, 7, 97 (2021).
+
+.. _Lysogorskiy2022:
+
+**(Lysogorskiy2)** Lysogorskiy, Bochkarev, Mrovec, Drautz, TBS (2022).
--- a/doc/src/pair_style.rst
+++ b/doc/src/pair_style.rst
@ -313,6 +313,7 @@ accelerated styles exist.
 * :doc:`oxrna2/stk <pair_oxrna2>` -
 * :doc:`oxrna2/xstk <pair_oxrna2>` -
 * :doc:`pace <pair_pace>` - Atomic Cluster Expansion (ACE) machine-learning potential
+* :doc:`pace/extrapolation <pair_pace>` - Atomic Cluster Expansion (ACE) machine-learning potential with extrapolation grades
 * :doc:`peri/eps <pair_peri>` - peridynamic EPS potential
 * :doc:`peri/lps <pair_peri>` - peridynamic LPS potential
 * :doc:`peri/pmb <pair_peri>` - peridynamic PMB potential
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@ -78,6 +78,7 @@ Alexey
 ali
 aliceblue
 Allinger
+allocatable
 allocator
 allocators
 allosws
@ -599,6 +600,7 @@ Cummins
 Cundall
 cundall
 Curk
+curv
 Cusentino
 customIDs
 cutbond
@ -657,6 +659,7 @@ Dcut
 de
 dE
 De
+deallocate
 deallocated
 debye
 Debye
@ -691,6 +694,7 @@ dequidt
 Dequidt
 der
 dereference
+dereferenced
 derekt
 Deresiewicz
 Derjagin
@ -1486,6 +1490,7 @@ interfacial
 interial
 interlayer
 intermolecular
+interoperable
 Interparticle
 interstitials
 intertube
@ -2387,6 +2392,7 @@ Nmols
 nn
 nnodes
 npits
+npj
 nO
 Nocedal
 nocite
@ -3163,6 +3169,7 @@ sfree
 Sg
 Shan
 Shanno
+Shapeev
 shapex
 shapey
 shapez
@ -3619,6 +3626,7 @@ Universite
 unix
 unmaintained
 unoptimized
+unordered
 unpadded
 unphysical
 unphysically
--- a/examples/PACKAGES/dielectric/README
+++ b/examples/PACKAGES/dielectric/README
@ -2,7 +2,7 @@ This folder contains some example data and input scripts for the DIELECTRIC pack

 Nguyen TD, Li H, Bagchi D, Solis FJ, Olvera de la Cruz, Incorporating surface polarization effects into large-scale coarse-grained molecular dynamics simulation, Computer Physics Communications 2019, 241, 80--91.

- data.confined  :  two point opposite charges confined between two interfaces (epsilon1=2/epsilon2=10/epsilon2=2)
+- data.confined  :  two point opposite charges confined between two interfaces (epsilon2=2/epsilon1=10/epsilon2=2)
 - data.sphere    :  two point opposite charges outside a spherical interface (epsilon_in=1/epsilon2=10)

 - in.confined    :  read in data.confined
@ -10,7 +10,7 @@ Nguyen TD, Li H, Bagchi D, Solis FJ, Olvera de la Cruz, Incorporating surface po

 For "atom_style  dielectric" the Atoms section in the data file contains 15 following columns:

-id mol type q x y z normx normy normz area_per_patch ed em epsilon curvature
+id mol type q x y z normx normy normz area/patch ed em epsilon curvature

 where 

@ -34,9 +34,13 @@ where
  For interface particles, epsilon is set to be em
    (the mean dielectric value above).

-* area_per_patch: the surface area of the patch (element).
+* area/patch: the surface area of the patch (element).
  For real charges, this value is irrelevant, can be 1.0.

 * curvature: surface mean curvature at the patch.
  For example, for spherical interfaces, curvature = 1/spherical radius.
  For planar interfaces, curvature = 0.
+
+Note that the properties normx, normy, normz, area/patch, ed, em, and curvature are not 
+used for the non-interface beads. epsilon is used to scale the charge of any non-interface
+ion, see the documentation for pair styles with the dielectric suffix and fix polarize.
--- a/examples/PACKAGES/dielectric/in.confined
+++ b/examples/PACKAGES/dielectric/in.confined
@ -7,7 +7,7 @@
 # Dielectric constants can be set to be different from the input data file

 variable epsilon1 index 20
-variable epsilon2 index 8
+variable epsilon2 index 10 

 variable    data index data.confined

--- a/fortran/README
+++ b/fortran/README
@ -1,9 +1,9 @@
-This directory contains Fortran code which interface LAMMPS as a library
-and allows the LAMMPS library interface to be invoked from Fortran codes.
-It requires a Fortran compiler that supports the Fortran 2003 standard.
+This directory contains Fortran code that acts as an interface to LAMMPS as a
+library and allows the LAMMPS library interface to be invoked from Fortran
+code.  It requires a Fortran compiler that supports the Fortran 2003 standard.

 This interface is based on and supersedes the previous Fortran interfaces
-in the examples/COUPLE/fortran* folders, but is fully supported by the
+in the examples/COUPLE/fortran* folders, but it is fully supported by the
 LAMMPS developers and included in the documentation and unit testing.

 Details on this Fortran interface and how to build programs using it
--- a/fortran/lammps.f90
+++ b/fortran/lammps.f90
--- a/lib/gpu/Makefile.oneapi
+++ b/lib/gpu/Makefile.oneapi
@ -1,5 +1,5 @@
-# /* ----------------------------------------------------------------------
-#  Generic Linux Makefile for OpenCL
+# /* ----------------------------------------------------------------------   
+#  Linux Makefile for Intel oneAPI - Mixed precision
 # ------------------------------------------------------------------------- */

 # which file will be copied to Makefile.lammps
@ -11,11 +11,14 @@ EXTRAMAKE = Makefile.lammps.opencl

 LMP_INC = -DLAMMPS_SMALLBIG

-OCL_INC =
-OCL_CPP = mpiicpc -std=c++11 -xHost -O2 -qopenmp -qopenmp-simd  -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC)
-OCL_LINK = -lOpenCL
+OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
+CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -fp-model fast=2 -no-prec-div \
+          -qoverride-limits
+OCL_CPP = mpiicpc -std=c++11 -diag-disable=10441 -DMPICH_IGNORE_CXX_SEEK \
+          $(LMP_INC) $(OCL_INC) $(CPP_OPT)
+OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
 OCL_PREC = -D_SINGLE_DOUBLE
-OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -fp-model fast=2 -no-prec-div
+OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT

 BIN_DIR = ./
 OBJ_DIR = ./
--- a/lib/gpu/README
+++ b/lib/gpu/README
@ -264,6 +264,20 @@ GERYON_KERNEL_DUMP      Dump all compiled OpenCL programs with compiler
                        flags and build logs
 GPU_CAST                Casting performed on GPU, untested recently
 THREE_CONCURRENT        Concurrent 3-body calcs in separate queues, untested
+LAL_SERIALIZE_INIT      Force serialization of initialization and compilation
+                        for multiple MPI tasks sharing the same accelerator.
+                        Some accelerator API implementations have had issues
+                        with temporary file conflicts in the past.
+GERYON_FORCE_SHARED_MAIN_MEM_ON      Should only be used for builds where the
+                                     accelerator is guaranteed to share physical
+                                     main memory with the host (e.g. integrated
+                                     GPU or CPU device). Default behavior is to
+                                     auto-detect. Impacts OpenCL only.
+GERYON_FORCE_SHARED_MAIN_MEM_OFF     Should only be used for builds where the
+                                     accelerator is guaranteed to have discrete
+                                     physical main memory vs the host (discrete
+                                     GPU card). Default behavior is to
+                                     auto-detect. Impacts OpenCL only.


 ------------------------------------------------------------------------------
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@ -126,10 +126,13 @@ class UCL_Device {
  /// Return the number of devices that support OpenCL
  inline int num_devices() { return _num_devices; }

-  /// Specify whether profiling (device timers) will be used for the device (yes=true)
+  /// Specify whether profiling (device timers) will be used (yes=true)
  /** No-op for CUDA and HIP **/
-  inline void configure_profiling(const bool profiling_on)
-    { _cq_profiling = profiling_on; }
+  inline void configure_profiling(const bool profiling_on) {
+    #ifndef GERYON_NO_OCL_MARKERS
+    _cq_profiling = profiling_on;
+    #endif
+  }

  /// Set the OpenCL device to the specified device number
  /** A context and default command queue will be created for the device *
@ -176,8 +179,8 @@ class UCL_Device {

 #ifdef CL_VERSION_2_0
    if (_cq_profiling) {
-      cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE,
-                                     0};
+      cl_queue_properties props[] = {CL_QUEUE_PROPERTIES,
+                                     CL_QUEUE_PROFILING_ENABLE, 0};
      _cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props,
                                                    &errorv);
    } else {
@ -187,8 +190,8 @@ class UCL_Device {
    }
 #else
    if (_cq_profiling)
-      _cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE,
-                                      &errorv);
+      _cq.back()=clCreateCommandQueue(_context, _cl_device,
+                                      CL_QUEUE_PROFILING_ENABLE, &errorv);
    else
      _cq.back()=clCreateCommandQueue(_context, _cl_device, 0, &errorv);
 #endif
@ -403,7 +406,11 @@ class UCL_Device {
 // Grabs the properties for all devices
 UCL_Device::UCL_Device() {
  _device=-1;
+  #ifndef GERYON_NO_OCL_MARKERS
  _cq_profiling=true;
+  #else
+  _cq_profiling=false;
+  #endif

  // --- Get Number of Platforms
  cl_uint nplatforms;
@ -482,6 +489,7 @@ int UCL_Device::set_platform(int pid) {
  _num_devices = 0;
  for (int i=0; i<num_unpart; i++) {
    cl_uint num_subdevices = 1;
+    cl_device_id *subdevice_list = device_list + i;

    #ifdef CL_VERSION_1_2
    cl_device_affinity_domain adomain;
@ -494,25 +502,29 @@ int UCL_Device::set_platform(int pid) {
    props[0]=CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
    props[1]=CL_DEVICE_AFFINITY_DOMAIN_NUMA;
    props[2]=0;
+
+    cl_int err = CL_SUCCESS;
    if (adomain & CL_DEVICE_AFFINITY_DOMAIN_NUMA)
-      CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, 0, NULL,
-                                      &num_subdevices));
-    if (num_subdevices > 1) {
-      cl_device_id *subdevice_list = new cl_device_id[num_subdevices];
-      CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices,
-                                      subdevice_list, &num_subdevices));
-      for (cl_uint j=0; j<num_subdevices; j++) {
-        _cl_devices.push_back(device_list[i]);
-        add_properties(device_list[i]);
-        _num_devices++;
+      err = clCreateSubDevices(device_list[i], props, 0, NULL,
+                               &num_subdevices);
+    if (err == CL_SUCCESS && num_subdevices > 1) {
+      subdevice_list = new cl_device_id[num_subdevices];
+      err = clCreateSubDevices(device_list[i], props, num_subdevices,
+                               subdevice_list, &num_subdevices);
+      if (err != CL_SUCCESS) {
+        delete[] subdevice_list;
+        num_subdevices = 1;
+        subdevice_list = device_list + i;
      }
-      delete[] subdevice_list;
-    } else {
-      _cl_devices.push_back(device_list[i]);
-      add_properties(device_list[i]);
-      _num_devices++;
    }
    #endif
+
+    for (cl_uint j=0; j<num_subdevices; j++) {
+      _num_devices++;
+      _cl_devices.push_back(subdevice_list[j]);
+      add_properties(subdevice_list[j]);
+    }
+    if (num_subdevices > 1) delete[] subdevice_list;
  } // for i
  #endif

@ -686,10 +698,10 @@ void UCL_Device::add_properties(cl_device_id device_list) {
    double arch = static_cast<double>(minor)/10+major;
    if (arch >= 3.0)
      op.has_shuffle_support=true;
-    op.shared_main_memory=_shared_mem_device(device_list);
  }
  delete[] buffer2;
  #endif
+  op.shared_main_memory=_shared_mem_device(device_list);

  _properties.push_back(op);
 }
--- a/lib/gpu/geryon/ocl_timer.h
+++ b/lib/gpu/geryon/ocl_timer.h
@ -27,11 +27,15 @@
 #include "ocl_macros.h"
 #include "ocl_device.h"

+#ifndef GERYON_NO_OCL_MARKERS
 #ifdef CL_VERSION_1_2
 #define UCL_OCL_MARKER(cq,event) clEnqueueMarkerWithWaitList(cq,0,nullptr,event)
 #else
 #define UCL_OCL_MARKER clEnqueueMarker
 #endif
+#else
+#define UCL_OCL_MARKER(cq,event)
+#endif

 namespace ucl_opencl {

@ -51,8 +55,10 @@ class UCL_Timer {
  inline void clear() {
    if (_initialized) {
      if (has_measured_time) {
+        #ifndef GERYON_NO_OCL_MARKERS
        clReleaseEvent(start_event);
        clReleaseEvent(stop_event);
+        #endif
        has_measured_time = false;
      }
      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
@ -76,8 +82,10 @@ class UCL_Timer {
  /// Start timing on default command queue
  inline void start() {
    if (has_measured_time) {
+      #ifndef GERYON_NO_OCL_MARKERS
      clReleaseEvent(start_event);
      clReleaseEvent(stop_event);
+      #endif
      has_measured_time = false;
    }
    UCL_OCL_MARKER(_cq,&start_event);
@ -91,17 +99,26 @@ class UCL_Timer {

  /// Block until the start event has been reached on device
  inline void sync_start() {
+    #ifndef GERYON_NO_OCL_MARKERS
+    CL_SAFE_CALL(clWaitForEvents(1,&start_event));
    if (has_measured_time) {
      clReleaseEvent(start_event);
      clReleaseEvent(stop_event);
      has_measured_time = false;
    }
-    CL_SAFE_CALL(clWaitForEvents(1,&start_event));
+    #else
+    CL_SAFE_CALL(clFinish(_cq));
+    has_measured_time = false;
+    #endif
  }

  /// Block until the stop event has been reached on device
  inline void sync_stop() {
+    #ifndef GERYON_NO_OCL_MARKERS
    CL_SAFE_CALL(clWaitForEvents(1,&stop_event));
+    #else
+    CL_SAFE_CALL(clFinish(_cq));
+    #endif
    has_measured_time = true;
  }

@ -126,6 +143,7 @@ class UCL_Timer {
  /// Return the time (ms) of last start to stop - Forces synchronization
  inline double time() {
    if(!has_measured_time) return 0.0;
+    #ifndef GERYON_NO_OCL_MARKERS
    cl_ulong tstart,tend;
    CL_SAFE_CALL(clWaitForEvents(1,&stop_event));
    CL_SAFE_CALL(clGetEventProfilingInfo(stop_event,
@ -138,6 +156,11 @@ class UCL_Timer {
    clReleaseEvent(stop_event);
    has_measured_time = false;
    return (tend-tstart)*1e-6;
+    #else
+    CL_SAFE_CALL(clFinish(_cq));
+    has_measured_time = false;
+    return 0.0;
+    #endif
  }

  /// Return the time (s) of last start to stop - Forces synchronization
--- a/lib/gpu/lal_beck_ext.cpp
+++ b/lib/gpu/lal_beck_ext.cpp
@ -76,7 +76,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
                        special_lj, inum, nall, max_nbors, maxspecial,
                        cell_size, gpu_split, screen);

-    BLMF.device->gpu_barrier();
+    BLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_born_coul_long_cs_ext.cpp
+++ b/lib/gpu/lal_born_coul_long_cs_ext.cpp
@ -84,7 +84,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                            gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
                            host_special_coul, qqrd2e, g_ewald);

-    BCLCSMF.device->gpu_barrier();
+    BCLCSMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_born_coul_long_ext.cpp
+++ b/lib/gpu/lal_born_coul_long_ext.cpp
@ -84,7 +84,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                            gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
                            host_special_coul, qqrd2e, g_ewald);

-    BORNCLMF.device->gpu_barrier();
+    BORNCLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_born_coul_wolf_cs_ext.cpp
+++ b/lib/gpu/lal_born_coul_wolf_cs_ext.cpp
@ -86,7 +86,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                            host_cut_coulsq, host_special_coul, qqrd2e,
                            alf, e_shift, f_shift);

-    BornCWCST.device->gpu_barrier();
+    BornCWCST.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_born_coul_wolf_ext.cpp
+++ b/lib/gpu/lal_born_coul_wolf_ext.cpp
@ -86,7 +86,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                            host_cut_coulsq, host_special_coul, qqrd2e,
                            alf, e_shift, f_shift);

-    BORNCWMF.device->gpu_barrier();
+    BORNCWMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_born_ext.cpp
+++ b/lib/gpu/lal_born_ext.cpp
@ -80,7 +80,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                          offset, special_lj, inum, nall, max_nbors,
                          maxspecial, cell_size, gpu_split, screen);

-    BORNMF.device->gpu_barrier();
+    BORNMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -114,7 +114,7 @@ void born_gpu_reinit(const int ntypes, double **host_rhoinv,
      BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
                    host_born3, host_a, host_c, host_d, offset);

-    BORNMF.device->gpu_barrier();
+    BORNMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_buck_coul_ext.cpp
+++ b/lib/gpu/lal_buck_coul_ext.cpp
@ -83,7 +83,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                       host_cut_ljsq, host_cut_coulsq,
                       host_special_coul, qqrd2e);

-    BUCKCMF.device->gpu_barrier();
+    BUCKCMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_buck_coul_long_ext.cpp
+++ b/lib/gpu/lal_buck_coul_long_ext.cpp
@ -82,7 +82,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                        maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
                        host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);

-    BUCKCLMF.device->gpu_barrier();
+    BUCKCLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_buck_ext.cpp
+++ b/lib/gpu/lal_buck_ext.cpp
@ -77,7 +77,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                       host_a, host_c, offset, special_lj, inum, nall, max_nbors,
                       maxspecial, cell_size, gpu_split, screen);

-    BUCKMF.device->gpu_barrier();
+    BUCKMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -110,7 +110,7 @@ void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv,
      BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
                    host_a, host_c, offset);

-    BUCKMF.device->gpu_barrier();
+    BUCKMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_charmm_ext.cpp
+++ b/lib/gpu/lal_charmm_ext.cpp
@ -88,7 +88,7 @@ int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
                          qqrd2e, cut_lj_innersq, cut_coul_innersq, denom_lj,
                          denom_coul, epsilon, sigma, mix_arithmetic);

-    CRMMF.device->gpu_barrier();
+    CRMMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_charmm_long_ext.cpp
+++ b/lib/gpu/lal_charmm_long_ext.cpp
@ -86,7 +86,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
                          qqrd2e, g_ewald,  cut_lj_innersq, denom_lj, epsilon,
                          sigma, mix_arithmetic);

-    CRMLMF.device->gpu_barrier();
+    CRMLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_colloid_ext.cpp
+++ b/lib/gpu/lal_colloid_ext.cpp
@ -83,7 +83,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                          inum, nall, max_nbors, maxspecial,
                          cell_size, gpu_split, screen);

-    COLLMF.device->gpu_barrier();
+    COLLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_coul_debye_ext.cpp
+++ b/lib/gpu/lal_coul_debye_ext.cpp
@ -74,7 +74,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq,
      init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, max_nbors,
                         maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa);

-    CDEMF.device->gpu_barrier();
+    CDEMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -103,7 +103,7 @@ void cdebye_gpu_reinit(const int ntypes, double **host_scale) {
    if (gpu_rank==i && world_me!=0)
      CDEMF.reinit(ntypes, host_scale);

-    CDEMF.device->gpu_barrier();
+    CDEMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_coul_dsf_ext.cpp
+++ b/lib/gpu/lal_coul_dsf_ext.cpp
@ -77,7 +77,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
                        gpu_split, screen, host_cut_coulsq, host_special_coul,
                        qqrd2e, e_shift, f_shift, alpha);

-    CDMF.device->gpu_barrier();
+    CDMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_coul_ext.cpp
+++ b/lib/gpu/lal_coul_ext.cpp
@ -74,7 +74,7 @@ int coul_gpu_init(const int ntypes, double **host_scale,
      init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, max_nbors,
                          maxspecial, cell_size, gpu_split, screen, qqrd2e);

-    COULMF.device->gpu_barrier();
+    COULMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -103,7 +103,7 @@ void coul_gpu_reinit(const int ntypes, double **host_scale) {
    if (gpu_rank==i && world_me!=0)
      COULMF.reinit(ntypes, host_scale);

-    COULMF.device->gpu_barrier();
+    COULMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_coul_long_cs_ext.cpp
+++ b/lib/gpu/lal_coul_long_cs_ext.cpp
@ -76,7 +76,7 @@ int clcs_gpu_init(const int ntypes, double **host_scale,
                        cell_size, gpu_split, screen, host_cut_coulsq,
                        host_special_coul, qqrd2e, g_ewald);

-    CLCSMF.device->gpu_barrier();
+    CLCSMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -105,7 +105,7 @@ void clcs_gpu_reinit(const int ntypes, double **host_scale) {
    if (gpu_rank==i && world_me!=0)
      CLCSMF.reinit(ntypes, host_scale);

-    CLCSMF.device->gpu_barrier();
+    CLCSMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_coul_long_ext.cpp
+++ b/lib/gpu/lal_coul_long_ext.cpp
@ -76,7 +76,7 @@ int cl_gpu_init(const int ntypes, double **host_scale,
                        cell_size, gpu_split, screen, host_cut_coulsq,
                        host_special_coul, qqrd2e, g_ewald);

-    CLMF.device->gpu_barrier();
+    CLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -105,7 +105,7 @@ void cl_gpu_reinit(const int ntypes, double **host_scale) {
    if (gpu_rank==i && world_me!=0)
      CLMF.reinit(ntypes, host_scale);

-    CLMF.device->gpu_barrier();
+    CLMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -328,7 +328,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
  for (int i=0; i<_procs_per_gpu; i++) {
    if (_gpu_rank==i)
      flag=compile_kernels();
-    gpu_barrier();
+    serialize_init();
  }

  // check if double precision support is available
@ -609,6 +609,10 @@ void DeviceT::init_message(FILE *screen, const char *name,
    int last=last_gpu+1;
    if (last>gpu->num_devices())
      last=gpu->num_devices();
+    if (gpu->num_platforms()>1) {
+      std::string pname=gpu->platform_name();
+      fprintf(screen,"Platform: %s\n",pname.c_str());
+    }
    for (int i=first_gpu; i<last; i++) {
      std::string sname;
      if (i==first_gpu)
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@ -217,6 +217,12 @@ class Device {
  inline int gpu_rank() const { return _gpu_rank; }
  /// MPI Barrier for gpu
  inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
+  /// Serialize GPU initialization and JIT for unsafe platforms: calls gpu_barrier() only when LAL_SERIALIZE_INIT is defined, otherwise does nothing
+  inline void serialize_init() {
+    #ifdef LAL_SERIALIZE_INIT
+    gpu_barrier();
+    #endif
+  }
  /// Return the 'mode' for acceleration: GPU_FORCE, GPU_NEIGH or GPU_HYB_NEIGH
  inline int gpu_mode() const { return _gpu_mode; }
  /// Index of first device used by a node
--- a/lib/gpu/lal_dipole_lj_ext.cpp
+++ b/lib/gpu/lal_dipole_lj_ext.cpp
@ -80,7 +80,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                         cell_size, gpu_split, screen, host_cut_ljsq,
                         host_cut_coulsq, host_special_coul, qqrd2e);

-    DPLMF.device->gpu_barrier();
+    DPLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_dipole_lj_sf_ext.cpp
+++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp
@ -80,7 +80,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                           cell_size, gpu_split, screen, host_cut_ljsq,
                           host_cut_coulsq, host_special_coul, qqrd2e);

-    DPLSFMF.device->gpu_barrier();
+    DPLSFMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_dipole_long_lj_ext.cpp
+++ b/lib/gpu/lal_dipole_long_lj_ext.cpp
@ -81,7 +81,7 @@ int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                         cell_size, gpu_split, screen, host_cut_ljsq,
                         host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);

-    DPLJMF.device->gpu_barrier();
+    DPLJMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_dpd_ext.cpp
+++ b/lib/gpu/lal_dpd_ext.cpp
@ -76,7 +76,7 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0,
                         host_cut, special_lj, false, inum, nall, max_nbors,
                         maxspecial, cell_size, gpu_split, screen);

-    DPDMF.device->gpu_barrier();
+    DPDMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_dpd_tstat_ext.cpp
+++ b/lib/gpu/lal_dpd_tstat_ext.cpp
@ -76,7 +76,7 @@ int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0,
                         host_cut, special_lj, true, inum, nall, 300,
                         maxspecial, cell_size, gpu_split, screen);

-    DPDTMF.device->gpu_barrier();
+    DPDTMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_eam_alloy_ext.cpp
+++ b/lib/gpu/lal_eam_alloy_ext.cpp
@ -90,7 +90,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq,
                         nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial,
                         cell_size, gpu_split, screen);

-    EAMALMF.device->gpu_barrier();
+    EAMALMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_eam_ext.cpp
+++ b/lib/gpu/lal_eam_ext.cpp
@ -90,7 +90,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
                         nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial,
                         cell_size, gpu_split, screen);

-    EAMMF.device->gpu_barrier();
+    EAMMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_eam_fs_ext.cpp
+++ b/lib/gpu/lal_eam_fs_ext.cpp
@ -90,7 +90,7 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq,
                         nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial,
                         cell_size, gpu_split, screen);

-    EAMFSMF.device->gpu_barrier();
+    EAMFSMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_gauss_ext.cpp
+++ b/lib/gpu/lal_gauss_ext.cpp
@ -76,7 +76,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
                        offset, special_lj, inum, nall, max_nbors, maxspecial,
                        cell_size, gpu_split, screen);

-    GLMF.device->gpu_barrier();
+    GLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -106,7 +106,7 @@ void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a,
    if (gpu_rank==i && world_me!=0)
      GLMF.reinit(ntypes, cutsq, host_a, host_b, offset);

-    GLMF.device->gpu_barrier();
+    GLMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_gayberne_ext.cpp
+++ b/lib/gpu/lal_gayberne_ext.cpp
@ -83,7 +83,7 @@ int gb_gpu_init(const int ntypes, const double gamma,
                        host_lj3, host_lj4, offset, special_lj,  inum, nall,
                        max_nbors, maxspecial, cell_size, gpu_split,  screen);

-    GBMF.device->gpu_barrier();
+    GBMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj96_ext.cpp
+++ b/lib/gpu/lal_lj96_ext.cpp
@ -76,7 +76,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                          offset, special_lj, inum,  nall, max_nbors, maxspecial,
                          cell_size, gpu_split, screen);

-    LJ96MF.device->gpu_barrier();
+    LJ96MF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj_class2_long_ext.cpp
+++ b/lib/gpu/lal_lj_class2_long_ext.cpp
@ -81,7 +81,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                          cell_size, gpu_split, screen, host_cut_ljsq,
                          host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);

-    C2CLMF.device->gpu_barrier();
+    C2CLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj_coul_debye_ext.cpp
+++ b/lib/gpu/lal_lj_coul_debye_ext.cpp
@ -81,7 +81,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                          cell_size, gpu_split, screen, host_cut_ljsq,
                          host_cut_coulsq, host_special_coul, qqrd2e, kappa);

-    LJCDMF.device->gpu_barrier();
+    LJCDMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj_coul_ext.cpp
+++ b/lib/gpu/lal_lj_coul_ext.cpp
@ -80,7 +80,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                         cell_size, gpu_split, screen, host_cut_ljsq,
                         host_cut_coulsq, host_special_coul, qqrd2e);

-    LJCMF.device->gpu_barrier();
+    LJCMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj_coul_long_ext.cpp
+++ b/lib/gpu/lal_lj_coul_long_ext.cpp
@ -81,7 +81,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                          cell_size, gpu_split, screen, host_cut_ljsq,
                          host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);

-    LJCLMF.device->gpu_barrier();
+    LJCLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -112,7 +112,7 @@ void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
    if (gpu_rank==i && world_me!=0)
      LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                    offset, host_cut_ljsq);
-    LJCLMF.device->gpu_barrier();
+    LJCLMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_lj_coul_msm_ext.cpp
+++ b/lib/gpu/lal_lj_coul_msm_ext.cpp
@ -83,7 +83,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                          cell_size, gpu_split, screen, host_cut_ljsq,
                          host_cut_coulsq, host_special_coul, order, qqrd2e);

-    LJCMLMF.device->gpu_barrier();
+    LJCMLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj_cubic_ext.cpp
+++ b/lib/gpu/lal_lj_cubic_ext.cpp
@ -80,7 +80,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq,
                              special_lj, inum, nall, max_nbors, maxspecial,
                              cell_size, gpu_split, screen);

-    LJCubicLMF.device->gpu_barrier();
+    LJCubicLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj_dsf_ext.cpp
+++ b/lib/gpu/lal_lj_dsf_ext.cpp
@ -84,7 +84,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                         host_cut_coulsq, host_special_coul, qqrd2e, e_shift,
                         f_shift, alpha);

-    LJDMF.device->gpu_barrier();
+    LJDMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj_expand_coul_long_ext.cpp
+++ b/lib/gpu/lal_lj_expand_coul_long_ext.cpp
@ -81,7 +81,7 @@ int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                          cell_size, gpu_split, screen, host_cut_ljsq,
                          host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);

-    LJECLMF.device->gpu_barrier();
+    LJECLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -112,7 +112,7 @@ void ljecl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
    if (gpu_rank==i && world_me!=0)
      LJECLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                    offset, shift, host_cut_ljsq);
-    LJECLMF.device->gpu_barrier();
+    LJECLMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_lj_expand_ext.cpp
+++ b/lib/gpu/lal_lj_expand_ext.cpp
@ -108,7 +108,7 @@ void lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
    if (gpu_rank==i && world_me!=0)
      LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                   offset, shift);
-    LJEMF.device->gpu_barrier();
+    LJEMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_lj_ext.cpp
+++ b/lib/gpu/lal_lj_ext.cpp
@ -76,7 +76,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                         offset, special_lj, inum, nall, max_nbors, maxspecial,
                         cell_size, gpu_split, screen);

-    LJLMF.device->gpu_barrier();
+    LJLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -105,7 +105,7 @@ void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (gpu_rank==i && world_me!=0)
      LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset);
-    LJLMF.device->gpu_barrier();
+    LJLMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_lj_gromacs_ext.cpp
+++ b/lib/gpu/lal_lj_gromacs_ext.cpp
@ -81,7 +81,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                           gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3,
                           host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq);

-    LJGRMMF.device->gpu_barrier();
+    LJGRMMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj_smooth_ext.cpp
+++ b/lib/gpu/lal_lj_smooth_ext.cpp
@ -80,7 +80,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                         cell_size, gpu_split, screen, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3,
                         host_ljsw4, cut_inner, cut_inner_sq);

-    LJSMTMF.device->gpu_barrier();
+    LJSMTMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -110,7 +110,7 @@ void ljsmt_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (gpu_rank==i && world_me!=0)
      LJSMTMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq);
-    LJSMTMF.device->gpu_barrier();
+    LJSMTMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_lj_spica_ext.cpp
+++ b/lib/gpu/lal_lj_spica_ext.cpp
@ -77,7 +77,7 @@ int spica_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                         host_lj4, offset, special_lj, inum, nall, max_nbors,
                         maxspecial, cell_size, gpu_split, screen);

-    CMMMF.device->gpu_barrier();
+    CMMMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj_spica_long_ext.cpp
+++ b/lib/gpu/lal_lj_spica_long_ext.cpp
@ -81,7 +81,7 @@ int spical_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                          maxspecial, cell_size, gpu_split, screen,
                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
                          qqrd2e, g_ewald);
-    CMMLMF.device->gpu_barrier();
+    CMMLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_lj_tip4p_long_ext.cpp
+++ b/lib/gpu/lal_lj_tip4p_long_ext.cpp
@ -89,7 +89,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
          host_special_coul, qqrd2e,
          g_ewald, map_size, max_same);

-    LJTIP4PLMF.device->gpu_barrier();
+    LJTIP4PLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_mie_ext.cpp
+++ b/lib/gpu/lal_mie_ext.cpp
@ -80,7 +80,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
                        offset, special_lj, inum, nall, max_nbors, maxspecial,
                        cell_size, gpu_split, screen);

-    MLMF.device->gpu_barrier();
+    MLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_morse_ext.cpp
+++ b/lib/gpu/lal_morse_ext.cpp
@ -77,7 +77,7 @@ int mor_gpu_init(const int ntypes, double **cutsq,
                         offset, special_lj, inum, nall, max_nbors, maxspecial,
                         cell_size, gpu_split, screen);

-    MORMF.device->gpu_barrier();
+    MORMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_pppm_ext.cpp
+++ b/lib/gpu/lal_pppm_ext.cpp
@ -81,7 +81,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
                           vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
                           split,success);

-    pppm.device->gpu_barrier();
+    pppm.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_re_squared_ext.cpp
+++ b/lib/gpu/lal_re_squared_ext.cpp
@ -80,7 +80,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
                        host_lj4, offset, special_lj,  inum, nall,
                        max_nbors, maxspecial, cell_size, gpu_split, screen);

-    REMF.device->gpu_barrier();
+    REMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_soft_ext.cpp
+++ b/lib/gpu/lal_soft_ext.cpp
@ -76,7 +76,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
                        special_lj, inum, nall, max_nbors, maxspecial,
                        cell_size, gpu_split, screen);

-    SLMF.device->gpu_barrier();
+    SLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -106,7 +106,7 @@ void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor,
    if (gpu_rank==i && world_me!=0)
      SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut);

-    SLMF.device->gpu_barrier();
+    SLMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_sw_ext.cpp
+++ b/lib/gpu/lal_sw_ext.cpp
@ -84,7 +84,7 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall,
                        sigma_gamma, c1, c2, c3, c4, c5, c6, lambda_epsilon,
                        costheta, map, e2param);

-    SWMF.device->gpu_barrier();
+    SWMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_table_ext.cpp
+++ b/lib/gpu/lal_table_ext.cpp
@ -76,7 +76,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
                      special_lj, inum, nall, max_nbors, maxspecial, cell_size,
                      gpu_split, screen, tabstyle, ntables, tablength);

-    TBMF.device->gpu_barrier();
+    TBMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_tersoff_ext.cpp
+++ b/lib/gpu/lal_tersoff_ext.cpp
@ -91,7 +91,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int
                        ts_c1, ts_c2, ts_c3, ts_c4, ts_c, ts_d, ts_h,
                        ts_gamma, ts_beta, ts_powern, ts_cutsq);

-    TSMF.device->gpu_barrier();
+    TSMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_tersoff_mod_ext.cpp
+++ b/lib/gpu/lal_tersoff_mod_ext.cpp
@ -91,7 +91,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall,
                        ts_c3, ts_c4, ts_c5, ts_h, ts_beta, ts_powern,
                        ts_powern_del, ts_ca1, ts_cutsq);

-    TSMMF.device->gpu_barrier();
+    TSMMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_tersoff_zbl_ext.cpp
+++ b/lib/gpu/lal_tersoff_zbl_ext.cpp
@ -102,7 +102,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall,
                        ts_ZBLcut, ts_ZBLexpscale, global_e, global_a_0,
                        global_epsilon_0, ts_cutsq);

-    TSZMF.device->gpu_barrier();
+    TSZMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_ufm_ext.cpp
+++ b/lib/gpu/lal_ufm_ext.cpp
@ -78,7 +78,7 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1,
                         offset, special_lj, inum, nall, max_nbors, maxspecial,
                         cell_size, gpu_split, screen);

-    UFMLMF.device->gpu_barrier();
+    UFMLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
@ -106,7 +106,7 @@ void ufml_gpu_reinit(const int ntypes, double **cutsq, double **host_uf1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (gpu_rank==i && world_me!=0)
      UFMLMF.reinit(ntypes, cutsq, host_uf1, host_uf2, host_uf3, offset);
-    UFMLMF.device->gpu_barrier();
+    UFMLMF.device->serialize_init();
  }
 }

--- a/lib/gpu/lal_vashishta_ext.cpp
+++ b/lib/gpu/lal_vashishta_ext.cpp
@ -89,7 +89,7 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i
                        lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw,
                        c0, costheta, bigb, big2b, bigc);

-    VashishtaMF.device->gpu_barrier();
+    VashishtaMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_yukawa_colloid_ext.cpp
+++ b/lib/gpu/lal_yukawa_colloid_ext.cpp
@ -76,7 +76,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
                            inum, nall, max_nbors, maxspecial, cell_size, gpu_split,
                            screen, kappa);

-    YKCOLLMF.device->gpu_barrier();
+    YKCOLLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_yukawa_ext.cpp
+++ b/lib/gpu/lal_yukawa_ext.cpp
@ -76,7 +76,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
                      inum, nall, max_nbors, maxspecial, cell_size,
                      gpu_split, screen);

-    YKMF.device->gpu_barrier();
+    YKMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/gpu/lal_zbl_ext.cpp
+++ b/lib/gpu/lal_zbl_ext.cpp
@ -79,7 +79,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
                         cut_globalsq, cut_innersq, cut_inner,
                         inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen);

-    ZBLMF.device->gpu_barrier();
+    ZBLMF.device->serialize_init();
    if (message)
      fprintf(screen,"Done.\n");
  }
--- a/lib/kokkos/BUILD.md
+++ b/lib/kokkos/BUILD.md
@ -27,7 +27,7 @@ When configuring your project just set:
  -DKokkos_ROOT=${kokkos_install_prefix} \
  -DCMAKE_CXX_COMPILER=${compiler_used_to_build_kokkos}
 ````
-Note: You may need the following if using some versions of CMake (e.g. 3.12):
+Note: You may need the following if your project requires a minimum CMake version older than 3.12:
 ````cmake
 cmake_policy(SET CMP0074 NEW)
 ````
@ -171,6 +171,9 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`.
 * Kokkos_ENABLE_HPX_ASYNC_DISPATCH
    * Whether HPX supports asynchronous dispatch
    * BOOL Default: OFF
+* Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC
+    * Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2). This is an experimental performance feature and currently has issues when used with UCX. See https://github.com/kokkos/kokkos/issues/4228 for more details.
+    * BOOL Default: OFF
 * Kokkos_ENABLE_LARGE_MEM_TESTS
    * Whether to perform extra large memory tests
    * BOOL_Default: OFF
@ -235,6 +238,9 @@ The following options control `find_package` paths for CMake-based TPLs:
 ## Architecture Keywords
 Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_X`.

+* Kokkos_ARCH_NATIVE
+    * Whether to optimize for the local CPU architecture
+    * BOOL Default: OFF
 * Kokkos_ARCH_AMDAVX
    * Whether to optimize for the AMDAVX architecture
    * BOOL Default: OFF
@ -310,12 +316,24 @@ Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_
 * Kokkos_ARCH_POWER9
    * Whether to optimize for the POWER9 architecture
    * BOOL Default: OFF
+* Kokkos_ARCH_ICL
+    * Whether to optimize for the ICL architecture
+    * BOOL Default: OFF
+* Kokkos_ARCH_ICX
+    * Whether to optimize for the ICX architecture
+    * BOOL Default: OFF
+* Kokkos_ARCH_SKL
+    * Whether to optimize for the SKL architecture
+    * BOOL Default: OFF
 * Kokkos_ARCH_SKX
    * Whether to optimize for the SKX architecture
    * BOOL Default: OFF
 * Kokkos_ARCH_SNB
    * Whether to optimize for the SNB architecture
    * BOOL Default: OFF
+* Kokkos_ARCH_SPR
+    * Whether to optimize for the SPR architecture
+    * BOOL Default: OFF
 * Kokkos_ARCH_TURING75
    * Whether to optimize for the TURING75 architecture
    * BOOL Default: OFF
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@ -1,5 +1,157 @@
 # Change Log

+## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00)
+
+### Features:
+- Use non-volatile `join()` member functions and `operator+=` in `parallel_reduce/scan` [\#4931](https://github.com/kokkos/kokkos/pull/4931) [\#4954](https://github.com/kokkos/kokkos/pull/4954) [\#4951](https://github.com/kokkos/kokkos/pull/4951)
+- Add `SIMD` sub package (requires C++17) [\#5016](https://github.com/kokkos/kokkos/pull/5016)
+- Add `is_finalized()` [\#5247](https://github.com/kokkos/kokkos/pull/5247)
+- Promote mathematical functions from `namespace Kokkos::Experimental` to `namespace Kokkos` [\#4791](https://github.com/kokkos/kokkos/pull/4791)
+- Promote `min`, `max`, `clamp`, `minmax` functions from `namespace Kokkos::Experimental` to `namespace Kokkos` [\#5170](https://github.com/kokkos/kokkos/pull/5170)
+- Add `round`, `logb`, `nextafter`, `copysign`, and `signbit` math functions [\#4768](https://github.com/kokkos/kokkos/pull/4768)
+- Add `HIPManagedSpace`, similar to `CudaUVMSpace` [\#5112](https://github.com/kokkos/kokkos/pull/5112)
+- Accept view construction allocation properties in `create_mirror[_view,_view_and_copy]` and `resize/realloc` [\#5125](https://github.com/kokkos/kokkos/pull/5125) [\#5095](https://github.com/kokkos/kokkos/pull/5095) [\#5035](https://github.com/kokkos/kokkos/pull/5035) [\#4805](https://github.com/kokkos/kokkos/pull/4805) [\#4844](https://github.com/kokkos/kokkos/pull/4844)
+- Allow `MemorySpace::allocate()` to be called with execution space [\#4826](https://github.com/kokkos/kokkos/pull/4826)
+- Experimental: Compile time view subscriber [\#4197](https://github.com/kokkos/kokkos/pull/4197)
+
+### Backends and Archs Enhancements:
+- Add support for Sapphire Rapids Intel architecture [\#5015](https://github.com/kokkos/kokkos/pull/5015)
+- Add support for ICX, SKL and ICL Intel architectures [\#5013](https://github.com/kokkos/kokkos/pull/5013) [\#4929](https://github.com/kokkos/kokkos/pull/4929)
+- Add arch flags for Intel GPU Ponte Vecchio [\#4932](https://github.com/kokkos/kokkos/pull/4932)
+- SYCL: require GPU if GPU architecture was set at configuration time (i.e. do not allow fallback to CPU device) [\#5264](https://github.com/kokkos/kokkos/pull/5264) [\#5222](https://github.com/kokkos/kokkos/pull/5222)
+- SYCL: Add `SYCL::sycl_queue()` for interoperability [\#5241](https://github.com/kokkos/kokkos/pull/5241)
+- SYCL: Loosen restriction for using built-in `sycl::group_broadcast` [\#4552](https://github.com/kokkos/kokkos/pull/4552)
+- SYCL: preserve address space [\#4396](https://github.com/kokkos/kokkos/pull/4396)
+- OpenMPTarget: Adding a workaround for team scan [\#5219](https://github.com/kokkos/kokkos/pull/5219)
+- OpenMPTarget: Adding logic to skip the kernel launch if `league_size=0` [\#5067](https://github.com/kokkos/kokkos/pull/5067)
+- OpenMPTarget: Make sure `Kokkos::abort()` causes abnormal program termination when called on the host-side [\#4808](https://github.com/kokkos/kokkos/pull/4808)
+- HIP: Make HIPHostPinnedSpace coarse-grained [\#5152](https://github.com/kokkos/kokkos/pull/5152)
+- Refactor OpenMP `parallel_for` implementation to use more native OpenMP constructs [\#4664](https://github.com/kokkos/kokkos/pull/4664)
+- Add option to optimize for local CPU architecture `Kokkos_ARCH_NATIVE` [\#4930](https://github.com/kokkos/kokkos/pull/4930)
+
+
+### Implemented enhancements
+- Add command line argument/environment variable to print the configuration [\#5233](https://github.com/kokkos/kokkos/pull/5233)
+- Improve error message in view memory access violations [\#4950](https://github.com/kokkos/kokkos/pull/4950)
+- Remove unnecessary fences in View initialization [\#4823](https://github.com/kokkos/kokkos/pull/4823)
+- Make `View::shmem_size()` device-callable [\#4936](https://github.com/kokkos/kokkos/pull/4936)
+- Update numerics support for `__float128` [\#5081](https://github.com/kokkos/kokkos/pull/5081)
+- Add `log10` overload for `Kokkos::complex` [\#5009](https://github.com/kokkos/kokkos/pull/5009)
+- Add `[[nodiscard]]` to `ScopeGuard` [\#5224](https://github.com/kokkos/kokkos/pull/5224)
+- Add structured binding support for `Kokkos::Array` [\#4962](https://github.com/kokkos/kokkos/pull/4962)
+- Enable accessing `Kokkos::Array` elements in constant expressions [\#4916](https://github.com/kokkos/kokkos/pull/4916)
+- Mark `as_view_of_rank_n` as KOKKOS_FUNCTION [\#5248](https://github.com/kokkos/kokkos/pull/5248)
+- Cleanup/rework fence overloads [\#5148](https://github.com/kokkos/kokkos/pull/5148)
+- Assert that `Layout` construction from extents is valid in functions taking integer extents [\#5209](https://github.com/kokkos/kokkos/pull/5209)
+- Add `fill_random` overload that takes an execution space as first argument [\#5181](https://github.com/kokkos/kokkos/pull/5181)
+- Avoid some unnecessary fences in `parallel_reduce/scan` [\#5154](https://github.com/kokkos/kokkos/pull/5154)
+- Include `KOKKOS_ENABLE_LIBDL` in options when printing configuration [\#5086](https://github.com/kokkos/kokkos/pull/5086)
+- DynRankView: make `layout()` return the same as a corresponding static View [\#5026](https://github.com/kokkos/kokkos/pull/5026)
+- Use `_mm_malloc` for icpx [\#5012](https://github.com/kokkos/kokkos/pull/5012)
+- Avoid forcing matching execution spaces in `BinSort` constructor and `sort()` [\#4919](https://github.com/kokkos/kokkos/pull/4919)
+- Check number of bins in `BinSort` [\#4890](https://github.com/kokkos/kokkos/pull/4890)
+- Improve performance in parallel STL-like algorithms [\#4887](https://github.com/kokkos/kokkos/pull/4887) [\#4886](https://github.com/kokkos/kokkos/pull/4886)
+- Disable `memset` on A64FX and launch `parallel_for` instead (performance) [\#4884](https://github.com/kokkos/kokkos/pull/4884)
+- Allow non-power-of-two team sizes for team reductions and scans [\#4809](https://github.com/kokkos/kokkos/pull/4809)
+
+#### Harmonization of Kokkos execution environment initialization:
+- Warn when unable to detect local MPI rank and user explicitly asked for it [\#5263](https://github.com/kokkos/kokkos/pull/5263)
+- Refactor parsing of command line arguments and environment variables [\#5221](https://github.com/kokkos/kokkos/pull/5221)
+- Refactor device selection at initialization [\#5211](https://github.com/kokkos/kokkos/pull/5211)
+- Rename tools settings for consistency [\#5201](https://github.com/kokkos/kokkos/pull/5201)
+- Print help only once [\#5128](https://github.com/kokkos/kokkos/pull/5128)
+- Update precedence rule in initialization [\#5130](https://github.com/kokkos/kokkos/pull/5130)
+- Warn instead of just ignoring user settings when kokkos-tools is disabled [\#5088](https://github.com/kokkos/kokkos/pull/5088)
+- Drop numa args in threads backend initialization [\#5127](https://github.com/kokkos/kokkos/pull/5127)
+- Warn users when a flag prefixed with -[-]kokkos is not recognized and do not remove it [\#5256](https://github.com/kokkos/kokkos/pull/5256)
+- Give back to Core what belongs to Core (aka moving tune_internals option from Tools back to Core) [\#5202](https://github.com/kokkos/kokkos/pull/5202)
+
+#### Build system updates:
+- `nvcc_wrapper`: filter out -pedantic-errors from nvcc options [\#5235](https://github.com/kokkos/kokkos/pull/5235)
+- `nvcc_wrapper`: add known nvcc option --source-in-ptx [\#5052](https://github.com/kokkos/kokkos/pull/5052)
+- Link libdl as interface library [\#5179](https://github.com/kokkos/kokkos/pull/5179)
+- Only show GPU architectures with enabled corresponding backend [\#5119](https://github.com/kokkos/kokkos/pull/5119)
+- Enable optional external desul build [\#5021](https://github.com/kokkos/kokkos/pull/5021) [\#5132](https://github.com/kokkos/kokkos/pull/5132)
+- Export `Kokkos_CXX_STANDARD` variable with CMake [\#5068](https://github.com/kokkos/kokkos/pull/5068)
+- Suppress warnings with nvc++ [\#5031](https://github.com/kokkos/kokkos/pull/5031)
+- Disallow multiple host architectures in CMake [\#4996](https://github.com/kokkos/kokkos/pull/4996)
+- Do not include compiler warning flags in the compile option of the cmake target [\#4989](https://github.com/kokkos/kokkos/pull/4989)
+- AOT flags for OpenMPTarget targeting Intel GPUs [\#4915](https://github.com/kokkos/kokkos/pull/4915)
+- Repurpose `Kokkos_ARCH_INTEL_GEN` for SYCL to mean JIT to be conforming with OMPT [\#4894](https://github.com/kokkos/kokkos/pull/4894)
+- Replace amdgpu-target with offload-arch [\#4874](https://github.com/kokkos/kokkos/pull/4874)
+- Do not enable `kokkos_launch_compiler` when `CMAKE_CXX_COMPILER_LAUNCHER` is set [\#4870](https://github.com/kokkos/kokkos/pull/4870)
+- Move CMake version check up [\#4797](https://github.com/kokkos/kokkos/pull/4797)
+
+### Incompatibilities:
+- Remove `KOKKOS_THREAD_LOCAL` [\#5064](https://github.com/kokkos/kokkos/pull/5064)
+- Remove `KOKKOS_ENABLE_POSIX_MEMALIGN` [\#5011](https://github.com/kokkos/kokkos/pull/5011)
+- Remove unused `KOKKOS_ENABLE_TM` [\#4995](https://github.com/kokkos/kokkos/pull/4995)
+- Remove unused cmakedefine `KOKKOS_ENABLE_COMPILER_WARNINGS` [\#4883](https://github.com/kokkos/kokkos/pull/4883)
+- Remove unused `KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK` [\#4882](https://github.com/kokkos/kokkos/pull/4882)
+- Drop Instruction Set Architecture (ISA) macros [\#4981](https://github.com/kokkos/kokkos/pull/4981)
+- Warn in `ScopeGuard` about illegal usage [\#5250](https://github.com/kokkos/kokkos/pull/5250)
+
+### Deprecations:
+- Guard against non-public header inclusion [\#5178](https://github.com/kokkos/kokkos/pull/5178)
+- Raise deprecation warnings if non empty WorkTag class is used [\#5230](https://github.com/kokkos/kokkos/pull/5230)
+- Deprecate `parallel_*` overloads taking the label as trailing argument [\#5141](https://github.com/kokkos/kokkos/pull/5141)
+- Deprecate nested types in functional [\#5185](https://github.com/kokkos/kokkos/pull/5185)
+- Deprecate `InitArguments` struct and replace it with `InitializationSettings` [\#5135](https://github.com/kokkos/kokkos/pull/5135)
+- Deprecate `finalize_all()` [\#5134](https://github.com/kokkos/kokkos/pull/5134)
+- Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120)
+- Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117)
+- Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111)
+- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059)
+- Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957)
+- Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810)
+- Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382)
+- Warn about `parallel_reduce` cases that call `join()` with volatile-qualified arguments [\#5215](https://github.com/kokkos/kokkos/pull/5215)
+
+### Bug Fixes:
+- CUDA Reductions: Fix data races reported by Nvidia `compute-sanitizer` [\#4855](https://github.com/kokkos/kokkos/pull/4855)
+- Work around Intel compiler bug [\#5301](https://github.com/kokkos/kokkos/pull/5301)
+- Avoid allocating memory for UniqueToken [\#5300](https://github.com/kokkos/kokkos/pull/5300)
+- DynamicView: Properly resize mirror instances after construction [\#5276](https://github.com/kokkos/kokkos/pull/5276)
+- Remove Kokkos::Rank limit of 6 ranks [\#5271](https://github.com/kokkos/kokkos/pull/5271)
+- Do not forget to set last element to nullptr when removing a flag in `Kokkos::initialize` [\#5272](https://github.com/kokkos/kokkos/pull/5272)
+- Fix CUDA+MSVC build issue [\#5261](https://github.com/kokkos/kokkos/pull/5261)
+- Fix `DynamicView::resize_serial` [\#5220](https://github.com/kokkos/kokkos/pull/5220)
+- Fix cmake default compiler flags for unknown compiler [\#5217](https://github.com/kokkos/kokkos/pull/5217)
+- Fix `move_backward` [\#5191](https://github.com/kokkos/kokkos/pull/5191)
+- Fixing issue 5196 - missing symbol with intel compiler [\#5207](https://github.com/kokkos/kokkos/pull/5207)
+- Preserve `KOKKOS_INVALID_INDEX` in ViewDimension and ArrayLayout construction [\#5188](https://github.com/kokkos/kokkos/pull/5188)
+- Finalize `deep_copy_space` early avoiding printing to `std::cerr` for Cuda [\#5151](https://github.com/kokkos/kokkos/pull/5151)
+- Use correct policy in Threads MDRange `parallel_reduce` [\#5123](https://github.com/kokkos/kokkos/pull/5123)
+- Fix building with NVCC as the CXX compiler while the CUDA backend is not enabled [\#5115](https://github.com/kokkos/kokkos/pull/5115)
+- OpenMPTarget Index range fix for MDRange. [\#5089](https://github.com/kokkos/kokkos/pull/5089)
+- Fix bug with CUDA's team reduction for empty ranges [\#5079](https://github.com/kokkos/kokkos/pull/5079)
+- Fix using `ZeroMemset` for Serial [\#5077](https://github.com/kokkos/kokkos/pull/5077)
+- Fix `Kokkos::Vector::push_back` for default execution space [\#5047](https://github.com/kokkos/kokkos/pull/5047)
+- ScatterView: Fix ScatterMin/ScatterMax to use proper atomics [\#5045](https://github.com/kokkos/kokkos/pull/5045)
+- Fix calling `ZeroMemset` in `deep_copy` [\#5040](https://github.com/kokkos/kokkos/pull/5040)
+- Make View self-assignment not produce double-free [\#5024](https://github.com/kokkos/kokkos/pull/5024)
+- Guard against unrecognized pragma with intel compilers [\#5019](https://github.com/kokkos/kokkos/pull/5019)
+- Fix race condition in `HIPParallelLaunch` [\#5008](https://github.com/kokkos/kokkos/pull/5008)
+- KokkosP: Fix `device_id` in profiling [\#4997](https://github.com/kokkos/kokkos/pull/4997)
+- Fix for `Kokkos::vector::insert` into empty vector with begin and end iterators [\#4988](https://github.com/kokkos/kokkos/pull/4988)
+- Fix Core header files installation [\#4984](https://github.com/kokkos/kokkos/pull/4984)
+- Fix bounds errors with `Kokkos::sort` [\#4980](https://github.com/kokkos/kokkos/pull/4980)
+- Fixup let `RangePolicy::set_chunk_size` return a reference to self [\#4918](https://github.com/kokkos/kokkos/pull/4918)
+- Fix allocating large Views [\#4907](https://github.com/kokkos/kokkos/pull/4907)
+- Fix combined reductions with `Kokkos::View` [\#4896](https://github.com/kokkos/kokkos/pull/4896)
+- Fixed `_CUDA_ARCH__` to `__CUDA_ARCH__` for CUDA LDG [\#4893](https://github.com/kokkos/kokkos/pull/4893)
+- Fixup `View::access()` truncate parameter pack [\#4876](https://github.com/kokkos/kokkos/pull/4876)
+- Fix `abort` with HIP backend for ROCm 5.0.2 and beyond [\#4873](https://github.com/kokkos/kokkos/pull/4873)
+- Fix HIP version when printing the configuration [\#4872](https://github.com/kokkos/kokkos/pull/4872)
+- Fix scratch lock array when using scratch level 1 [\#4871](https://github.com/kokkos/kokkos/pull/4871)
+- Fix Makefile.kokkos to work with fujitsu compiler [\#4867](https://github.com/kokkos/kokkos/pull/4867)
+- cmake: Correct link THREADS link option [\#4854](https://github.com/kokkos/kokkos/pull/4854)
+- UniqueToken `impl_acquire` function should be device only [\#4819](https://github.com/kokkos/kokkos/pull/4819)
+- Fix example calls to non existing static `print_configuration` [\#4806](https://github.com/kokkos/kokkos/pull/4806)
+- Fix requests for large team scratch sizes [\#4728](https://github.com/kokkos/kokkos/pull/4728)
+
+
 ## [3.6.01](https://github.com/kokkos/kokkos/tree/3.6.01) (2022-05-23)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.00...3.6.01)

--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@ -1,3 +1,4 @@
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)

 # Disable in-source builds to prevent source tree corruption.
 if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" )
@ -28,11 +29,6 @@ SET(KOKKOS_SRC_PATH      ${Kokkos_SOURCE_DIR})
 SET(KOKKOS_PATH          ${Kokkos_SOURCE_DIR})
 SET(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})

-# Needed to simplify syntax of if statements
-CMAKE_POLICY(SET CMP0054 NEW)
-# Needed to make IN_LIST a valid operator
-CMAKE_POLICY(SET CMP0057 NEW)
-
 # Is this a build as part of Trilinos?
 IF(COMMAND TRIBITS_PACKAGE_DECL)
  SET(KOKKOS_HAS_TRILINOS ON)
@ -72,7 +68,6 @@ ENDFUNCTION()
 LIST(APPEND CMAKE_MODULE_PATH cmake/Modules)

 IF(NOT KOKKOS_HAS_TRILINOS)
-  cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
  set(CMAKE_DISABLE_SOURCE_CHANGES ON)
  set(CMAKE_DISABLE_IN_SOURCE_BUILD ON)

@ -80,7 +75,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
  # downstream dependencies need to match this!
  SET(KOKKOS_COMPILE_LANGUAGE CXX)
  # use lower case here since we didn't parse options yet
-  IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE)
+  IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA)

    # Without this as a language for the package we would get a C++ compiler enabled.
    # but we still need a C++ compiler even if we build all our cpp files as CUDA only
@ -90,9 +85,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
    # days.
    SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX)

-    IF (Kokkos_ENABLE_CUDA)
-      SET(KOKKOS_COMPILE_LANGUAGE CUDA)
-    ENDIF()
+    SET(KOKKOS_COMPILE_LANGUAGE CUDA)
  ENDIF()

  IF (Spack_WORKAROUND)
@ -135,14 +128,11 @@ ENDIF()


 set(Kokkos_VERSION_MAJOR 3)
-set(Kokkos_VERSION_MINOR 6)
-set(Kokkos_VERSION_PATCH 01)
+set(Kokkos_VERSION_MINOR 7)
+set(Kokkos_VERSION_PATCH 00)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")

-MESSAGE(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
-CMAKE_POLICY(SET CMP0074 NEW)
-
 # Load either the real TriBITS or a TriBITS wrapper
 # for certain utility functions that are universal (like GLOBAL_SET)
 INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake)
@ -204,11 +194,16 @@ KOKKOS_SETUP_BUILD_ENVIRONMENT()
 OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF)

 SET(KOKKOS_EXT_LIBRARIES Kokkos::kokkos Kokkos::kokkoscore Kokkos::kokkoscontainers Kokkos::kokkosalgorithms)
-SET(KOKKOS_INT_LIBRARIES kokkos kokkoscore kokkoscontainers kokkosalgorithms)
+SET(KOKKOS_SUB_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms)
+IF (KOKKOS_CXX_STANDARD GREATER_EQUAL 17)
+  LIST(APPEND KOKKOS_EXT_LIBRARIES Kokkos::kokkossimd)
+  LIST(APPEND KOKKOS_SUB_LIBRARIES kokkossimd)
+ENDIF()
+SET(KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_SUB_LIBRARIES})
 SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES ${KOKKOS_INT_LIBRARIES})

 IF (KOKKOS_HAS_TRILINOS)
-  SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
+  SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
  SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR})
  SET(KOKKOS_IS_SUBDIRECTORY TRUE)
 ELSEIF(HAS_PARENT)
@ -296,7 +291,7 @@ IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING)
  #Make sure in-tree projects can reference this as Kokkos::
  #to match the installed target names
  ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos)
-  TARGET_LINK_LIBRARIES(kokkos INTERFACE kokkoscore kokkoscontainers kokkosalgorithms)
+  TARGET_LINK_LIBRARIES(kokkos INTERFACE ${KOKKOS_SUB_LIBRARIES})
  KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos)
 ENDIF()
 INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake)
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@ -11,21 +11,21 @@ CXXFLAGS += $(SHFLAGS)
 endif

 KOKKOS_VERSION_MAJOR = 3
-KOKKOS_VERSION_MINOR = 6
-KOKKOS_VERSION_PATCH = 01
+KOKKOS_VERSION_MINOR = 7
+KOKKOS_VERSION_PATCH = 00
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)

 # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
 KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Threads"
 # Options:
-# Intel:    KNC,KNL,SNB,HSW,BDW,SKX
+# Intel:    KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
 # NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM:      BGQ,Power7,Power8,Power9
 # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
-# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP
+# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC
 KOKKOS_ARCH ?= ""
 # Options: yes,no
 KOKKOS_DEBUG ?= "no"
@ -193,6 +193,8 @@ KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VE
 KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang)
 KOKKOS_INTERNAL_COMPILER_HCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
 KOKKOS_INTERNAL_COMPILER_GCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC)
+# TODO fujitsu can emulate gcc or clang. Only clang mode works at the moment.
+KOKKOS_INTERNAL_COMPILER_FUJITSU     := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),FUJITSU)

 # Check Host Compiler if using NVCC through nvcc_wrapper
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
@ -221,8 +223,23 @@ endif
 ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
  KOKKOS_INTENAL_COMPILER_CLANG = 0
 endif
+# Fujitsu passes also as clang and gcc respectively
+ifeq ($(KOKKOS_INTERNAL_COMPILER_FUJITSU), 1)
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
+    # TODO  handle gcc flags and workaround for bug?
+    # fujitsu (gcc mode) is bugged, see https://github.com/kokkos/kokkos/issues/4730
+    $(warning Warning: ${CXX} in Trad Mode '-Nnoclang' (default) is not recommended. Use 'CXX = ${CXX} -Nclang' instead.)
+    # HACK since fujitsu only accepts some gcc flags, disable gcc here?
+    # KOKKOS_INTERNAL_COMPILER_GCC = 0
+  endif
+  # TODO handle clang flags
+  # warnings: works fine as is
+  # openmp: handled
+  #KOKKOS_INTERNAL_COMPILER_CLANG = 0
+endif

 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+  # TODO empty variable if fujitsu (clang mode) passes as clang
  KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell $(CXX) --version | grep version | cut -d ' ' -f3 | tr -d '.')

  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
@ -272,7 +289,12 @@ else
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 1)
    KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
    else
-    KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_FUJITSU), 1)
+      # fujitsu (clang mode) fails with `=libomp`
+      KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+      else
+      KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
+      endif
    endif
  else
    ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
@ -300,11 +322,15 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
 else
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
    #KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_BUG_WORKAROUND_IBM_CLANG_OMP45_VIEW_INIT -fopenmp-implicit-declare-target -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp -fopenmp=libomp
-    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_WORKAROUND_OPENMPTARGET_CLANG -fopenmp -fopenmp=libomp
+    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_WORKAROUND_OPENMPTARGET_CLANG -fopenmp -fopenmp=libomp -Wno-openmp-mapping
    KOKKOS_INTERNAL_OPENMPTARGET_LIB := -lomptarget
  else
-    #Assume GCC
-    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fopenmp -foffload=nvptx-none
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1)
+      KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fiopenmp -Wno-openmp-mapping
+    else
+      #Assume GCC
+      KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fopenmp -foffload=nvptx-none
+    endif
  endif
 endif

@ -344,8 +370,12 @@ KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM)
 KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB)
 KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW)
 KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW)
+KOKKOS_INTERNAL_USE_ARCH_SKL := $(call kokkos_has_string,$(KOKKOS_ARCH),SKL)
 KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX)
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL)
+KOKKOS_INTERNAL_USE_ARCH_ICL := $(call kokkos_has_string,$(KOKKOS_ARCH),ICL)
+KOKKOS_INTERNAL_USE_ARCH_ICX := $(call kokkos_has_string,$(KOKKOS_ARCH),ICX)
+KOKKOS_INTERNAL_USE_ARCH_SPR := $(call kokkos_has_string,$(KOKKOS_ARCH),SPR)

 KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen)
 KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9)
@ -353,6 +383,7 @@ KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),
 KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP)
 KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1)
 KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC := $(call kokkos_has_string,$(KOKKOS_ARCH),PVC)

 # NVIDIA based.
 NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper
@ -436,19 +467,9 @@ KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_W
 KOKKOS_INTERNAL_USE_ARCH_AVX        := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
 KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
-KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))
-
-# Decide what ISA level we are able to support.
-KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
-KOKKOS_INTERNAL_USE_ISA_KNC       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
-KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
-KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
-
-# Decide whether we can support transactional memory
-KOKKOS_INTERNAL_USE_TM            := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))

 # Incompatible flags?
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc)
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc)
 KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc)

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@ -551,36 +572,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SERIAL")
 endif

-ifeq ($(KOKKOS_INTERNAL_USE_TM), 1)
-  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TM")
-  tmp := $(call kokkos_append_header,"$H""endif")
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
-  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_X86_64")
-  tmp := $(call kokkos_append_header,"$H""endif")
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
-  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_KNC")
-  tmp := $(call kokkos_append_header,"$H""endif")
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
-  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_POWERPCLE")
-  tmp := $(call kokkos_append_header,"$H""endif")
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCBE), 1)
-  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_POWERPCBE")
-  tmp := $(call kokkos_append_header,"$H""endif")
-endif
-
 #only add the c++ standard flags if this is not CMake
 tmp := $(call kokkos_append_header,"/* General Settings */")
 ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1)
@ -1041,7 +1032,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
  endif
 endif

-ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SKL), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xSKYLAKE
+    KOKKOS_LDFLAGS  += -xSKYLAKE
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+
+      else
+        # Nothing here yet.
+	KOKKOS_CXXFLAGS += -march=skylake
+	KOKKOS_LDFLAGS  += -march=skylake
+      endif
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SKX), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
@ -1055,13 +1067,31 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)

      else
        # Nothing here yet.
-        KOKKOS_CXXFLAGS += -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
-        KOKKOS_LDFLAGS  += -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
+        KOKKOS_CXXFLAGS += -march=skylake-avx512 -mtune=skylake-avx512
+        KOKKOS_LDFLAGS  += -march=skylake-avx512 -mtune=skylake-avx512
      endif
    endif
  endif
 endif

+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ICL), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
+  KOKKOS_CXXFLAGS += -march=icelake-client -mtune=icelake-client
+  KOKKOS_LDFLAGS += -march=icelake-client -mtune=icelake-client
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ICX), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
+  KOKKOS_CXXFLAGS += -march=icelake-server -mtune=icelake-server
+  KOKKOS_LDFLAGS += -march=icelake-server -mtune=icelake-server
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SPR), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
+  KOKKOS_CXXFLAGS += -march=sapphirerapids -mtune=sapphirerapids
+  KOKKOS_LDFLAGS += -march=sapphirerapids -mtune=sapphirerapids
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KNC")
  KOKKOS_CXXFLAGS += -mmic
@ -1091,7 +1121,7 @@ endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-fopenmp-targets=nvptx64 -Xopenmp-target -march
  endif
  KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
 endif
@ -1192,29 +1222,29 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA900), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA900")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx900
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx900
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx906
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA908), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx90a
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a
  endif


  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp)
  ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
-    KOKKOS_SRC += $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_HIP.cpp
+    KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
  endif

  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG)
@ -1230,51 +1260,67 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  endif
 endif

-# Figure out the architecture flag for SYCL.
+# Figure out Intel GPU architecture flags (shared by the SYCL and OpenMPTarget backends).
 ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
-  # Lets start with adding architecture defines
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-"
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11"
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp"
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1"
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp"
-  endif
+  KOKKOS_INTERNAL_LC_BACKEND := sycl
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  KOKKOS_INTERNAL_LC_BACKEND := openmp
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device gen9"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device gen11"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device gen12lp"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device dg1"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device xehp"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_PVC")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device 12.4.0"
+endif

+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.hpp)

-  KOKKOS_CXXFLAGS+=-fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda
-  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+  KOKKOS_CXXFLAGS+=-fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda -fsycl-dead-args-optimization
+  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG)
  KOKKOS_LDFLAGS+=-fsycl
-  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) -D__STRICT_ANSI__
+  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG)
 endif

 ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS")
+  KOKKOS_CPPFLAGS+=-I$(KOKKOS_PATH)/tpls/desul/include
 else
  ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
    $(error Contradictory Desul atomics options: KOKKOS_OPTIONS=$(KOKKOS_OPTIONS) )
@ -1359,7 +1405,7 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
  ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
-    KOKKOS_SRC += $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+    KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
  endif
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
  ifneq ($(CUDA_PATH),)
@ -1417,6 +1463,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
  KOKKOS_TPL_LIBRARY_NAMES += pthread
 endif

+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Serial/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Serial/*.hpp)
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.hpp)
@ -1449,15 +1500,6 @@ ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC))
 endif

-# Don't include Kokkos_Serial.cpp or Kokkos_Serial_Task.cpp if not using Serial
-# device to avoid a link warning.
-ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-endif
-ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp,$(KOKKOS_SRC))
-  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp,$(KOKKOS_SRC))
-endif
-
 # With Cygwin functions such as fdopen and fileno are not defined
 # when strict ansi is enabled. strict ansi gets enabled with -std=c++14
 # though. So we hard undefine it here. Not sure if that has any bad side effects
--- a/lib/kokkos/Makefile.targets
+++ b/lib/kokkos/Makefile.targets
@ -16,10 +16,6 @@ Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ho
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp
 Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
-Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
-Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
 Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
 Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
@ -41,6 +37,13 @@ Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/
 Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp

+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp
+Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@ -50,8 +53,8 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
-Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
@ -70,20 +73,18 @@ Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp
 Kokkos_HIP_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp
-Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_HIP.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_HIP.cpp
+Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
-Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
 Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-Kokkos_OpenMP_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
+Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
 Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
 endif
--- a/lib/kokkos/README.md
+++ b/lib/kokkos/README.md
@ -10,270 +10,48 @@ hierarchies and multiple types of execution resources. It currently can use
 CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other
 backends in development.

-Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem,
-which also provides math kernels (https://github.com/kokkos/kokkos-kernels), as well as
-profiling and debugging tools (https://github.com/kokkos/kokkos-tools).
+**Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem.**
+
+For the complete documentation, click below:
+
+# [kokkos.github.io/kokkos-core-wiki](https://kokkos.github.io/kokkos-core-wiki)

 # Learning about Kokkos

-The best way to start learning about Kokkos is going through the Kokkos Lectures.
-They are online available at https://kokkos.link/the-lectures and contain a mix
-of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem
-capabilities.
+To start learning about Kokkos:

-A programming guide and API reference can be found on the Wiki
-(https://github.com/kokkos/kokkos/wiki).
+- [Kokkos Lectures](https://kokkos.github.io/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem capabilities.
+
+- [Programming guide](https://kokkos.github.io/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch.
+
+- [API reference](https://kokkos.github.io/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.github.io/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.github.io/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.github.io/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.github.io/kokkos-core-wiki/API/alphabetical.html).
+
+- [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability.

 For questions, find us on Slack (https://kokkosteam.slack.com) or open a GitHub issue.

-For non-public questions send an email to
-crtrott(at)sandia.gov
+For non-public questions send an email to: *crtrott(at)sandia.gov*

 # Contributing to Kokkos

-We are open and try to encourage contributions from external developers.
-To do so please first open an issue describing the contribution and then issue
-a pull request against the develop branch. For larger features it may be good
-to get guidance from the core development team first through the github issue.
+Please see [this page](https://kokkos.github.io/kokkos-core-wiki/contributing.html) for details on how to contribute.

-Note that Kokkos Core is licensed under standard 3-clause BSD terms of use.
-Which means contributing to Kokkos allows anyone else to use your contributions
-not just for public purposes but also for closed source commercial projects.
-For specifics see the LICENSE file contained in the repository or distribution.
+# Requirements, Building and Installing

-# Requirements
-
-### Minimum Compiler Versions
-
-Generally Kokkos should work with all compiler versions newer than the minimum.
-However as in all sufficiently complex enough code, we have to work around compiler
-bugs with almost all compilers. So compiler versions we don't test may have issues
-we are unaware of.
-
-* GCC: 5.3.0
-* Clang: 4.0.0
-* Intel: 17.0.1
-* NVCC: 9.2.88
-* NVC++: 21.5
-* ROCm: 4.3
-* MSVC: 19.29
-* IBM XL: 16.1.1
-* Fujitsu: 4.5.0
-* ARM/Clang 20.1
-
-### Primary Tested Compilers
-
-* GCC: 5.3.0, 6.1.0, 7.3.0, 8.3, 9.2, 10.0
-* NVCC: 9.2.88, 10.1, 11.0
-* Clang: 8.0.0, 9.0.0, 10.0.0, 12.0.0
-* Intel 17.4, 18.1, 19.5
-* MSVC: 19.29
-* ARM/Clang: 20.1
-* IBM XL: 16.1.1
-* ROCm: 4.3.0
-
-### Build system:
-
-* CMake >= 3.16: required
-* CMake >= 3.18: Fortran linkage. This does not affect most mixed Fortran/Kokkos builds. See [build issues](BUILD.md#KnownIssues).
-* CMake >= 3.21.1 for NVC++
-
-The primary tested compilers pass in release mode
-with warnings as errors. They are also tested with a comprehensive set of
-backend combinations (i.e. OpenMP, Threads, Serial, OpenMP+Serial, ...).
-We are using the following set of flags:
-* GCC:
-   ````
-      -Wall -Wunused-parameter -Wshadow -pedantic
-      -Werror -Wsign-compare -Wtype-limits
-      -Wignored-qualifiers -Wempty-body
-      -Wclobbered -Wuninitialized
-   ````
-* Intel:
-    ````
-      -Wall -Wunused-parameter -Wshadow -pedantic
-      -Werror -Wsign-compare -Wtype-limits
-      -Wuninitialized
-    ````
-* Clang:
-    ````
-      -Wall -Wunused-parameter -Wshadow -pedantic
-      -Werror -Wsign-compare -Wtype-limits
-      -Wuninitialized
-    ````
-
-* NVCC:
-  ````
-    -Wall -Wunused-parameter -Wshadow -pedantic
-    -Werror -Wsign-compare -Wtype-limits
-    -Wuninitialized
-  ````
-
-Other compilers are tested occasionally, in particular when pushing from develop to
-master branch. These are tested less rigorously without `-Werror` and only for a select set of backends.
-
-# Building and Installing Kokkos
-Kokkos provide a CMake build system and a raw Makefile build system.
-The CMake build system is strongly encouraged and will be the most rigorously supported in future releases.
-Full details are given in the [build instructions](BUILD.md). Basic setups are shown here:
-
-## CMake
-
-The best way to install Kokkos is using the CMake build system. Assuming Kokkos lives in `$srcdir`:
-````bash
-cmake $srcdir \
-  -DCMAKE_CXX_COMPILER=$path_to_compiler \
-  -DCMAKE_INSTALL_PREFIX=$path_to_install \
-  -DKokkos_ENABLE_OPENMP=On \
-  -DKokkos_ARCH_HSW=On \
-  -DKokkos_HWLOC_DIR=$path_to_hwloc
-````
-then simply type `make install`. The Kokkos CMake package will then be installed in `$path_to_install` to be used by downstream packages.
-
-To validate the Kokkos build, configure with
-````
- -DKokkos_ENABLE_TESTS=On
-````
-and run `make test` after completing the build.
-
-For your CMake project using Kokkos, code such as the following:
-
-````cmake
-find_package(Kokkos)
-...
-target_link_libraries(myTarget Kokkos::kokkos)
-````
-should be added to your CMakeLists.txt. Your configure should additionally include
-````
-DKokkos_DIR=$path_to_install/cmake/lib/Kokkos
-````
-or
-````
-DKokkos_ROOT=$path_to_install
-````
-for the install location given above.
-
-## Spack
-An alternative to manually building with the CMake is to use the Spack package manager.
-To get started, download the Spack [repo](https://github.com/spack/spack).
-
-A basic installation would be done as:
-````bash
-> spack install kokkos
-````
-Spack allows options and compilers to be tuned in the install command.
-````bash
-> spack install kokkos@3.0 %gcc@7.3.0 +openmp
-````
-This example illustrates the three most common parameters to Spack:
-* Variants: specified with, e.g. `+openmp`, this activates (or deactivates with, e.g. `~openmp`) certain options.
-* Version:  immediately following `kokkos` the `@version` can specify a particular Kokkos to build
-* Compiler: a default compiler will be chosen if not specified, but an exact compiler version can be given with the `%`option.
-
-For a complete list of Kokkos options, run:
-````bash
-> spack info kokkos
-````
-Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable".
-Generally, Spack usage should never really require you to reference the computer-generated unique install folder.
-More details are given in the [build instructions](BUILD.md). If you must know, you can locate Spack Kokkos installations with:
-````bash
-> spack find -p kokkos ...
-````
-where `...` is the unique spec identifying the particular Kokkos configuration and version.
-Some more details can be found in the Kokkos Spack [documentation](Spack.md) or on the Spack [website](https://spack.readthedocs.io/en/latest).
-
-## Raw Makefile
-
-Raw Makefiles are only supported via inline builds. See below.
-
-## Inline Builds vs. Installed Package
-For individual projects, it may be preferable to build Kokkos inline rather than link to an installed package.
-The main reason is that you may otherwise need many different
-configurations of Kokkos installed depending on the required compile time
-features an application needs. For example there is only one default
-execution space, which means you need different installations to have OpenMP
-or C++ threads as the default space. Also for the CUDA backend there are certain
-choices, such as allowing relocatable device code, which must be made at
-installation time. Building Kokkos inline uses largely the same process
-as compiling an application against an installed Kokkos library.
-
-For CMake, this means copying over the Kokkos source code into your project and adding `add_subdirectory(kokkos)` to your CMakeLists.txt.
-
-For raw Makefiles, see the example benchmarks/bytes_and_flops/Makefile which can be used with an installed library and or an inline build.
-
-# Kokkos and CUDA UVM
-
-Kokkos does support UVM as a specific memory space called CudaUVMSpace.
-Allocations made with that space are accessible from host and device.
-You can tell Kokkos to use that as the default space for Cuda allocations.
-In either case UVM comes with a number of restrictions:
-* You can't access allocations on the host while a kernel is potentially
-running. This will lead to segfaults. To avoid that you either need to
-call Kokkos::Cuda::fence() (or just Kokkos::fence()), after kernels, or
-you can set the environment variable CUDA_LAUNCH_BLOCKING=1.
-* In multi socket multi GPU machines without NVLINK, UVM defaults
-to using zero copy allocations for technical reasons related to using multiple
-GPUs from the same process. If an executable doesn't do that (e.g. each
-MPI rank of an application uses a single GPU [can be the same GPU for
-multiple MPI ranks]) you can set CUDA_MANAGED_FORCE_DEVICE_ALLOC=1.
-This will enforce proper UVM allocations, but can lead to errors if
-more than a single GPU is used by a single process.
+All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.github.io/kokkos-core-wiki/requirements.html).

+Building and installation instructions are described [here](https://kokkos.github.io/kokkos-core-wiki/building.html).

 # Citing Kokkos

-If you publish work which mentions Kokkos, please cite the following paper:
+Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citation.html).

-````BibTex
-@ARTICLE{9485033,
-  author={Trott, Christian R. and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S. and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah},
-  journal={IEEE Transactions on Parallel and Distributed Systems},
-  title={Kokkos 3: Programming Model Extensions for the Exascale Era},
-  year={2022},
-  volume={33},
-  number={4},
-  pages={805-817},
-  doi={10.1109/TPDS.2021.3097283}}
-````
-
-If you use more than one Kokkos EcoSystem package, please also cite:
-
-````BibTex
-@ARTICLE{9502936,
-  author={Trott, Christian and Berger-Vergiat, Luc and Poliakoff, David and Rajamanickam, Sivasankaran and Lebrun-Grandie, Damien and Madsen, Jonathan and Al Awar, Nader and Gligoric, Milos and Shipman, Galen and Womeldorff, Geoff},
-  journal={Computing in Science \& Engineering},
-  title={The Kokkos EcoSystem: Comprehensive Performance Portability for High Performance Computing},
-  year={2021},
-  volume={23},
-  number={5},
-  pages={10-18},
-  doi={10.1109/MCSE.2021.3098509}}
-````
-
-
-And if you feel generous: feel free to cite the original Kokkos paper which describes most of the basic Kokkos concepts:
-
-````BibTeX
-@article{CarterEdwards20143202,
-  title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ",
-  journal = "Journal of Parallel and Distributed Computing ",
-  volume = "74",
-  number = "12",
-  pages = "3202 - 3216",
-  year = "2014",
-  note = "Domain-Specific Languages and High-Level Frameworks for High-Performance Computing ",
-  issn = "0743-7315",
-  doi = "https://doi.org/10.1016/j.jpdc.2014.07.003",
-  url = "http://www.sciencedirect.com/science/article/pii/S0743731514001257",
-  author = "H. Carter Edwards and Christian R. Trott and Daniel Sunderland"
-}
-````
-
-##### [LICENSE](https://github.com/kokkos/kokkos/blob/master/LICENSE)
+# License

 [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)

 Under the terms of Contract DE-NA0003525 with NTESS,
 the U.S. Government retains certain rights in this software.

+The full license statement used in all headers is available in the [Kokkos documentation](https://kokkos.github.io/kokkos-core-wiki/license.html) or
+in the [LICENSE file on GitHub](https://github.com/kokkos/kokkos/blob/master/LICENSE).
--- a/lib/kokkos/algorithms/src/CMakeLists.txt
+++ b/lib/kokkos/algorithms/src/CMakeLists.txt
@ -11,6 +11,7 @@ FILE(GLOB ALGO_HEADERS *.hpp)
 FILE(GLOB ALGO_SOURCES *.cpp)
 LIST(APPEND ALGO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
 APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp)
+APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp)

 INSTALL (
  DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@ -44,6 +44,10 @@

 #ifndef KOKKOS_RANDOM_HPP
 #define KOKKOS_RANDOM_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_RANDOM
+#endif

 #include <Kokkos_Core.hpp>
 #include <Kokkos_Complex.hpp>
@ -648,63 +652,44 @@ struct Random_UniqueIndex {
  }
 };

-#ifdef KOKKOS_ENABLE_CUDA
-template <class MemorySpace>
-struct Random_UniqueIndex<Kokkos::Device<Kokkos::Cuda, MemorySpace>> {
-  using locks_view_type =
-      View<int**, Kokkos::Device<Kokkos::Cuda, MemorySpace>>;
-  KOKKOS_FUNCTION
-  static int get_state_idx(const locks_view_type& locks_) {
-#ifdef __CUDA_ARCH__
-    const int i_offset =
-        (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
-    int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
-                 blockDim.x * blockDim.y * blockDim.z +
-             i_offset) %
-            locks_.extent(0);
-    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
-      i += blockDim.x * blockDim.y * blockDim.z;
-      if (i >= static_cast<int>(locks_.extent(0))) {
-        i = i_offset;
-      }
-    }
-    return i;
-#else
-    (void)locks_;
-    return 0;
-#endif
-  }
-};
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#define KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP Kokkos::Cuda
+#elif defined(KOKKOS_ENABLE_HIP)
+#define KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP Kokkos::Experimental::HIP
 #endif

-#ifdef KOKKOS_ENABLE_HIP
 template <class MemorySpace>
 struct Random_UniqueIndex<
-    Kokkos::Device<Kokkos::Experimental::HIP, MemorySpace>> {
+    Kokkos::Device<KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP, MemorySpace>> {
  using locks_view_type =
-      View<int**, Kokkos::Device<Kokkos::Experimental::HIP, MemorySpace>>;
+      View<int**, Kokkos::Device<KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP,
+                                 MemorySpace>>;
  KOKKOS_FUNCTION
  static int get_state_idx(const locks_view_type& locks_) {
-#ifdef __HIP_DEVICE_COMPILE__
-    const int i_offset =
-        (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
-    int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
+    KOKKOS_IF_ON_DEVICE((
+        const int i_offset =
+            (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
+        int i =
+            (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
                 blockDim.x * blockDim.y * blockDim.z +
             i_offset) %
            locks_.extent(0);
-    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
-      i += blockDim.x * blockDim.y * blockDim.z;
-      if (i >= static_cast<int>(locks_.extent(0))) {
-        i = i_offset;
-      }
-    }
-    return i;
-#else
-    (void)locks_;
-    return 0;
-#endif
+        while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
+          i += blockDim.x * blockDim.y * blockDim.z;
+          if (i >= static_cast<int>(locks_.extent(0))) {
+            i = i_offset;
+          }
+        }
+
+        return i;))
+    KOKKOS_IF_ON_HOST(((void)locks_; return 0;))
  }
 };
+
+#undef KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP
+
 #endif

 #ifdef KOKKOS_ENABLE_SYCL
@ -1279,7 +1264,6 @@ struct fill_random_functor_begin_end;
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 0,
                                     IndexType> {
-  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;
@ -1303,7 +1287,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 0,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 1,
                                     IndexType> {
-  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;
@ -1331,7 +1314,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 1,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 2,
                                     IndexType> {
-  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;
@ -1361,7 +1343,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 2,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 3,
                                     IndexType> {
-  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;
@ -1392,7 +1373,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 3,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 4,
                                     IndexType> {
-  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;
@ -1424,7 +1404,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 4,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 5,
                                     IndexType> {
-  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;
@ -1458,7 +1437,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 5,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 6,
                                     IndexType> {
-  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;
@ -1494,7 +1472,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 6,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 7,
                                     IndexType> {
-  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;
@ -1532,7 +1509,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 7,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 8,
                                     IndexType> {
-  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;
@ -1569,34 +1545,57 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 8,
  }
 };

-template <class ViewType, class RandomPool, class IndexType = int64_t>
-void fill_random(ViewType a, RandomPool g,
+template <class ExecutionSpace, class ViewType, class RandomPool,
+          class IndexType = int64_t>
+void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g,
                 typename ViewType::const_value_type begin,
                 typename ViewType::const_value_type end) {
  int64_t LDA = a.extent(0);
  if (LDA > 0)
-    parallel_for("Kokkos::fill_random", (LDA + 127) / 128,
-                 Impl::fill_random_functor_begin_end<ViewType, RandomPool, 128,
-                                                     ViewType::Rank, IndexType>(
-                     a, g, begin, end));
+    parallel_for(
+        "Kokkos::fill_random",
+        Kokkos::RangePolicy<ExecutionSpace>(exec, 0, (LDA + 127) / 128),
+        Impl::fill_random_functor_begin_end<ViewType, RandomPool, 128,
+                                            ViewType::Rank, IndexType>(
+            a, g, begin, end));
 }

 }  // namespace Impl

+template <class ExecutionSpace, class ViewType, class RandomPool,
+          class IndexType = int64_t>
+void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g,
+                 typename ViewType::const_value_type begin,
+                 typename ViewType::const_value_type end) {
+  Impl::apply_to_view_of_static_rank(
+      [&](auto dst) { Kokkos::Impl::fill_random(exec, dst, g, begin, end); },
+      a);
+}
+
+template <class ExecutionSpace, class ViewType, class RandomPool,
+          class IndexType = int64_t>
+void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g,
+                 typename ViewType::const_value_type range) {
+  fill_random(exec, a, g, 0, range);
+}
+
 template <class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g,
                 typename ViewType::const_value_type begin,
                 typename ViewType::const_value_type end) {
-  Impl::apply_to_view_of_static_rank(
-      [&](auto dst) { Kokkos::Impl::fill_random(dst, g, begin, end); }, a);
+  fill_random(typename ViewType::execution_space{}, a, g, begin, end);
 }

 template <class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g,
                 typename ViewType::const_value_type range) {
-  fill_random(a, g, 0, range);
+  fill_random(typename ViewType::execution_space{}, a, g, 0, range);
 }

 }  // namespace Kokkos

+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_RANDOM
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_RANDOM
+#endif
 #endif
--- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
@ -44,6 +44,10 @@

 #ifndef KOKKOS_SORT_HPP_
 #define KOKKOS_SORT_HPP_
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT
+#endif

 #include <Kokkos_Core.hpp>

@ -120,13 +124,13 @@ class BinSort {
    // If a Kokkos::View then can generate constant random access
    // otherwise can only use the constant type.

-    using src_view_type = typename std::conditional<
+    using src_view_type = std::conditional_t<
        Kokkos::is_view<SrcViewType>::value,
        Kokkos::View<typename SrcViewType::const_data_type,
                     typename SrcViewType::array_layout,
                     typename SrcViewType::device_type,
                     Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
-        typename SrcViewType::const_type>::type;
+        typename SrcViewType::const_type>;

    using perm_view_type = typename PermuteViewType::const_type;

@ -151,8 +155,11 @@ class BinSort {
    }
  };

-  using execution_space = typename Space::execution_space;
-  using bin_op_type     = BinSortOp;
+  // Naming this alias "execution_space" would be problematic since it would be
+  // treated as the execution space of the various functors, which might use
+  // another execution space through sort() or create_permute_vector().
+  using exec_space  = typename Space::execution_space;
+  using bin_op_type = BinSortOp;

  struct bin_count_tag {};
  struct bin_offset_tag {};
@ -171,13 +178,13 @@ class BinSort {
  // If a Kokkos::View then can generate constant random access
  // otherwise can only use the constant type.

-  using const_rnd_key_view_type = typename std::conditional<
+  using const_rnd_key_view_type = std::conditional_t<
      Kokkos::is_view<KeyViewType>::value,
      Kokkos::View<typename KeyViewType::const_data_type,
                   typename KeyViewType::array_layout,
                   typename KeyViewType::device_type,
                   Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
-      const_key_view_type>::type;
+      const_key_view_type>;

  using non_const_key_scalar = typename KeyViewType::non_const_value_type;
  using const_key_scalar     = typename KeyViewType::const_value_type;
@ -220,6 +227,14 @@ class BinSort {
        range_begin(range_begin_),
        range_end(range_end_),
        sort_within_bins(sort_within_bins_) {
+    static_assert(
+        Kokkos::SpaceAccessibility<ExecutionSpace,
+                                   typename Space::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "BinSort was initialized with!");
+    if (bin_op.max_bins() <= 0)
+      Kokkos::abort(
+          "The number of bins in the BinSortOp object must be greater than 0!");
    bin_count_atomic = Kokkos::View<int*, Space>(
        "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins());
    bin_count_const = bin_count_atomic;
@ -235,7 +250,7 @@ class BinSort {

  BinSort(const_key_view_type keys_, int range_begin_, int range_end_,
          BinSortOp bin_op_, bool sort_within_bins_ = false)
-      : BinSort(execution_space{}, keys_, range_begin_, range_end_, bin_op_,
+      : BinSort(exec_space{}, keys_, range_begin_, range_end_, bin_op_,
                sort_within_bins_) {}

  template <typename ExecutionSpace>
@ -245,13 +260,19 @@ class BinSort {

  BinSort(const_key_view_type keys_, BinSortOp bin_op_,
          bool sort_within_bins_ = false)
-      : BinSort(execution_space{}, keys_, bin_op_, sort_within_bins_) {}
+      : BinSort(exec_space{}, keys_, bin_op_, sort_within_bins_) {}

  //----------------------------------------
  // Create the permutation vector, the bin_offset array and the bin_count
  // array. Can be called again if keys changed
-  template <class ExecutionSpace = execution_space>
-  void create_permute_vector(const ExecutionSpace& exec = execution_space{}) {
+  template <class ExecutionSpace = exec_space>
+  void create_permute_vector(const ExecutionSpace& exec = exec_space{}) {
+    static_assert(
+        Kokkos::SpaceAccessibility<ExecutionSpace,
+                                   typename Space::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "BinSort was initialized with!");
+
    const size_t len = range_end - range_begin;
    Kokkos::parallel_for(
        "Kokkos::Sort::BinCount",
@ -281,6 +302,17 @@ class BinSort {
  template <class ExecutionSpace, class ValuesViewType>
  void sort(const ExecutionSpace& exec, ValuesViewType const& values,
            int values_range_begin, int values_range_end) const {
+    static_assert(
+        Kokkos::SpaceAccessibility<ExecutionSpace,
+                                   typename Space::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "BinSort was initialized with!");
+    static_assert(
+        Kokkos::SpaceAccessibility<
+            ExecutionSpace, typename ValuesViewType::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "of the View argument!");
+
    using scratch_view_type =
        Kokkos::View<typename ValuesViewType::data_type,
                     typename ValuesViewType::array_layout,
@ -340,7 +372,7 @@ class BinSort {
  template <class ValuesViewType>
  void sort(ValuesViewType const& values, int values_range_begin,
            int values_range_end) const {
-    execution_space exec;
+    exec_space exec;
    sort(exec, values, values_range_begin, values_range_end);
    exec.fence("Kokkos::Sort: fence after sorting");
  }
@ -428,7 +460,7 @@ struct BinOp1D {

  BinOp1D() = default;

-  // Construct BinOp with number of bins, minimum value and maxuimum value
+  // Construct BinOp with number of bins, minimum value and maximum value
  BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
          typename KeyViewType::const_value_type max)
      : max_bins_(max_bins__ + 1),
@ -554,11 +586,7 @@ struct min_max_functor {

 template <class ExecutionSpace, class ViewType>
 std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
-    const ExecutionSpace& exec, ViewType const& view,
-    bool const always_use_kokkos_sort = false) {
-  if (!always_use_kokkos_sort) {
-    if (Impl::try_std_sort(view, exec)) return;
-  }
+    const ExecutionSpace& exec, ViewType const& view) {
  using CompType = BinOp1D<ViewType>;

  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
@ -596,12 +624,38 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
  bin_sort.sort(exec, view);
 }

+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+template <class ExecutionSpace, class ViewType>
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use the overload not taking bool always_use_kokkos_sort")
+std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
+    const ExecutionSpace& exec, ViewType const& view,
+    bool const always_use_kokkos_sort) {
+  if (!always_use_kokkos_sort && Impl::try_std_sort(view, exec)) {
+    return;
+  } else {
+    sort(exec, view);
+  }
+}
+#endif
+
 template <class ViewType>
-void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
+void sort(ViewType const& view) {
+  typename ViewType::execution_space exec;
+  sort(exec, view);
+  exec.fence("Kokkos::Sort: fence after sorting");
+}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+template <class ViewType>
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use the overload not taking bool always_use_kokkos_sort")
+void sort(ViewType const& view, bool const always_use_kokkos_sort) {
  typename ViewType::execution_space exec;
  sort(exec, view, always_use_kokkos_sort);
  exec.fence("Kokkos::Sort: fence after sorting");
 }
+#endif

 template <class ExecutionSpace, class ViewType>
 std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
@ -635,4 +689,8 @@ void sort(ViewType view, size_t const begin, size_t const end) {

 }  // namespace Kokkos

+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT
+#endif
 #endif
--- a/lib/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp
@ -44,59 +44,103 @@

 #ifndef KOKKOS_STD_ALGORITHMS_HPP
 #define KOKKOS_STD_ALGORITHMS_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STD_ALGORITHMS
+#endif

 /// \file Kokkos_StdAlgorithms.hpp
 /// \brief Kokkos counterparts for Standard C++ Library algorithms

-#include <std_algorithms/Kokkos_Constraints.hpp>
-#include <std_algorithms/Kokkos_RandomAccessIterator.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
+#include "std_algorithms/impl/Kokkos_Constraints.hpp"
+#include "std_algorithms/impl/Kokkos_RandomAccessIterator.hpp"
+#include "std_algorithms/Kokkos_BeginEnd.hpp"

 // distance
-#include <std_algorithms/Kokkos_Distance.hpp>
+#include "std_algorithms/Kokkos_Distance.hpp"

-// move, swap, iter_swap
-#include "std_algorithms/Kokkos_ModifyingOperations.hpp"
+// Note: the headers below are grouped by category,
+// following the classification used by the C++ standard library.

-// find, find_if, find_if_not
-// for_each, for_each_n
-// mismatch
-// equal
-// count_if, count
-// all_of, any_of, none_of
-// adjacent_find
-// lexicographical_compare
-// search, search_n
-// find_first_of, find_end
-#include <std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp>
+// modifying ops
+#include "std_algorithms/Kokkos_Swap.hpp"
+#include "std_algorithms/Kokkos_IterSwap.hpp"

-// replace, replace_copy_if, replace_copy, replace_if
-// copy, copy_n, copy_backward, copy_if
-// fill, fill_n
-// transform
-// generate, generate_n
-// reverse, reverse_copy
-// move, move_backward
-// swap_ranges
-// unique, unique_copy
-// rotate, rotate_copy
-// remove, remove_if, remove_copy, remove_copy_if
-// shift_left, shift_right
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
+// non-modifying sequence
+#include "std_algorithms/Kokkos_AdjacentFind.hpp"
+#include "std_algorithms/Kokkos_Count.hpp"
+#include "std_algorithms/Kokkos_CountIf.hpp"
+#include "std_algorithms/Kokkos_AllOf.hpp"
+#include "std_algorithms/Kokkos_AnyOf.hpp"
+#include "std_algorithms/Kokkos_NoneOf.hpp"
+#include "std_algorithms/Kokkos_Equal.hpp"
+#include "std_algorithms/Kokkos_Find.hpp"
+#include "std_algorithms/Kokkos_FindIf.hpp"
+#include "std_algorithms/Kokkos_FindIfNot.hpp"
+#include "std_algorithms/Kokkos_FindEnd.hpp"
+#include "std_algorithms/Kokkos_FindFirstOf.hpp"
+#include "std_algorithms/Kokkos_ForEach.hpp"
+#include "std_algorithms/Kokkos_ForEachN.hpp"
+#include "std_algorithms/Kokkos_LexicographicalCompare.hpp"
+#include "std_algorithms/Kokkos_Mismatch.hpp"
+#include "std_algorithms/Kokkos_Search.hpp"
+#include "std_algorithms/Kokkos_SearchN.hpp"

-// is_sorted_until, is_sorted
-#include <std_algorithms/Kokkos_SortingOperations.hpp>
+// modifying sequence
+#include "std_algorithms/Kokkos_Fill.hpp"
+#include "std_algorithms/Kokkos_FillN.hpp"
+#include "std_algorithms/Kokkos_Replace.hpp"
+#include "std_algorithms/Kokkos_ReplaceIf.hpp"
+#include "std_algorithms/Kokkos_ReplaceCopyIf.hpp"
+#include "std_algorithms/Kokkos_ReplaceCopy.hpp"
+#include "std_algorithms/Kokkos_Copy.hpp"
+#include "std_algorithms/Kokkos_CopyN.hpp"
+#include "std_algorithms/Kokkos_CopyBackward.hpp"
+#include "std_algorithms/Kokkos_CopyIf.hpp"
+#include "std_algorithms/Kokkos_Transform.hpp"
+#include "std_algorithms/Kokkos_Generate.hpp"
+#include "std_algorithms/Kokkos_GenerateN.hpp"
+#include "std_algorithms/Kokkos_Reverse.hpp"
+#include "std_algorithms/Kokkos_ReverseCopy.hpp"
+#include "std_algorithms/Kokkos_Move.hpp"
+#include "std_algorithms/Kokkos_MoveBackward.hpp"
+#include "std_algorithms/Kokkos_SwapRanges.hpp"
+#include "std_algorithms/Kokkos_Unique.hpp"
+#include "std_algorithms/Kokkos_UniqueCopy.hpp"
+#include "std_algorithms/Kokkos_Rotate.hpp"
+#include "std_algorithms/Kokkos_RotateCopy.hpp"
+#include "std_algorithms/Kokkos_Remove.hpp"
+#include "std_algorithms/Kokkos_RemoveIf.hpp"
+#include "std_algorithms/Kokkos_RemoveCopy.hpp"
+#include "std_algorithms/Kokkos_RemoveCopyIf.hpp"
+#include "std_algorithms/Kokkos_ShiftLeft.hpp"
+#include "std_algorithms/Kokkos_ShiftRight.hpp"

-// min_element, max_element, minmax_element
-#include <std_algorithms/Kokkos_MinMaxElementOperations.hpp>
+// sorting
+#include "std_algorithms/Kokkos_IsSortedUntil.hpp"
+#include "std_algorithms/Kokkos_IsSorted.hpp"

-// is_partitioned, partition_copy, partition_point
-#include <std_algorithms/Kokkos_PartitioningOperations.hpp>
+// min/max element
+#include "std_algorithms/Kokkos_MinElement.hpp"
+#include "std_algorithms/Kokkos_MaxElement.hpp"
+#include "std_algorithms/Kokkos_MinMaxElement.hpp"

-// adjacent_difference
-// reduce, transform_reduce
-// exclusive_scan, transform_exclusive_scan
-// inclusive_scan, transform_inclusive_scan
-#include <std_algorithms/Kokkos_Numeric.hpp>
+// partitioning
+#include "std_algorithms/Kokkos_IsPartitioned.hpp"
+#include "std_algorithms/Kokkos_PartitionCopy.hpp"
+#include "std_algorithms/Kokkos_PartitionPoint.hpp"

+// numeric
+#include "std_algorithms/Kokkos_AdjacentDifference.hpp"
+#include "std_algorithms/Kokkos_Reduce.hpp"
+#include "std_algorithms/Kokkos_TransformReduce.hpp"
+#include "std_algorithms/Kokkos_ExclusiveScan.hpp"
+#include "std_algorithms/Kokkos_TransformExclusiveScan.hpp"
+#include "std_algorithms/Kokkos_InclusiveScan.hpp"
+#include "std_algorithms/Kokkos_TransformInclusiveScan.hpp"
+
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STD_ALGORITHMS
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STD_ALGORITHMS
+#endif
 #endif
--- a/Show More
+++ b/Show More