Merge branch 'lammps:develop' into ml-uf3

2024-04-13 18:56:07 -04:00
parent c502dd4033 1346be4168
commit 01b1d047a2
476 changed files with 16747 additions and 14762 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -84,7 +84,7 @@ src/bond.*                @sjplimp
 src/comm*.*               @sjplimp
 src/compute.*             @sjplimp
 src/dihedral.*            @sjplimp
-src/domain.*              @sjplimp
+src/domain.*              @sjplimp @stanmoore1
 src/dump*.*               @sjplimp
 src/error.*               @sjplimp
 src/finish.*              @sjplimp
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@ -45,8 +45,8 @@ if(DOWNLOAD_KOKKOS)
  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
  include(ExternalProject)
-  set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/4.2.01.tar.gz" CACHE STRING "URL for KOKKOS tarball")
-  set(KOKKOS_MD5 "16b9b09ae947d434dfb58fc5c87c2b76" CACHE STRING "MD5 checksum of KOKKOS tarball")
+  set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/4.3.00.tar.gz" CACHE STRING "URL for KOKKOS tarball")
+  set(KOKKOS_MD5 "889dcea2b5ced3debdc5b0820044bdc4" CACHE STRING "MD5 checksum of KOKKOS tarball")
  mark_as_advanced(KOKKOS_URL)
  mark_as_advanced(KOKKOS_MD5)
  GetFallbackURL(KOKKOS_URL KOKKOS_FALLBACK)
@ -71,7 +71,7 @@ if(DOWNLOAD_KOKKOS)
  add_dependencies(LAMMPS::KOKKOSCORE kokkos_build)
  add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build)
 elseif(EXTERNAL_KOKKOS)
-  find_package(Kokkos 4.2.01 REQUIRED CONFIG)
+  find_package(Kokkos 4.3.00 REQUIRED CONFIG)
  target_link_libraries(lammps PRIVATE Kokkos::kokkos)
 else()
  set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
--- a/doc/src/Build_extras.rst
+++ b/doc/src/Build_extras.rst
@ -533,9 +533,6 @@ They must be specified in uppercase.
   *  - A64FX
      - HOST
      - ARMv8.2 with SVE Support
-   *  - WSM
-      - HOST
-      - Intel Westmere CPU (SSE 4.2)
   *  - SNB
      - HOST
      - Intel Sandy/Ivy Bridge CPU (AVX 1)
@ -566,18 +563,15 @@ They must be specified in uppercase.
   *  - KNL
      - HOST
      - Intel Knights Landing Xeon Phi
-   *  - BGQ
-      - HOST
-      - IBM Blue Gene/Q CPU
-   *  - POWER7
-      - HOST
-      - IBM POWER7 CPU
   *  - POWER8
      - HOST
      - IBM POWER8 CPU
   *  - POWER9
      - HOST
      - IBM POWER9 CPU
+   *  - RISCV_SG2042
+      - HOST
+      - SG2042 (RISC-V) CPU
   *  - KEPLER30
      - GPU
      - NVIDIA Kepler generation CC 3.0 GPU
@ -666,7 +660,7 @@ They must be specified in uppercase.
      - GPU
      - Intel GPU Ponte Vecchio

-This list was last updated for version 4.2 of the Kokkos library.
+This list was last updated for version 4.3.0 of the Kokkos library.

 .. tabs::

--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@ -245,6 +245,7 @@ OPT.
   * :doc:`oxrna2/coaxstk <pair_oxrna2>`
   * :doc:`pace (k) <pair_pace>`
   * :doc:`pace/extrapolation (k) <pair_pace>`
+   * :doc:`pedone (o) <pair_pedone>`
   * :doc:`pod <pair_pod>`
   * :doc:`peri/eps <pair_peri>`
   * :doc:`peri/lps (o) <pair_peri>`
--- a/doc/src/Developer_utils.rst
+++ b/doc/src/Developer_utils.rst
@ -635,10 +635,10 @@ Tohoku University (under MIT license)

 ----------

-.. doxygenfunction:: MathEigen::jacobi3(double const *const *mat, double *eval, double **evec)
+.. doxygenfunction:: MathEigen::jacobi3(double const *const *mat, double *eval, double **evec, int sort)
   :project: progguide

-.. doxygenfunction:: MathEigen::jacobi3(double const mat[3][3], double *eval, double evec[3][3])
+.. doxygenfunction:: MathEigen::jacobi3(double const mat[3][3], double *eval, double evec[3][3], int sort)
   :project: progguide

 ---------------------------
--- a/doc/src/Errors_details.rst
+++ b/doc/src/Errors_details.rst
@ -13,15 +13,44 @@ discussions of such cases.
 Unknown identifier in data file
 -------------------------------

-This error happens when LAMMPS encounters a line of text in an unexpected format
-while reading a data file. This is most commonly cause by inconsistent header and
-section data.  The header section informs LAMMPS how many entries or lines are expected in the
-various sections (like Atoms, Masses, Pair Coeffs, *etc.*\ ) of the data file.
-If there is a mismatch, LAMMPS will either keep reading beyond the end of a section
-or stop reading before the section has ended.
+This error happens when LAMMPS encounters a line of text with an
+unexpected keyword while :doc:`reading a data file <read_data>`.  This
+would be either header keywords or section header keywords.  This is
+most commonly due to a mistyped keyword or due to a keyword that is
+inconsistent with the :doc:`atom style <atom_style>` used.

-Such a mismatch can happen unexpectedly when the first line of the data
-is *not* a comment as required by the format.  That would result in
-LAMMPS expecting, for instance, 0 atoms because the "atoms" header line
-is treated as a comment.
+The header section informs LAMMPS how many entries or lines are expected
+in the various sections (like Atoms, Masses, Pair Coeffs, *etc.*\ ) of
+the data file.  If there is a mismatch, LAMMPS will either keep reading
+beyond the end of a section or stop reading before the section has
+ended.  In that case the next line will not contain a recognized keyword.

+Such a mismatch can also happen when the first line of the data file
+is *not* a comment as required by the format, but a line with a valid
+header keyword.  That would result in LAMMPS expecting, for instance,
+0 atoms because the "atoms" header line is the first line and thus
+treated as a comment.
+
+Another possibility to trigger this error is to have a keyword in the
+data file that corresponds to a fix (e.g. :doc:`fix cmap <fix_cmap>`)
+but the :doc:`read_data <read_data>` command is missing the (optional)
+arguments that identify the fix and the header keyword and section
+keyword or those arguments are inconsistent with the keywords in the
+data file.
+
+.. _err0002:
+
+Incorrect format in ... section of data file
+--------------------------------------------
+
+This error happens when LAMMPS reads the contents of a section of a
+:doc:`data file <read_data>` and the number of parameters in the line
+differs from what is expected.  This most commonly happens when the
+atom style is different from what is expected for a specific data file
+since changing the atom style usually changes the format of the line.
+
+This error can also happen when the number of entries indicated in the
+header of a data file (e.g. the number of atoms) is larger than the
+number of lines provided (e.g. in the corresponding Atoms section).
+LAMMPS will then continue reading into the next section, which has a
+completely different format.
--- a/doc/src/fix_ave_correlate.rst
+++ b/doc/src/fix_ave_correlate.rst
@ -65,7 +65,6 @@ Examples
   fix 1 all ave/correlate 1 50 10000 &
             c_thermo_press[1] c_thermo_press[2] c_thermo_press[3] &
             type upper ave running title1 "My correlation data"
-
   fix 1 all ave/correlate 1 50 10000 c_thermo_press[*]

 Description
--- a/doc/src/fix_ave_correlate_long.rst
+++ b/doc/src/fix_ave_correlate_long.rst
@ -20,11 +20,11 @@ Syntax
  .. parsed-literal::

       c_ID = global scalar calculated by a compute with ID
-       c_ID[I] = Ith component of global vector calculated by a compute with ID
+       c_ID[I] = Ith component of global vector calculated by a compute with ID, I can include wildcard (see below)
       f_ID = global scalar calculated by a fix with ID
-       f_ID[I] = Ith component of global vector calculated by a fix with ID
+       f_ID[I] = Ith component of global vector calculated by a fix with ID, I can include wildcard (see below)
       v_name = global value calculated by an equal-style variable with name
-       v_name[I] = Ith component of global vector calculated by a vector-style variable with name
+       v_name[I] = Ith component of a vector-style variable with name, I can include wildcard (see below)

 * zero or more keyword/arg pairs may be appended
 * keyword = *type* or *start* or *file* or *overwrite* or *title1* or *title2* or *ncorr* or *nlen* or *ncount*
@ -63,6 +63,7 @@ Examples
   fix 1 all ave/correlate/long 1 10000 &
             c_thermo_press[1] c_thermo_press[2] c_thermo_press[3] &
             type upper title1 "My correlation data" nlen 15 ncount 3
+   fix 1 all ave/correlate/long 1 10000 c_thermo_press[*]

 Description
 """""""""""
@ -80,8 +81,10 @@ specified values may represent calculations performed by computes and
 fixes which store their own "group" definitions.

 Each listed value can be the result of a compute or fix or the
-evaluation of an equal-style variable. See the
-:doc:`fix ave/correlate <fix_ave_correlate>` page for details.
+evaluation of an equal-style or vector-style variable.  For
+vector-style variables, the specified indices can include a wildcard
+character.  See the :doc:`fix ave/correlate <fix_ave_correlate>` page
+for details.

 The *Nevery* and *Nfreq* arguments specify on what time steps the input
 values will be used to calculate correlation data and the frequency
--- a/doc/src/fix_ttm.rst
+++ b/doc/src/fix_ttm.rst
@ -136,23 +136,23 @@ transfer between the subsystems:
  \bigtriangledown (\kappa_e \bigtriangledown T_e) -
  g_p (T_e - T_a) + g_s T_a'

-where C_e is the specific heat, rho_e is the density, kappa_e is the
-thermal conductivity, T is temperature, the "e" and "a" subscripts
-represent electronic and atomic subsystems respectively, g_p is the
-coupling constant for the electron-ion interaction, and g_s is the
-electron stopping coupling parameter.  C_e, rho_e, and kappa_e are
-specified as parameters to the fix.  The other quantities are derived.
-The form of the heat diffusion equation used here is almost the same
-as that in equation 6 of :ref:`(Duffy) <Duffy>`, with the exception that the
-electronic density is explicitly represented, rather than being part
-of the specific heat parameter.
+where :math:`C_e` is the specific heat, :math:`\rho_e` is the density,
+:math:`\kappa_e` is the thermal conductivity, *T* is temperature, the
+"e" and "a" subscripts represent electronic and atomic subsystems
+respectively, :math:`g_p` is the coupling constant for the electron-ion
+interaction, and :math:`g_s` is the electron stopping coupling
+parameter.  :math:`C_e`, :math:`\rho_e`, and :math:`\kappa_e` are
+specified as parameters to the fix *ttm* or *ttm/grid*.  The other
+quantities are derived.  The form of the heat diffusion equation used
+here is almost the same as that in equation 6 of :ref:`(Duffy) <Duffy>`,
+with the exception that the electronic density is explicitly
+represented, rather than being part of the specific heat parameter.

 Currently, the TTM fixes assume that none of the user-supplied
-parameters will vary with temperature. Note that :ref:`(Duffy)
-<Duffy>` used a tanh() functional form for the temperature dependence
-of the electronic specific heat, but ignored temperature dependencies
-of any of the other parameters.  See more discussion below for fix
-ttm/mod.
+parameters will vary with temperature. Note that :ref:`(Duffy) <Duffy>`
+used a tanh() functional form for the temperature dependence of the
+electronic specific heat, but ignored temperature dependencies of any of
+the other parameters.  See more discussion below for fix *ttm/mod*.

 .. note::

@ -265,27 +265,27 @@ heat sources (e.g. laser heating in ablation simulations):
  \bigtriangledown (\kappa_e \bigtriangledown T_e) -
  g_p (T_e - T_a) + g_s T_a' + \theta (x-x_{surface})I_0 \exp(-x/l_{skin})

-where theta is the Heaviside step function, I_0 is the (absorbed)
-laser pulse intensity for ablation simulations, l_skin is the depth
-of skin-layer, and all other designations have the same meaning as in
-the former equation. The duration of the pulse is set by the parameter
-*tau* in the *init_file*.
+where :math:`\theta` is the Heaviside step function, :math:`I_0` is the
+(absorbed) laser pulse intensity for ablation simulations,
+:math:`l_{skin}` is the depth of the skin-layer, and all other
+designations have the same meaning as in the former equation. The
+duration of the pulse is set by the parameter *tau* in the *init_file*.

-Fix ttm/mod also allows users to specify the dependencies of C_e and
-kappa_e on the electronic temperature. The specific heat is expressed
-as
+Fix *ttm/mod* also allows users to specify the dependencies of
+:math:`C_e` and :math:`\kappa_e` on the electronic temperature. The
+specific heat is expressed as

 .. math::

  C_e = C_0 + (a_0 + a_1 X + a_2 X^2 + a_3 X^3 + a_4 X^4) \exp (-(AX)^2)

-where *X* = T_e/1000, and the thermal conductivity is defined as
-kappa_e = D_e\*rho_e\*C_e, where D_e is the thermal diffusion
-coefficient.
+where :math:`X = \frac{T_e}{1000}`, and the thermal conductivity is
+defined as :math:`\kappa_e = D_e \cdot \rho_e \cdot C_e`, where
+:math:`D_e` is the thermal diffusion coefficient.

-Electronic pressure effects are included in the TTM model to account
-for the blast force acting on ions because of electronic pressure
-gradient (see :ref:`(Chen) <Chen>`, :ref:`(Norman) <Norman>`).  The total force
+Electronic pressure effects are included in the TTM model to account for
+the blast force acting on ions because of electronic pressure gradient
+(see :ref:`(Chen) <Chen>`, :ref:`(Norman) <Norman>`).  The total force
 acting on an ion is:

 .. math::
@ -293,13 +293,14 @@ acting on an ion is:
  {\vec F}_i = - \partial U / \partial {\vec r}_i + {\vec
  F}_{langevin} - \nabla P_e/n_{ion}

-where F_langevin is a force from Langevin thermostat simulating
-electron-phonon coupling, and nabla P_e/n_ion is the electron blast
-force.
+where :math:`F_{langevin}` is a force from Langevin thermostat
+simulating electron-phonon coupling, and :math:`\nabla P_e/n_{ion}` is
+the electron blast force.

-The electronic pressure is taken to be P_e = B\*rho_e\*C_e\*T_e
+The electronic pressure is taken to be :math:`P_e = B \cdot \rho_e \cdot
+C_e \cdot T_e`.

-The current fix ttm/mod implementation allows TTM simulations with a
+The current fix *ttm/mod* implementation allows TTM simulations with a
 vacuum. The vacuum region is defined as the grid cells with zero
 electronic temperature. The numerical scheme does not allow energy
 exchange with such cells. Since the material can expand to previously
@ -319,10 +320,10 @@ electronic pressure gradient is calculated as
  \frac{x}{x+\lambda}\frac{(C_e{}T_e)_{x+\Delta
  x}-(C_e{}T_e)_{x}}{\Delta x} \right]

-where lambda is the electron mean free path (see :ref:`(Norman) <Norman>`,
-:ref:`(Pisarev) <Pisarev>`)
+where :math:`\lambda` is the electron mean free path (see :ref:`(Norman)
+<Norman>`, :ref:`(Pisarev) <Pisarev>`).

-The fix ttm/mod parameter file *init_file* has the following syntax.
+The fix *ttm/mod* parameter file *init_file* has the following syntax.
 Every line with an odd number is considered as a comment and
 ignored. The lines with the even numbers are treated as follows:

--- a/doc/src/pair_pedone.rst
+++ b/doc/src/pair_pedone.rst
@ -0,0 +1,137 @@
+.. index:: pair_style pedone
+.. index:: pair_style pedone/omp
+
+pair_style pedone command
+=========================
+
+Accelerator Variants: *pedone/omp*
+
+
+Syntax
+""""""
+
+.. code-block:: LAMMPS
+
+   pair_style style args
+
+* style = *pedone*
+* args = list of arguments for a particular style
+
+.. parsed-literal::
+
+    *pedone* args = cutoff
+      cutoff = global cutoff for Pedone interactions (distance units)
+
+Examples
+""""""""
+
+.. code-block:: LAMMPS
+
+   pair_style hybrid/overlay pedone 15.0 coul/long 15.0
+   kspace_style pppm 1.0e-5

+   pair_coeff * * coul/long
+   pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0
+   pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0
+
+Used in input scripts:
+
+   .. parsed-literal::
+
+      examples/PACKAGES/pedone/in.pedone.relax
+      examples/PACKAGES/pedone/in.pedone.melt
+
+
+
+Description
+"""""""""""
+
+.. versionadded:: TBD
+
+Pair style *pedone* computes the **non-Coulomb** interactions of the Pedone
+(or PMMCS) potential :ref:`(Pedone) <Pedone>` which combines Coulomb
+interactions, Morse potential, and repulsive :math:`r^{-12}`
+Lennard-Jones terms (see below).  The *pedone* pair style is meant
+to be used in addition to a :doc:`Coulomb pair style <pair_coul>` via
+pair style :doc:`hybrid/overlay <pair_hybrid>` (see example above).
+Using *coul/long* or *coul/dsf* (for solids) is recommended.
+
+The full Pedone potential function from :ref:`(Pedone) <Pedone>` for each
+pair of atoms is:
+
+.. math::
+
+   E =  \frac{C q_i q_j}{\epsilon  r}
+       + D_0 \left[ e^{- 2 \alpha (r - r_0)} - 2 e^{- \alpha (r - r_0)} \right]
+       + \frac{C_0}{r^{12}} \qquad r < r_c
+
+:math:`r_c` is the cutoff and :math:`C` is a conversion factor that is
+specific to the choice of :doc:`units <units>` so that the entire
+Coulomb term is in energy units with :math:`q_i` and :math:`q_j` as the
+assigned charges in multiples of the elementary charge.
+
+The following coefficients must be defined for the selected pairs of
+atom types via the :doc:`pair_coeff <pair_coeff>` command as in the
+example above:
+
+* :math:`D_0` (energy units)
+* :math:`\alpha` (1/distance units)
+* :math:`r_0` (distance units)
+* :math:`C_0` (energy units)
+* cutoff (distance units)
+
+The last coefficient is optional.  If not specified, the global *pedone*
+cutoff is used.
+
+----------
+
+.. include:: accel_styles.rst
+
+----------
+
+Mixing, shift, table, tail correction, restart, rRESPA info
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+This pair style does not support mixing.
+
+This pair style supports the :doc:`pair_modify <pair_modify>` shift
+option for the energy of the pair interaction.
+
+This pair style does not support the :doc:`pair_modify <pair_modify>`
+tail option for adding long-range tail corrections to energy and
+pressure.
+
+This pair style writes its information to :doc:`binary restart files <restart>`,
+so pair_style and pair_coeff commands do not need to be specified in an input
+script that reads a restart file.
+
+This pair style can only be used via the *pair* keyword of the
+:doc:`run_style respa <run_style>` command.  It does not support the
+*inner*, *middle*, or *outer* keywords.
+
+----------
+
+Restrictions
+""""""""""""
+
+The *pedone* pair style is only enabled if LAMMPS was built with the
+EXTRA-PAIR package.  See the :doc:`Build package <Build_package>` page
+for more info.
+
+Related commands
+""""""""""""""""
+
+:doc:`pair_coeff <pair_coeff>`, :doc:`pair_style <pair_style>`,
+:doc:`pair style coul/long and coul/dsf <pair_coul>`,
+:doc:`pair style morse <pair_morse>`
+
+Default
+"""""""
+
+none
+
+-------------
+
+.. _Pedone:
+
+**(Pedone)** A. Pedone, G. Malavasi, M. C. Menziani, A. N. Cormack, and U. Segre, J. Phys. Chem. B, 110, 11780 (2006)
--- a/doc/src/pair_style.rst
+++ b/doc/src/pair_style.rst
@ -275,30 +275,30 @@ accelerated styles exist.
 * :doc:`lj/smooth/linear <pair_lj_smooth_linear>` - linear smoothed LJ potential
 * :doc:`lj/switch3/coulgauss/long <pair_lj_switch3_coulgauss_long>` - smoothed LJ vdW potential with Gaussian electrostatics
 * :doc:`lj96/cut <pair_lj96>` - Lennard-Jones 9/6 potential
-* :doc:`local/density <pair_local_density>` - generalized basic local density potential
-* :doc:`lubricate <pair_lubricate>` - hydrodynamic lubrication forces
-* :doc:`lubricate/poly <pair_lubricate>` - hydrodynamic lubrication forces with polydispersity
-* :doc:`lubricateU <pair_lubricateU>` - hydrodynamic lubrication forces for Fast Lubrication Dynamics
-* :doc:`lubricateU/poly <pair_lubricateU>` - hydrodynamic lubrication forces for Fast Lubrication with polydispersity
+* :doc:`local/density <pair_local_density>` - Generalized basic local density potential
+* :doc:`lubricate <pair_lubricate>` - Hydrodynamic lubrication forces
+* :doc:`lubricate/poly <pair_lubricate>` - Hydrodynamic lubrication forces with polydispersity
+* :doc:`lubricateU <pair_lubricateU>` - Hydrodynamic lubrication forces for Fast Lubrication Dynamics
+* :doc:`lubricateU/poly <pair_lubricateU>` - Hydrodynamic lubrication forces for Fast Lubrication with polydispersity
 * :doc:`mdpd <pair_mesodpd>` - mDPD particle interactions
 * :doc:`mdpd/rhosum <pair_mesodpd>` - mDPD particle interactions for mass density
-* :doc:`meam <pair_meam>` - modified embedded atom method (MEAM)
-* :doc:`meam/ms <pair_meam>` - multi-state modified embedded atom method (MS-MEAM)
-* :doc:`meam/spline <pair_meam_spline>` - splined version of MEAM
-* :doc:`meam/sw/spline <pair_meam_sw_spline>` - splined version of MEAM with a Stillinger-Weber term
-* :doc:`mesocnt <pair_mesocnt>` - mesoscopic vdW potential for (carbon) nanotubes
-* :doc:`mesocnt/viscous <pair_mesocnt>` - mesoscopic vdW potential for (carbon) nanotubes with friction
-* :doc:`mgpt <pair_mgpt>` - simplified model generalized pseudopotential theory (MGPT) potential
+* :doc:`meam <pair_meam>` - Modified embedded atom method (MEAM)
+* :doc:`meam/ms <pair_meam>` - Multi-state modified embedded atom method (MS-MEAM)
+* :doc:`meam/spline <pair_meam_spline>` - Splined version of MEAM
+* :doc:`meam/sw/spline <pair_meam_sw_spline>` - Splined version of MEAM with a Stillinger-Weber term
+* :doc:`mesocnt <pair_mesocnt>` - Mesoscopic vdW potential for (carbon) nanotubes
+* :doc:`mesocnt/viscous <pair_mesocnt>` - Mesoscopic vdW potential for (carbon) nanotubes with friction
+* :doc:`mgpt <pair_mgpt>` - Simplified model generalized pseudopotential theory (MGPT) potential
 * :doc:`mie/cut <pair_mie>` - Mie potential
-* :doc:`mm3/switch3/coulgauss/long <pair_lj_switch3_coulgauss_long>` - smoothed MM3 vdW potential with Gaussian electrostatics
+* :doc:`mm3/switch3/coulgauss/long <pair_lj_switch3_coulgauss_long>` - Smoothed MM3 vdW potential with Gaussian electrostatics
 * :doc:`momb <pair_momb>` - Many-Body Metal-Organic (MOMB) force field
 * :doc:`morse <pair_morse>` - Morse potential
-* :doc:`morse/smooth/linear <pair_morse>` - linear smoothed Morse potential
+* :doc:`morse/smooth/linear <pair_morse>` - Linear smoothed Morse potential
 * :doc:`morse/soft <pair_morse>` - Morse potential with a soft core
 * :doc:`multi/lucy <pair_multi_lucy>` - DPD potential with density-dependent force
 * :doc:`multi/lucy/rx <pair_multi_lucy_rx>` - reactive DPD potential with density-dependent force
-* :doc:`nb3b/harmonic <pair_nb3b>` - non-bonded 3-body harmonic potential
-* :doc:`nb3b/screened <pair_nb3b>` - non-bonded 3-body screened harmonic potential
+* :doc:`nb3b/harmonic <pair_nb3b>` - Non-bonded 3-body harmonic potential
+* :doc:`nb3b/screened <pair_nb3b>` - Non-bonded 3-body screened harmonic potential
 * :doc:`nm/cut <pair_nm>` - N-M potential
 * :doc:`nm/cut/coul/cut <pair_nm>` - N-M potential with cutoff Coulomb
 * :doc:`nm/cut/coul/long <pair_nm>` - N-M potential with long-range Coulomb
@ -322,21 +322,22 @@ accelerated styles exist.
 * :doc:`oxrna2/xstk <pair_oxrna2>` -
 * :doc:`pace <pair_pace>` - Atomic Cluster Expansion (ACE) machine-learning potential
 * :doc:`pace/extrapolation <pair_pace>` - Atomic Cluster Expansion (ACE) machine-learning potential with extrapolation grades
+* :doc:`pedone <pair_pedone>` - Pedone (PMMCS) potential (non-Coulomb part)
 * :doc:`pod <pair_pod>` - Proper orthogonal decomposition (POD) machine-learning potential
-* :doc:`peri/eps <pair_peri>` - peridynamic EPS potential
-* :doc:`peri/lps <pair_peri>` - peridynamic LPS potential
-* :doc:`peri/pmb <pair_peri>` - peridynamic PMB potential
-* :doc:`peri/ves <pair_peri>` - peridynamic VES potential
-* :doc:`polymorphic <pair_polymorphic>` - polymorphic 3-body potential
+* :doc:`peri/eps <pair_peri>` - Peridynamic EPS potential
+* :doc:`peri/lps <pair_peri>` - Peridynamic LPS potential
+* :doc:`peri/pmb <pair_peri>` - Peridynamic PMB potential
+* :doc:`peri/ves <pair_peri>` - Peridynamic VES potential
+* :doc:`polymorphic <pair_polymorphic>` - Polymorphic 3-body potential
 * :doc:`python <pair_python>` -
 * :doc:`quip <pair_quip>` -
 * :doc:`rann <pair_rann>` -
 * :doc:`reaxff <pair_reaxff>` - ReaxFF potential
-* :doc:`rebo <pair_airebo>` - second generation REBO potential of Brenner
+* :doc:`rebo <pair_airebo>` - Second generation REBO potential of Brenner
 * :doc:`rebomos <pair_rebomos>` - REBOMoS potential for MoS2
 * :doc:`resquared <pair_resquared>` - Everaers RE-Squared ellipsoidal potential
-* :doc:`saip/metal <pair_saip_metal>` - interlayer potential for hetero-junctions formed with hexagonal 2D materials and metal surfaces
-* :doc:`sdpd/taitwater/isothermal <pair_sdpd_taitwater_isothermal>` - smoothed dissipative particle dynamics for water at isothermal conditions
+* :doc:`saip/metal <pair_saip_metal>` - Interlayer potential for hetero-junctions formed with hexagonal 2D materials and metal surfaces
+* :doc:`sdpd/taitwater/isothermal <pair_sdpd_taitwater_isothermal>` - Smoothed dissipative particle dynamics for water at isothermal conditions
 * :doc:`smatb <pair_smatb>` - Second Moment Approximation to the Tight Binding
 * :doc:`smatb/single <pair_smatb>` - Second Moment Approximation to the Tight Binding for single-element systems
 * :doc:`smd/hertz <pair_smd_hertz>` -
--- a/doc/src/variable.rst
+++ b/doc/src/variable.rst
@ -279,9 +279,9 @@ This means the variable can then be evaluated as many times as desired
 and will return those values.  There are two ways to cause the next
 set of per-atom values from the file to be read: use the
 :doc:`next <next>` command or the next() function in an atom-style
-variable, as discussed below.  Unlike most variable styles
-atomfile-style variables are **deleted** during a :doc:`clear <clear>`
-command.
+variable, as discussed below.  Unlike most variable styles, which
+remain defined, atomfile-style variables are **deleted** during a
+:doc:`clear <clear>` command.

 The rules for formatting the file are as follows.  Each time a set of
 per-atom values is read, a non-blank line is searched for in the file.
@ -289,23 +289,37 @@ The file is read line by line but only up to 254 characters are used.
 The rest are ignored.  A comment character "#" can be used anywhere
 on a line and all text following and the "#" character are ignored;
 text starting with the comment character is stripped.  Blank lines
-are skipped.  The first "word" of a non-blank line, delimited by
-white-space, is read as the count N of per-atom lines to immediately
-follow.  N can be the total number of atoms in the system, or only a
-subset.  The next N lines have the following format
-
-.. parsed-literal::
-
-   ID value
-
-where ID is an atom ID and value is the per-atom numeric value that
-will be assigned to that atom.  IDs can be listed in any order.
+are skipped.  The first non-blank line is expected to contain a single
+integer number as the count *N* of per-atom lines to follow.  *N* can
+be the total number of atoms in the system or less, indicating that data
+for a subset is read.  The next *N* lines must consist of two numbers,
+the atom-ID of the atom for which a value is set followed by a floating
+point number with the value.  The atom-IDs may be listed in any order.

 .. note::

-   Every time a set of per-atom lines is read, the value for all
-   atoms is first set to 0.0.  Thus values for atoms whose ID does not
-   appear in the set, will remain 0.0.
+   Every time a set of per-atom lines is read, the value of the atomfile
+   variable for **all** atoms is first initialized to 0.0.  Thus values
+   for atoms whose ID does not appear in the set in the file will remain
+   at 0.0.
+
+Below is a small example for the atomfile variable file format:
+
+.. parsed-literal::
+
+   # first set
+   4
+   # atom-ID value
+   3 1
+   4 -4
+   1 0.5
+   2 -0.5
+
+   # second set
+   2
+
+   2  1.0
+   4 -1.0

 ----------

@ -1174,12 +1188,17 @@ custom atom properties are the same; just replace the leading "i" with

 +--------+---------------+------------------------------------------+
 | equal  | i_name[I]     | element of per-atom vector (I = atom ID) |
+--------+---------------+------------------------------------------+
 | equal  | i2_name[I][J] | element of per-atom array (I = atom ID)  |
 +--------+---------------+------------------------------------------+
+--------+---------------+------------------------------------------+
 | vector | i_name[I]     | element of per-atom vector (I = atom ID) |
+--------+---------------+------------------------------------------+
 | vector | i2_name[I][J] | element of per-atom array (I = atom ID)  |
 +--------+---------------+------------------------------------------+
+--------+---------------+------------------------------------------+
 | atom   | i_name        | per-atom vector                          |
+--------+---------------+------------------------------------------+
 | atom   | i2_name[I]    | column of per-atom array                 |
 +--------+---------------+------------------------------------------+

@ -1222,15 +1241,23 @@ table:

 +--------+------------+------------------------------------------+
 | equal  | c_ID       | global scalar                            |
+--------+------------+------------------------------------------+
 | equal  | c_ID[I]    | element of global vector                 |
+--------+------------+------------------------------------------+
 | equal  | c_ID[I][J] | element of global array                  |
+--------+------------+------------------------------------------+
 | equal  | C_ID[I]    | element of per-atom vector (I = atom ID) |
+--------+------------+------------------------------------------+
 | equal  | C_ID[I][J] | element of per-atom array (I = atom ID)  |
 +--------+------------+------------------------------------------+
+--------+------------+------------------------------------------+
 | vector | c_ID       | global vector                            |
+--------+------------+------------------------------------------+
 | vector | c_ID[I]    | column of global array                   |
 +--------+------------+------------------------------------------+
+--------+------------+------------------------------------------+
 | atom   | c_ID       | per-atom vector                          |
+--------+------------+------------------------------------------+
 | atom   | c_ID[I]    | column of per-atom array                 |
 +--------+------------+------------------------------------------+

@ -1286,15 +1313,23 @@ and atom-style variables are listed in the following table:

 +--------+------------+------------------------------------------+
 | equal  | f_ID       | global scalar                            |
+--------+------------+------------------------------------------+
 | equal  | f_ID[I]    | element of global vector                 |
+--------+------------+------------------------------------------+
 | equal  | f_ID[I][J] | element of global array                  |
+--------+------------+------------------------------------------+
 | equal  | F_ID[I]    | element of per-atom vector (I = atom ID) |
+--------+------------+------------------------------------------+
 | equal  | F_ID[I][J] | element of per-atom array (I = atom ID)  |
 +--------+------------+------------------------------------------+
+--------+------------+------------------------------------------+
 | vector | f_ID       | global vector                            |
+--------+------------+------------------------------------------+
 | vector | f_ID[I]    | column of global array                   |
 +--------+------------+------------------------------------------+
+--------+------------+------------------------------------------+
 | atom   | f_ID       | per-atom vector                          |
+--------+------------+------------------------------------------+
 | atom   | f_ID[I]    | column of per-atom array                 |
 +--------+------------+------------------------------------------+

@ -1365,17 +1400,27 @@ per-atom vector.

 +--------+-----------+-----------------------------------------------------------------------------------+
 | equal  | v_name    | global scalar from an equal-style variable                                        |
+--------+-----------+-----------------------------------------------------------------------------------+
 | equal  | v_name[I] | element of global vector from a vector-style variable                             |
+--------+-----------+-----------------------------------------------------------------------------------+
 | equal  | v_name[I] | element of per-atom vector (I = atom ID) from an atom- or atomfile-style variable |
 +--------+-----------+-----------------------------------------------------------------------------------+
+--------+-----------+-----------------------------------------------------------------------------------+
 | vector | v_name    | global scalar from an equal-style variable                                        |
+--------+-----------+-----------------------------------------------------------------------------------+
 | vector | v_name    | global vector from a vector-style variable                                        |
+--------+-----------+-----------------------------------------------------------------------------------+
 | vector | v_name[I] | element of global vector from a vector-style variable                             |
+--------+-----------+-----------------------------------------------------------------------------------+
 | vector | v_name[I] | element of per-atom vector (I = atom ID) from an atom- or atomfile-style variable |
 +--------+-----------+-----------------------------------------------------------------------------------+
+--------+-----------+-----------------------------------------------------------------------------------+
 | atom   | v_name    | global scalar from an equal-style variable                                        |
+--------+-----------+-----------------------------------------------------------------------------------+
 | atom   | v_name    | per-atom vector from an atom-style or atomfile-style variable                     |
+--------+-----------+-----------------------------------------------------------------------------------+
 | atom   | v_name[I] | element of global vector from a vector-style variable                             |
+--------+-----------+-----------------------------------------------------------------------------------+
 | atom   | v_name[I] | element of per-atom vector (I = atom ID) from an atom- or atomfile-style variable |
 +--------+-----------+-----------------------------------------------------------------------------------+

--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@ -2043,6 +2043,7 @@ Makefiles
 makelist
 makepkg
 Makse
+Malavasi
 malloc
 Malolepsza
 Manby
@ -2152,6 +2153,7 @@ membered
 memcheck
 Mendelev
 Menon
+Menziani
 mer
 Meremianin
 Mersenne
@ -2775,6 +2777,8 @@ Peachey
 peachpuff
 Pearlman
 Pedersen
+pedone
+Pedone
 peID
 PEigenDense
 Peng
--- a/examples/PACKAGES/pedone/in.pedone.melt
+++ b/examples/PACKAGES/pedone/in.pedone.melt
@ -0,0 +1,38 @@
+# Ca-O melt with Pedone potential
+
+units metal
+atom_style charge
+
+lattice fcc 4.8105  # experimental lattice parameter for fcc-lattice Ca cations
+
+region box block 0 4 0 4 0 4
+create_box 2 box
+create_atoms 1 box
+
+lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell
+create_atoms 2 box
+
+mass 1 40.078
+mass 2 15.999
+
+set type 1 charge 1.2
+set type 2 charge -1.2
+
+timestep 0.002
+neigh_modify delay 5 every 1 check yes
+
+pair_style hybrid/overlay pedone 15.0 coul/long 15.0
+kspace_style pppm 1.0e-6
+
+pair_coeff * * coul/long
+pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0
+pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0
+
+velocity all create 6000.0 98347
+
+fix 1 all nvt temp 3000.0 3000.0 0.1
+
+# dump 1 all atom 500 Ca-O-melt.lammpstrj
+
+thermo 100
+run 1000
--- a/examples/PACKAGES/pedone/in.pedone.relax
+++ b/examples/PACKAGES/pedone/in.pedone.relax
@ -0,0 +1,38 @@
+# Ca-O crystal with Pedone potential
+
+units metal
+atom_style charge
+
+lattice fcc 4.8105  # experimental lattice parameter for fcc-lattice Ca cations
+
+region box block 0 4 0 4 0 4
+create_box 2 box
+create_atoms 1 box
+
+lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell
+create_atoms 2 box
+
+mass 1 40.078
+mass 2 15.999
+
+displace_atoms all random 0.01 0.01 0.01 9084544
+set type 1 charge 1.2
+set type 2 charge -1.2
+
+timestep 0.002
+neigh_modify delay 5 every 1 check yes
+
+pair_style hybrid/overlay pedone 15.0 coul/long 15.0
+kspace_style pppm 1.0e-6
+
+pair_coeff * * coul/long
+pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0
+pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0
+
+variable len equal lx*0.25
+thermo_style custom step v_len lx pe press
+thermo 100
+fix 1 all box/relax iso 0.0
+minimize 0.0 0.0 1000 10000
+
+print "Expected lattice parameter: 4.7748,  computed: $(v_len:%6.4f)"
--- a/examples/PACKAGES/pedone/log.9Apr24.pedone.melt.g++.1
+++ b/examples/PACKAGES/pedone/log.9Apr24.pedone.melt.g++.1
@ -0,0 +1,122 @@
+LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-373-g7ac84e18dd)
+  using 1 OpenMP thread(s) per MPI task
+# Ca-O melt with Pedone potential
+
+units metal
+atom_style charge
+
+lattice fcc 4.8105  # experimental lattice parameter for fcc-lattice Ca cations
+Lattice spacing in x,y,z = 4.8105 4.8105 4.8105
+
+region box block 0 4 0 4 0 4
+create_box 2 box
+Created orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  1 by 1 by 1 MPI processor grid
+create_atoms 1 box
+Created 256 atoms
+  using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  create_atoms CPU = 0.000 seconds
+
+lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell
+Lattice spacing in x,y,z = 4.8105 4.8105 4.8105
+create_atoms 2 box
+Created 256 atoms
+  using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  create_atoms CPU = 0.000 seconds
+
+mass 1 40.078
+mass 2 15.999
+
+set type 1 charge 1.2
+Setting atom values ...
+  256 settings made for charge
+set type 2 charge -1.2
+Setting atom values ...
+  256 settings made for charge
+
+timestep 0.002
+neigh_modify delay 5 every 1 check yes
+
+pair_style hybrid/overlay pedone 15.0 coul/long 15.0
+kspace_style pppm 1.0e-6
+
+pair_coeff * * coul/long
+pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0
+pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0
+
+velocity all create 6000.0 98347
+
+fix 1 all nvt temp 3000.0 3000.0 0.1
+
+# dump 1 all atom 500 Ca-O-melt.lammpstrj
+
+thermo 100
+run 1000
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:342)
+  G vector (1/distance) = 0.23676226
+  grid = 24 24 24
+  stencil order = 5
+  estimated absolute RMS force accuracy = 1.3089053e-05
+  estimated relative force accuracy = 9.089844e-07
+  using double precision FFTW3
+  3d grid and FFT values/proc = 29791 13824
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 5 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 17
+  ghost atom cutoff = 17
+  binsize = 8.5, bins = 3 3 3
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair pedone, perpetual, skip from (2)
+      attributes: half, newton on
+      pair build: skip
+      stencil: none
+      bin: none
+  (2) pair coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 9.239 | 9.239 | 9.239 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+         0   6000          -3771.5568      0             -3375.2452      34213.185    
+       100   2894.1756     -3562.491       0             -3371.3251      114640.32    
+       200   2980.3531     -3570.2657      0             -3373.4076      123673.56    
+       300   2783.0437     -3574.5809      0             -3390.7554      119791.27    
+       400   3021.6581     -3568.2149      0             -3368.6285      116032.29    
+       500   3112.0438     -3580.0178      0             -3374.4613      114798.18    
+       600   2973.4609     -3577.0582      0             -3380.6553      111843.46    
+       700   3180.1687     -3568.4542      0             -3358.3979      121008.83    
+       800   2923.7803     -3573.3023      0             -3380.181       111459.55    
+       900   2940.3133     -3572.1322      0             -3377.9188      118177.36    
+      1000   3070.2584     -3575.5655      0             -3372.769       114175.52    
+Loop time of 13.683 on 1 procs for 1000 steps with 512 atoms
+
+Performance: 12.629 ns/day, 1.900 hours/ns, 73.084 timesteps/s, 37.419 katom-step/s
+99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 11.545     | 11.545     | 11.545     |   0.0 | 84.37
+Kspace  | 1.4121     | 1.4121     | 1.4121     |   0.0 | 10.32
+Neigh   | 0.65265    | 0.65265    | 0.65265    |   0.0 |  4.77
+Comm    | 0.056036   | 0.056036   | 0.056036   |   0.0 |  0.41
+Output  | 0.00022945 | 0.00022945 | 0.00022945 |   0.0 |  0.00
+Modify  | 0.0090252  | 0.0090252  | 0.0090252  |   0.0 |  0.07
+Other   |            | 0.00801    |            |       |  0.06
+
+Nlocal:            512 ave         512 max         512 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:          10901 ave       10901 max       10901 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:         374419 ave      374419 max      374419 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 374419
+Ave neighs/atom = 731.28711
+Neighbor list builds = 71
+Dangerous builds = 0
+Total wall time: 0:00:13
--- a/examples/PACKAGES/pedone/log.9Apr24.pedone.melt.g++.4
+++ b/examples/PACKAGES/pedone/log.9Apr24.pedone.melt.g++.4
@ -0,0 +1,122 @@
+LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-373-g7ac84e18dd)
+  using 1 OpenMP thread(s) per MPI task
+# Ca-O melt with Pedone potential
+
+units metal
+atom_style charge
+
+lattice fcc 4.8105  # experimental lattice parameter for fcc-lattice Ca cations
+Lattice spacing in x,y,z = 4.8105 4.8105 4.8105
+
+region box block 0 4 0 4 0 4
+create_box 2 box
+Created orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  1 by 2 by 2 MPI processor grid
+create_atoms 1 box
+Created 256 atoms
+  using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  create_atoms CPU = 0.000 seconds
+
+lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell
+Lattice spacing in x,y,z = 4.8105 4.8105 4.8105
+create_atoms 2 box
+Created 256 atoms
+  using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  create_atoms CPU = 0.000 seconds
+
+mass 1 40.078
+mass 2 15.999
+
+set type 1 charge 1.2
+Setting atom values ...
+  256 settings made for charge
+set type 2 charge -1.2
+Setting atom values ...
+  256 settings made for charge
+
+timestep 0.002
+neigh_modify delay 5 every 1 check yes
+
+pair_style hybrid/overlay pedone 15.0 coul/long 15.0
+kspace_style pppm 1.0e-6
+
+pair_coeff * * coul/long
+pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0
+pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0
+
+velocity all create 6000.0 98347
+
+fix 1 all nvt temp 3000.0 3000.0 0.1
+
+# dump 1 all atom 500 Ca-O-melt.lammpstrj
+
+thermo 100
+run 1000
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:342)
+  G vector (1/distance) = 0.23676226
+  grid = 24 24 24
+  stencil order = 5
+  estimated absolute RMS force accuracy = 1.3089053e-05
+  estimated relative force accuracy = 9.089844e-07
+  using double precision FFTW3
+  3d grid and FFT values/proc = 11191 3456
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 5 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 17
+  ghost atom cutoff = 17
+  binsize = 8.5, bins = 3 3 3
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair pedone, perpetual, skip from (2)
+      attributes: half, newton on
+      pair build: skip
+      stencil: none
+      bin: none
+  (2) pair coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 5.315 | 5.315 | 5.315 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+         0   6000          -3771.5568      0             -3375.2452      34213.185    
+       100   3050.0106     -3571.4712      0             -3370.0121      118480.04    
+       200   3100.0073     -3571.2534      0             -3366.492       120618.37    
+       300   2959.7127     -3580.0883      0             -3384.5935      109184.72    
+       400   2922.7083     -3563.9803      0             -3370.9298      120165.71    
+       500   3145.0439     -3571.3828      0             -3363.6465      115057.51    
+       600   2741.7439     -3563.5077      0             -3382.4102      115504.31    
+       700   2906.3636     -3567.3604      0             -3375.3895      119518.5     
+       800   2995.3864     -3567.3838      0             -3369.5327      117975.22    
+       900   2965.24       -3565.7983      0             -3369.9385      123362.35    
+      1000   2916.6485     -3578.7471      0             -3386.0968      115624.78    
+Loop time of 4.50395 on 4 procs for 1000 steps with 512 atoms
+
+Performance: 38.366 ns/day, 0.626 hours/ns, 222.028 timesteps/s, 113.678 katom-step/s
+99.4% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 3.2703     | 3.2983     | 3.3259     |   1.3 | 73.23
+Kspace  | 0.79815    | 0.82633    | 0.85342    |   2.6 | 18.35
+Neigh   | 0.18328    | 0.18398    | 0.18472    |   0.1 |  4.08
+Comm    | 0.17423    | 0.17508    | 0.17592    |   0.2 |  3.89
+Output  | 0.00019336 | 0.0002167  | 0.00028554 |   0.0 |  0.00
+Modify  | 0.0089842  | 0.0091093  | 0.0092205  |   0.1 |  0.20
+Other   |            | 0.01096    |            |       |  0.24
+
+Nlocal:            128 ave         143 max         118 min
+Histogram: 2 0 0 0 0 1 0 0 0 1
+Nghost:        7622.75 ave        7651 max        7598 min
+Histogram: 1 0 0 1 1 0 0 0 0 1
+Neighs:        93581.8 ave      106456 max       84898 min
+Histogram: 1 1 0 0 1 0 0 0 0 1
+
+Total # of neighbors = 374327
+Ave neighs/atom = 731.10742
+Neighbor list builds = 71
+Dangerous builds = 0
+Total wall time: 0:00:04
--- a/examples/PACKAGES/pedone/log.9Apr24.pedone.relax.g++.1
+++ b/examples/PACKAGES/pedone/log.9Apr24.pedone.relax.g++.1
@ -0,0 +1,134 @@
+LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-373-g7ac84e18dd)
+  using 1 OpenMP thread(s) per MPI task
+# Ca-O crystal with Pedone potential
+
+units metal
+atom_style charge
+
+lattice fcc 4.8105  # experimental lattice parameter for fcc-lattice Ca cations
+Lattice spacing in x,y,z = 4.8105 4.8105 4.8105
+
+region box block 0 4 0 4 0 4
+create_box 2 box
+Created orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  1 by 1 by 1 MPI processor grid
+create_atoms 1 box
+Created 256 atoms
+  using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  create_atoms CPU = 0.000 seconds
+
+lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell
+Lattice spacing in x,y,z = 4.8105 4.8105 4.8105
+create_atoms 2 box
+Created 256 atoms
+  using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  create_atoms CPU = 0.000 seconds
+
+mass 1 40.078
+mass 2 15.999
+
+displace_atoms all random 0.01 0.01 0.01 9084544
+Displacing atoms ...
+set type 1 charge 1.2
+Setting atom values ...
+  256 settings made for charge
+set type 2 charge -1.2
+Setting atom values ...
+  256 settings made for charge
+
+timestep 0.002
+neigh_modify delay 5 every 1 check yes
+
+pair_style hybrid/overlay pedone 15.0 coul/long 15.0
+kspace_style pppm 1.0e-6
+
+pair_coeff * * coul/long
+pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0
+pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0
+
+variable len equal lx*0.25
+thermo_style custom step v_len lx pe press
+thermo 100
+fix 1 all box/relax iso 0.0
+minimize 0.0 0.0 1000 10000
+Switching to 'neigh_modify every 1 delay 0 check yes' setting during minimization
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:342)
+  G vector (1/distance) = 0.23676226
+  grid = 24 24 24
+  stencil order = 5
+  estimated absolute RMS force accuracy = 1.3089053e-05
+  estimated relative force accuracy = 9.089844e-07
+  using double precision FFTW3
+  3d grid and FFT values/proc = 29791 13824
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 17
+  ghost atom cutoff = 17
+  binsize = 8.5, bins = 3 3 3
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair pedone, perpetual, skip from (2)
+      attributes: half, newton on
+      pair build: skip
+      stencil: none
+      bin: none
+  (2) pair coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+WARNING: Energy due to 1 extra global DOFs will be included in minimizer energies
+ (src/min.cpp:219)
+Per MPI rank memory allocation (min/avg/max) = 10.33 | 10.33 | 10.33 Mbytes
+   Step         v_len            Lx           PotEng         Press     
+         0   4.8105         19.242        -3765.9116     -21299.914    
+       100   4.7797128      19.118851     -3767.814      -164.13101    
+       200   4.7787507      19.115003     -3769.1366     -373.58797    
+       300   4.7768265      19.107306     -3770.5634      48.944709    
+       400   4.7768265      19.107306     -3770.9879     -258.56116    
+       500   4.7758644      19.103458     -3771.3898      173.91894    
+       600   4.7758644      19.103458     -3771.7586     -91.813678    
+       700   4.7758644      19.103458     -3771.9842     -252.52883    
+       800   4.7749023      19.099609     -3772.3526      216.83318    
+       857   4.7747927      19.099171     -3772.8223      32.586251    
+Loop time of 18.0592 on 1 procs for 857 steps with 512 atoms
+
+99.8% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+Minimization stats:
+  Stopping criterion = linesearch alpha is zero
+  Energy initial, next-to-last, final = 
+     -3765.91161156884  -3772.82226663623  -3772.82226663623
+  Force two-norm initial, final = 284.3967 0.46963871
+  Force max component initial, final = 284.14458 0.42827677
+  Final line search alpha, max atom move = 2.8580337e-08 1.2240294e-08
+  Iterations, force evaluations = 857 894
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 13.907     | 13.907     | 13.907     |   0.0 | 77.01
+Kspace  | 1.3809     | 1.3809     | 1.3809     |   0.0 |  7.65
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0.045871   | 0.045871   | 0.045871   |   0.0 |  0.25
+Output  | 0.0002809  | 0.0002809  | 0.0002809  |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 2.726      |            |       | 15.09
+
+Nlocal:            512 ave         512 max         512 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:          11655 ave       11655 max       11655 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:         372155 ave      372155 max      372155 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 372155
+Ave neighs/atom = 726.86523
+Neighbor list builds = 0
+Dangerous builds = 0
+
+print "Expected lattice parameter: 4.7748,  computed: $(v_len:%6.4f)"
+Expected lattice parameter: 4.7748,  computed: 4.7748
+Total wall time: 0:00:18
--- a/examples/PACKAGES/pedone/log.9Apr24.pedone.relax.g++.4
+++ b/examples/PACKAGES/pedone/log.9Apr24.pedone.relax.g++.4
@ -0,0 +1,134 @@
+LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-373-g7ac84e18dd)
+  using 1 OpenMP thread(s) per MPI task
+# Ca-O crystal with Pedone potential
+
+units metal
+atom_style charge
+
+lattice fcc 4.8105  # experimental lattice parameter for fcc-lattice Ca cations
+Lattice spacing in x,y,z = 4.8105 4.8105 4.8105
+
+region box block 0 4 0 4 0 4
+create_box 2 box
+Created orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  1 by 2 by 2 MPI processor grid
+create_atoms 1 box
+Created 256 atoms
+  using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  create_atoms CPU = 0.000 seconds
+
+lattice fcc 4.8105 origin 0.5 0.5 0.5 # O anion lattice shifted by half a unit cell
+Lattice spacing in x,y,z = 4.8105 4.8105 4.8105
+create_atoms 2 box
+Created 256 atoms
+  using lattice units in orthogonal box = (0 0 0) to (19.242 19.242 19.242)
+  create_atoms CPU = 0.000 seconds
+
+mass 1 40.078
+mass 2 15.999
+
+displace_atoms all random 0.01 0.01 0.01 9084544
+Displacing atoms ...
+set type 1 charge 1.2
+Setting atom values ...
+  256 settings made for charge
+set type 2 charge -1.2
+Setting atom values ...
+  256 settings made for charge
+
+timestep 0.002
+neigh_modify delay 5 every 1 check yes
+
+pair_style hybrid/overlay pedone 15.0 coul/long 15.0
+kspace_style pppm 1.0e-6
+
+pair_coeff * * coul/long
+pair_coeff 1 2 pedone 0.030211 2.241334 2.923245 5.0
+pair_coeff 2 2 pedone 0.042395 1.379316 3.618701 22.0
+
+variable len equal lx*0.25
+thermo_style custom step v_len lx pe press
+thermo 100
+fix 1 all box/relax iso 0.0
+minimize 0.0 0.0 1000 10000
+Switching to 'neigh_modify every 1 delay 0 check yes' setting during minimization
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:342)
+  G vector (1/distance) = 0.23676226
+  grid = 24 24 24
+  stencil order = 5
+  estimated absolute RMS force accuracy = 1.3089053e-05
+  estimated relative force accuracy = 9.089844e-07
+  using double precision FFTW3
+  3d grid and FFT values/proc = 11191 3456
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 17
+  ghost atom cutoff = 17
+  binsize = 8.5, bins = 3 3 3
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair pedone, perpetual, skip from (2)
+      attributes: half, newton on
+      pair build: skip
+      stencil: none
+      bin: none
+  (2) pair coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+WARNING: Energy due to 1 extra global DOFs will be included in minimizer energies
+ (src/min.cpp:219)
+Per MPI rank memory allocation (min/avg/max) = 6.44 | 6.44 | 6.44 Mbytes
+   Step         v_len            Lx           PotEng         Press     
+         0   4.8105         19.242        -3765.9116     -21299.914    
+       100   4.7797128      19.118851     -3767.814      -164.13101    
+       200   4.7787507      19.115003     -3769.1367     -373.59489    
+       300   4.7768265      19.107306     -3770.5868      32.046893    
+       400   4.7768265      19.107306     -3771.0322     -290.69703    
+       500   4.7758644      19.103458     -3771.4223      150.34606    
+       600   4.7758644      19.103458     -3771.7941     -117.26938    
+       700   4.7758644      19.103458     -3772.0193     -277.34372    
+       800   4.7749023      19.099609     -3772.42        171.95177    
+       860   4.7748339      19.099336     -3772.8237      1.0976356    
+Loop time of 5.65601 on 4 procs for 860 steps with 512 atoms
+
+99.5% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+Minimization stats:
+  Stopping criterion = linesearch alpha is zero
+  Energy initial, next-to-last, final = 
+     -3765.91161156888  -3772.82365446552  -3772.82365446552
+  Force two-norm initial, final = 284.3967 0.067746634
+  Force max component initial, final = 284.14458 0.014426328
+  Final line search alpha, max atom move = 1.9073486e-06 2.7516038e-08
+  Iterations, force evaluations = 860 922
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 3.7408     | 3.8442     | 4.0543     |   6.5 | 67.97
+Kspace  | 0.60187    | 0.81211    | 0.91543    |  14.1 | 14.36
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0.14969    | 0.15017    | 0.15071    |   0.1 |  2.66
+Output  | 0.00019203 | 0.00020711 | 0.0002511  |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 0.8494     |            |       | 15.02
+
+Nlocal:            128 ave         135 max         123 min
+Histogram: 1 0 1 0 1 0 0 0 0 1
+Nghost:           8175 ave        8180 max        8168 min
+Histogram: 1 0 0 0 0 1 0 1 0 1
+Neighs:        93038.8 ave       98164 max       89373 min
+Histogram: 1 0 1 0 1 0 0 0 0 1
+
+Total # of neighbors = 372155
+Ave neighs/atom = 726.86523
+Neighbor list builds = 0
+Dangerous builds = 0
+
+print "Expected lattice parameter: 4.7748,  computed: $(v_len:%6.4f)"
+Expected lattice parameter: 4.7748,  computed: 4.7748
+Total wall time: 0:00:05
--- a/examples/micelle/log.29Mar2019.micelle-rigid.g++.1
+++ b/examples/micelle/log.29Mar2019.micelle-rigid.g++.1
@ -1,260 +0,0 @@
-LAMMPS (29 Mar 2019)
-  using 1 OpenMP thread(s) per MPI task
-# 2d micelle simulation
-
-dimension	2
-
-neighbor	0.3 bin
-neigh_modify	delay 5
-
-atom_style	bond
-
-# Soft potential push-off
-
-read_data	data.micelle
-  orthogonal box = (0 0 -0.1) to (35.8569 35.8569 0.1)
-  1 by 1 by 1 MPI processor grid
-  reading atoms ...
-  1200 atoms
-  scanning bonds ...
-  1 = max bonds/atom
-  reading bonds ...
-  300 bonds
-  2 = max # of 1-2 neighbors
-  1 = max # of 1-3 neighbors
-  1 = max # of 1-4 neighbors
-  2 = max # of special neighbors
-  special bonds CPU = 0.000473022 secs
-  read_data CPU = 0.0024147 secs
-special_bonds	fene
-  2 = max # of 1-2 neighbors
-  2 = max # of special neighbors
-  special bonds CPU = 0.00022316 secs
-
-pair_style	soft 1.12246
-pair_coeff	* * 0.0 1.12246
-
-bond_style 	harmonic
-bond_coeff	1 50.0 0.75
-
-velocity	all create 0.45 2349852
-
-variable	prefactor equal ramp(1.0,20.0)
-
-fix		1 all nve
-fix		2 all temp/rescale 100 0.45 0.45 0.02 1.0
-fix		3 all adapt 1 pair soft a * * v_prefactor
-fix		4 all enforce2d
-
-thermo		50
-run		500
-Neighbor list info ...
-  update every 1 steps, delay 5 steps, check yes
-  max neighbors/atom: 2000, page size: 100000
-  master list distance cutoff = 1.42246
-  ghost atom cutoff = 1.42246
-  binsize = 0.71123, bins = 51 51 1
-  1 neighbor lists, perpetual/occasional/extra = 1 0 0
-  (1) pair soft, perpetual
-      attributes: half, newton on
-      pair build: half/bin/newton
-      stencil: half/bin/2d/newton
-      bin: standard
-Per MPI rank memory allocation (min/avg/max) = 3.799 | 3.799 | 3.799 Mbytes
-Step Temp E_pair E_mol TotEng Press 
-       0         0.45   0.40003481 2.2200223e-06   0.84966203   0.78952518 
-      50   0.54981866   0.93548899  0.068440043    1.5532895    1.9232786 
-     100         0.45   0.99659327  0.079228519    1.5254468    3.2135679 
-     150   0.86965411   0.90456016   0.07493355    1.8484231    4.3821925 
-     200         0.45      1.01454   0.10663502       1.5708    4.7598476 
-     250   0.79636561   0.82567712   0.12105337    1.7424325    5.4983899 
-     300         0.45   0.86475538   0.11819875    1.4325791    5.8554758 
-     350   0.72135464   0.70693069   0.10912636    1.5368106    6.0388247 
-     400         0.45   0.75067331   0.14165013    1.3419484    6.3840708 
-     450   0.64839221   0.62402486   0.14173679    1.4136135    6.4791009 
-     500         0.45   0.66669513   0.13695201    1.2532721     6.807146 
-Loop time of 0.103162 on 1 procs for 500 steps with 1200 atoms
-
-Performance: 2093802.885 tau/day, 4846.766 timesteps/s
-99.6% CPU use with 1 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
---------------------------------------------------------------
-Pair    | 0.068308   | 0.068308   | 0.068308   |   0.0 | 66.21
-Bond    | 0.004235   | 0.004235   | 0.004235   |   0.0 |  4.11
-Neigh   | 0.014069   | 0.014069   | 0.014069   |   0.0 | 13.64
-Comm    | 0.0019219  | 0.0019219  | 0.0019219  |   0.0 |  1.86
-Output  | 0.00017262 | 0.00017262 | 0.00017262 |   0.0 |  0.17
-Modify  | 0.011728   | 0.011728   | 0.011728   |   0.0 | 11.37
-Other   |            | 0.002726   |            |       |  2.64
-
-Nlocal:    1200 ave 1200 max 1200 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost:    197 ave 197 max 197 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs:    3094 ave 3094 max 3094 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-
-Total # of neighbors = 3094
-Ave neighs/atom = 2.57833
-Ave special neighs/atom = 0.5
-Neighbor list builds = 52
-Dangerous builds = 0
-
-unfix		3
-
-# Main run
-
-pair_style	lj/cut 2.5
-
-# solvent/head - full-size and long-range
-
-pair_coeff	1 1 1.0 1.0 2.5
-pair_coeff	2 2 1.0 1.0 2.5
-pair_coeff	1 2 1.0 1.0 2.5
-
-# tail/tail - size-averaged and long-range
-
-pair_coeff	3 3 1.0 0.75 2.5
-pair_coeff	4 4 1.0 0.50 2.5
-pair_coeff	3 4 1.0 0.67 2.5
-
-# solvent/tail - full-size and repulsive
-
-pair_coeff	1 3 1.0 1.0 1.12246
-pair_coeff	1 4 1.0 1.0 1.12246
-
-# head/tail - size-averaged and repulsive
-
-pair_coeff	2 3 1.0 0.88 1.12246
-pair_coeff	2 4 1.0 0.75 1.12246
-
-thermo		50
-
-#dump		1 all atom 2000 dump.micelle
-
-#dump		2 all image 2000 image.*.jpg type type zoom 1.6
-#dump_modify	2 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75
-
-#dump		3 all movie 2000 movie.mpg type type zoom 1.6
-#dump_modify	3 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75
-
-reset_timestep	0
-group solvent   molecule 0
-750 atoms in group solvent
-group solute    subtract all solvent
-450 atoms in group solute
-unfix 1
-unfix 2
-unfix 4
-fix		1 solvent nve
-fix		2 solvent temp/rescale 100 0.45 0.45 0.02 1.0
-fix		5 solute rigid molecule langevin 0.45 0.45 0.5 112211
-150 rigid bodies with 450 atoms
-fix		4 all enforce2d
-run		500
-Neighbor list info ...
-  update every 1 steps, delay 5 steps, check yes
-  max neighbors/atom: 2000, page size: 100000
-  master list distance cutoff = 2.8
-  ghost atom cutoff = 2.8
-  binsize = 1.4, bins = 26 26 1
-  1 neighbor lists, perpetual/occasional/extra = 1 0 0
-  (1) pair lj/cut, perpetual
-      attributes: half, newton on
-      pair build: half/bin/newton
-      stencil: half/bin/2d/newton
-      bin: standard
-Per MPI rank memory allocation (min/avg/max) = 5.274 | 5.274 | 5.274 Mbytes
-Step Temp E_pair E_mol TotEng Press 
-       0   0.45318168   -1.3753652   0.13695201   -0.8705807     1.975423 
-      50   0.77871641   -1.6955252   0.13695201  -0.92651507   0.64222539 
-     100    0.5336062   -1.7124572   0.13695201   -1.1423948  -0.11959696 
-     150   0.58789067   -1.7926109   0.13695201   -1.1784877    1.2592743 
-     200   0.47864796   -1.8040298   0.13695201   -1.2785752    3.6739793 
-     250   0.51124651   -1.8614797   0.13695201    -1.309566    2.5817722 
-     300   0.45695639   -1.8708384   0.13695201   -1.3629901    3.0833794 
-     350     0.477504   -1.8924359   0.13695201   -1.3679098   -5.1605926 
-     400   0.45328205     -1.87754   0.13695201    -1.372674   -4.0355858 
-     450   0.47465031   -1.9071924   0.13695201   -1.3849826    3.1949617 
-     500   0.45533691   -1.9072316   0.13695201   -1.4006978   0.48079061 
-Loop time of 0.178806 on 1 procs for 500 steps with 1200 atoms
-
-Performance: 1208012.705 tau/day, 2796.326 timesteps/s
-99.6% CPU use with 1 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
---------------------------------------------------------------
-Pair    | 0.086131   | 0.086131   | 0.086131   |   0.0 | 48.17
-Bond    | 0.0042472  | 0.0042472  | 0.0042472  |   0.0 |  2.38
-Neigh   | 0.021317   | 0.021317   | 0.021317   |   0.0 | 11.92
-Comm    | 0.0025985  | 0.0025985  | 0.0025985  |   0.0 |  1.45
-Output  | 0.000175   | 0.000175   | 0.000175   |   0.0 |  0.10
-Modify  | 0.061408   | 0.061408   | 0.061408   |   0.0 | 34.34
-Other   |            | 0.00293    |            |       |  1.64
-
-Nlocal:    1200 ave 1200 max 1200 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost:    416 ave 416 max 416 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs:    8769 ave 8769 max 8769 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-
-Total # of neighbors = 8769
-Ave neighs/atom = 7.3075
-Ave special neighs/atom = 0.5
-Neighbor list builds = 47
-Dangerous builds = 2
-unfix 2
-unfix 4
-unfix 5
-fix		5 solute rigid/small molecule
-  create bodies CPU = 0.00015378 secs
-150 rigid bodies with 450 atoms
-  1.30435 = max distance from body owner to body atom
-fix		4 all enforce2d
-run		500
-Per MPI rank memory allocation (min/avg/max) = 8.64 | 8.64 | 8.64 Mbytes
-Step Temp E_pair E_mol TotEng Press 
-     500   0.45533691   -1.9072316   0.13695201   -1.4006978    2.4545793 
-     550   0.45627282    -1.912409   0.13695201   -1.4051155    2.1845065 
-     600   0.44734553   -1.8890695   0.13695201    -1.389022    2.3458965 
-     650   0.46444648   -1.9042462   0.13695201   -1.3903185    2.1609319 
-     700   0.47113236   -1.8977576   0.13695201   -1.3784032    2.2420351 
-     750   0.48554548   -1.9253545   0.13695201   -1.3943015     2.143907 
-     800   0.46350091   -1.8865749   0.13695201   -1.3734146     2.294431 
-     850    0.4766104   -1.9094039   0.13695201   -1.3856031    2.2077157 
-     900   0.48988467   -1.9051538   0.13695201   -1.3705787    2.0107056 
-     950   0.48351943   -1.9162485   0.13695201   -1.3868399    2.1891332 
-    1000   0.49033701   -1.9115165   0.13695201   -1.3765742    2.1508141 
-Loop time of 0.166502 on 1 procs for 500 steps with 1200 atoms
-
-Performance: 1297278.008 tau/day, 3002.958 timesteps/s
-99.6% CPU use with 1 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
---------------------------------------------------------------
-Pair    | 0.085767   | 0.085767   | 0.085767   |   0.0 | 51.51
-Bond    | 0.0042562  | 0.0042562  | 0.0042562  |   0.0 |  2.56
-Neigh   | 0.018039   | 0.018039   | 0.018039   |   0.0 | 10.83
-Comm    | 0.0024002  | 0.0024002  | 0.0024002  |   0.0 |  1.44
-Output  | 0.00018239 | 0.00018239 | 0.00018239 |   0.0 |  0.11
-Modify  | 0.052717   | 0.052717   | 0.052717   |   0.0 | 31.66
-Other   |            | 0.003141   |            |       |  1.89
-
-Nlocal:    1200 ave 1200 max 1200 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost:    415 ave 415 max 415 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs:    8743 ave 8743 max 8743 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-
-Total # of neighbors = 8743
-Ave neighs/atom = 7.28583
-Ave special neighs/atom = 0.5
-Neighbor list builds = 40
-Dangerous builds = 0
-Total wall time: 0:00:00
--- a/examples/micelle/log.29Mar2019.micelle-rigid.g++.4
+++ b/examples/micelle/log.29Mar2019.micelle-rigid.g++.4
@ -1,260 +0,0 @@
-LAMMPS (29 Mar 2019)
-  using 1 OpenMP thread(s) per MPI task
-# 2d micelle simulation
-
-dimension	2
-
-neighbor	0.3 bin
-neigh_modify	delay 5
-
-atom_style	bond
-
-# Soft potential push-off
-
-read_data	data.micelle
-  orthogonal box = (0 0 -0.1) to (35.8569 35.8569 0.1)
-  2 by 2 by 1 MPI processor grid
-  reading atoms ...
-  1200 atoms
-  scanning bonds ...
-  1 = max bonds/atom
-  reading bonds ...
-  300 bonds
-  2 = max # of 1-2 neighbors
-  1 = max # of 1-3 neighbors
-  1 = max # of 1-4 neighbors
-  2 = max # of special neighbors
-  special bonds CPU = 0.000422001 secs
-  read_data CPU = 0.00473404 secs
-special_bonds	fene
-  2 = max # of 1-2 neighbors
-  2 = max # of special neighbors
-  special bonds CPU = 0.000183344 secs
-
-pair_style	soft 1.12246
-pair_coeff	* * 0.0 1.12246
-
-bond_style 	harmonic
-bond_coeff	1 50.0 0.75
-
-velocity	all create 0.45 2349852
-
-variable	prefactor equal ramp(1.0,20.0)
-
-fix		1 all nve
-fix		2 all temp/rescale 100 0.45 0.45 0.02 1.0
-fix		3 all adapt 1 pair soft a * * v_prefactor
-fix		4 all enforce2d
-
-thermo		50
-run		500
-Neighbor list info ...
-  update every 1 steps, delay 5 steps, check yes
-  max neighbors/atom: 2000, page size: 100000
-  master list distance cutoff = 1.42246
-  ghost atom cutoff = 1.42246
-  binsize = 0.71123, bins = 51 51 1
-  1 neighbor lists, perpetual/occasional/extra = 1 0 0
-  (1) pair soft, perpetual
-      attributes: half, newton on
-      pair build: half/bin/newton
-      stencil: half/bin/2d/newton
-      bin: standard
-Per MPI rank memory allocation (min/avg/max) = 3.758 | 3.85 | 4.126 Mbytes
-Step Temp E_pair E_mol TotEng Press 
-       0         0.45   0.40003481 2.2200223e-06   0.84966203   0.78952518 
-      50   0.54981866   0.93548899  0.068440043    1.5532895    1.9232786 
-     100         0.45   0.99659327  0.079228519    1.5254468    3.2135679 
-     150   0.86965411   0.90456016   0.07493355    1.8484231    4.3821925 
-     200         0.45      1.01454   0.10663502       1.5708    4.7598476 
-     250   0.79636561   0.82567712   0.12105337    1.7424325    5.4983899 
-     300         0.45   0.86475538   0.11819875    1.4325791    5.8554758 
-     350   0.72135464   0.70693069   0.10912636    1.5368106    6.0388247 
-     400         0.45   0.75067331   0.14165013    1.3419484    6.3840708 
-     450   0.64839221   0.62402486   0.14173679    1.4136135    6.4791009 
-     500         0.45   0.66669513   0.13695201    1.2532721     6.807146 
-Loop time of 0.0426326 on 4 procs for 500 steps with 1200 atoms
-
-Performance: 5066547.720 tau/day, 11728.120 timesteps/s
-98.7% CPU use with 4 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
---------------------------------------------------------------
-Pair    | 0.016784   | 0.019254   | 0.022154   |   1.5 | 45.16
-Bond    | 0.0010612  | 0.0012558  | 0.0014153  |   0.4 |  2.95
-Neigh   | 0.0046048  | 0.0046697  | 0.0047245  |   0.1 | 10.95
-Comm    | 0.0064592  | 0.0097114  | 0.012527   |   2.4 | 22.78
-Output  | 0.00022507 | 0.00026393 | 0.00033951 |   0.0 |  0.62
-Modify  | 0.0041659  | 0.0048084  | 0.0053945  |   0.8 | 11.28
-Other   |            | 0.002669   |            |       |  6.26
-
-Nlocal:    300 ave 304 max 292 min
-Histogram: 1 0 0 0 0 0 0 0 2 1
-Nghost:    103.5 ave 108 max 98 min
-Histogram: 1 0 0 1 0 0 0 0 0 2
-Neighs:    773.5 ave 792 max 735 min
-Histogram: 1 0 0 0 0 0 0 0 2 1
-
-Total # of neighbors = 3094
-Ave neighs/atom = 2.57833
-Ave special neighs/atom = 0.5
-Neighbor list builds = 52
-Dangerous builds = 0
-
-unfix		3
-
-# Main run
-
-pair_style	lj/cut 2.5
-
-# solvent/head - full-size and long-range
-
-pair_coeff	1 1 1.0 1.0 2.5
-pair_coeff	2 2 1.0 1.0 2.5
-pair_coeff	1 2 1.0 1.0 2.5
-
-# tail/tail - size-averaged and long-range
-
-pair_coeff	3 3 1.0 0.75 2.5
-pair_coeff	4 4 1.0 0.50 2.5
-pair_coeff	3 4 1.0 0.67 2.5
-
-# solvent/tail - full-size and repulsive
-
-pair_coeff	1 3 1.0 1.0 1.12246
-pair_coeff	1 4 1.0 1.0 1.12246
-
-# head/tail - size-averaged and repulsive
-
-pair_coeff	2 3 1.0 0.88 1.12246
-pair_coeff	2 4 1.0 0.75 1.12246
-
-thermo		50
-
-#dump		1 all atom 2000 dump.micelle
-
-#dump		2 all image 2000 image.*.jpg type type zoom 1.6
-#dump_modify	2 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75
-
-#dump		3 all movie 2000 movie.mpg type type zoom 1.6
-#dump_modify	3 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75
-
-reset_timestep	0
-group solvent   molecule 0
-750 atoms in group solvent
-group solute    subtract all solvent
-450 atoms in group solute
-unfix 1
-unfix 2
-unfix 4
-fix		1 solvent nve
-fix		2 solvent temp/rescale 100 0.45 0.45 0.02 1.0
-fix		5 solute rigid molecule langevin 0.45 0.45 0.5 112211
-150 rigid bodies with 450 atoms
-fix		4 all enforce2d
-run		500
-Neighbor list info ...
-  update every 1 steps, delay 5 steps, check yes
-  max neighbors/atom: 2000, page size: 100000
-  master list distance cutoff = 2.8
-  ghost atom cutoff = 2.8
-  binsize = 1.4, bins = 26 26 1
-  1 neighbor lists, perpetual/occasional/extra = 1 0 0
-  (1) pair lj/cut, perpetual
-      attributes: half, newton on
-      pair build: half/bin/newton
-      stencil: half/bin/2d/newton
-      bin: standard
-Per MPI rank memory allocation (min/avg/max) = 5.251 | 5.282 | 5.374 Mbytes
-Step Temp E_pair E_mol TotEng Press 
-       0   0.45318168   -1.3753652   0.13695201   -0.8705807     1.975423 
-      50   0.77871641   -1.6955252   0.13695201  -0.92651507   0.64222539 
-     100    0.5336062   -1.7124572   0.13695201   -1.1423948  -0.11959696 
-     150   0.58789067   -1.7926109   0.13695201   -1.1784877    1.2592743 
-     200   0.47864796   -1.8040298   0.13695201   -1.2785752    3.6739793 
-     250   0.51124651   -1.8614797   0.13695201    -1.309566    2.5817722 
-     300   0.45695639   -1.8708384   0.13695201   -1.3629901    3.0833794 
-     350     0.477504   -1.8924359   0.13695201   -1.3679098   -5.1605926 
-     400   0.45328205     -1.87754   0.13695201    -1.372674   -4.0355858 
-     450   0.47465031   -1.9071924   0.13695201   -1.3849826    3.1949617 
-     500   0.45533691   -1.9072316   0.13695201   -1.4006978   0.48079061 
-Loop time of 0.0887392 on 4 procs for 500 steps with 1200 atoms
-
-Performance: 2434100.210 tau/day, 5634.491 timesteps/s
-98.9% CPU use with 4 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
---------------------------------------------------------------
-Pair    | 0.022611   | 0.022839   | 0.023082   |   0.1 | 25.74
-Bond    | 0.0010793  | 0.0011569  | 0.0012515  |   0.2 |  1.30
-Neigh   | 0.0064609  | 0.0064996  | 0.0065265  |   0.0 |  7.32
-Comm    | 0.0071712  | 0.0073687  | 0.0077734  |   0.3 |  8.30
-Output  | 0.00023389 | 0.00025356 | 0.00030327 |   0.0 |  0.29
-Modify  | 0.047258   | 0.047683   | 0.048503   |   0.2 | 53.73
-Other   |            | 0.002938   |            |       |  3.31
-
-Nlocal:    300 ave 309 max 291 min
-Histogram: 1 0 0 1 0 0 1 0 0 1
-Nghost:    218.75 ave 223 max 216 min
-Histogram: 1 0 2 0 0 0 0 0 0 1
-Neighs:    2192.25 ave 2251 max 2113 min
-Histogram: 1 0 0 1 0 0 0 0 0 2
-
-Total # of neighbors = 8769
-Ave neighs/atom = 7.3075
-Ave special neighs/atom = 0.5
-Neighbor list builds = 47
-Dangerous builds = 2
-unfix 2
-unfix 4
-unfix 5
-fix		5 solute rigid/small molecule
-  create bodies CPU = 7.70092e-05 secs
-150 rigid bodies with 450 atoms
-  1.30435 = max distance from body owner to body atom
-fix		4 all enforce2d
-run		500
-Per MPI rank memory allocation (min/avg/max) = 8.565 | 8.597 | 8.69 Mbytes
-Step Temp E_pair E_mol TotEng Press 
-     500   0.45533691   -1.9072316   0.13695201   -1.4006978    2.4545793 
-     550   0.45627282    -1.912409   0.13695201   -1.4051155    2.1845065 
-     600   0.44734553   -1.8890695   0.13695201    -1.389022    2.3458965 
-     650   0.46444648   -1.9042462   0.13695201   -1.3903185    2.1609319 
-     700   0.47113236   -1.8977576   0.13695201   -1.3784032    2.2420351 
-     750   0.48554548   -1.9253545   0.13695201   -1.3943015     2.143907 
-     800   0.46350091   -1.8865749   0.13695201   -1.3734146     2.294431 
-     850    0.4766104   -1.9094039   0.13695201   -1.3856031    2.2077157 
-     900   0.48988467   -1.9051538   0.13695201   -1.3705787    2.0107056 
-     950   0.48351942   -1.9162485   0.13695201   -1.3868399    2.1891332 
-    1000     0.490337   -1.9115164   0.13695201   -1.3765742    2.1508141 
-Loop time of 0.0588261 on 4 procs for 500 steps with 1200 atoms
-
-Performance: 3671840.233 tau/day, 8499.630 timesteps/s
-98.3% CPU use with 4 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
---------------------------------------------------------------
-Pair    | 0.022407   | 0.022631   | 0.0229     |   0.1 | 38.47
-Bond    | 0.0010669  | 0.0011355  | 0.0012124  |   0.2 |  1.93
-Neigh   | 0.0052333  | 0.00528    | 0.0053182  |   0.0 |  8.98
-Comm    | 0.0063677  | 0.0066406  | 0.0068488  |   0.2 | 11.29
-Output  | 0.00023055 | 0.00024778 | 0.00028086 |   0.0 |  0.42
-Modify  | 0.020577   | 0.020651   | 0.020834   |   0.1 | 35.11
-Other   |            | 0.00224    |            |       |  3.81
-
-Nlocal:    300 ave 303 max 295 min
-Histogram: 1 0 0 0 0 0 1 0 1 1
-Nghost:    219 ave 224 max 215 min
-Histogram: 1 0 0 1 1 0 0 0 0 1
-Neighs:    2185.75 ave 2244 max 2143 min
-Histogram: 1 1 0 0 0 1 0 0 0 1
-
-Total # of neighbors = 8743
-Ave neighs/atom = 7.28583
-Ave special neighs/atom = 0.5
-Neighbor list builds = 40
-Dangerous builds = 0
-Total wall time: 0:00:00
--- a/examples/micelle/log.4Apr2024.micelle-rigid.g++.1
+++ b/examples/micelle/log.4Apr2024.micelle-rigid.g++.1
@ -0,0 +1,271 @@
+LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-345-g506bf886ee-modified)
+# 2d micelle simulation
+
+dimension	2
+
+neighbor	0.3 bin
+neigh_modify	delay 5
+
+atom_style	bond
+
+# Soft potential push-off
+
+read_data	data.micelle
+Reading data file ...
+  orthogonal box = (0 0 -0.1) to (35.85686 35.85686 0.1)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  1200 atoms
+  scanning bonds ...
+  1 = max bonds/atom
+  reading bonds ...
+  300 bonds
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     2 = max # of 1-2 neighbors
+     1 = max # of 1-3 neighbors
+     1 = max # of 1-4 neighbors
+     2 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.005 seconds
+special_bonds	fene
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        1        1       
+  special bond factors coul:  0        1        1       
+     2 = max # of 1-2 neighbors
+     2 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+
+pair_style	soft 1.12246
+pair_coeff	* * 0.0 1.12246
+
+bond_style 	harmonic
+bond_coeff	1 50.0 0.75
+
+velocity	all create 0.45 2349852
+
+variable	prefactor equal ramp(1.0,20.0)
+
+fix		1 all nve
+fix		2 all temp/rescale 100 0.45 0.45 0.02 1.0
+fix		3 all adapt 1 pair soft a * * v_prefactor
+fix		4 all enforce2d
+
+thermo		50
+run		500
+Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule
+WARNING: Communication cutoff 1.42246 is shorter than a bond length based estimate of 1.425. This may lead to errors. (../comm.cpp:730)
+Neighbor list info ...
+  update: every = 1 steps, delay = 5 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 1.42246
+  ghost atom cutoff = 1.42246
+  binsize = 0.71123, bins = 51 51 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair soft, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/2d
+      bin: standard
+WARNING: Communication cutoff 1.42246 is shorter than a bond length based estimate of 1.425. This may lead to errors. (../comm.cpp:730)
+Per MPI rank memory allocation (min/avg/max) = 4.148 | 4.148 | 4.148 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+         0   0.45           0.40003481     2.2200223e-06  0.84966203     0.78952518   
+        50   0.54981866     0.93548899     0.068440043    1.5532895      1.9232786    
+       100   0.45           0.99659327     0.079228519    1.5254468      3.2135679    
+       150   0.86965411     0.90456016     0.07493355     1.8484231      4.3821925    
+       200   0.45           1.01454        0.10663502     1.5708         4.7598476    
+       250   0.79636561     0.82567712     0.12105337     1.7424325      5.4983899    
+       300   0.45           0.86475538     0.11819875     1.4325791      5.8554758    
+       350   0.72135464     0.70693069     0.10912636     1.5368106      6.0388247    
+       400   0.45           0.75067331     0.14165013     1.3419484      6.3840708    
+       450   0.64839221     0.62402486     0.14173679     1.4136135      6.4791009    
+       500   0.45           0.66669513     0.13695201     1.2532721      6.807146     
+Loop time of 0.0365221 on 1 procs for 500 steps with 1200 atoms
+
+Performance: 5914221.123 tau/day, 13690.327 timesteps/s, 16.428 Matom-step/s
+89.2% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.022939   | 0.022939   | 0.022939   |   0.0 | 62.81
+Bond    | 0.00073851 | 0.00073851 | 0.00073851 |   0.0 |  2.02
+Neigh   | 0.0078339  | 0.0078339  | 0.0078339  |   0.0 | 21.45
+Comm    | 0.00072134 | 0.00072134 | 0.00072134 |   0.0 |  1.98
+Output  | 7.1419e-05 | 7.1419e-05 | 7.1419e-05 |   0.0 |  0.20
+Modify  | 0.0034868  | 0.0034868  | 0.0034868  |   0.0 |  9.55
+Other   |            | 0.0007314  |            |       |  2.00
+
+Nlocal:           1200 ave        1200 max        1200 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:            197 ave         197 max         197 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:           3094 ave        3094 max        3094 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 3094
+Ave neighs/atom = 2.5783333
+Ave special neighs/atom = 0.5
+Neighbor list builds = 52
+Dangerous builds = 0
+
+unfix		3
+
+# Main run
+
+pair_style	lj/cut 2.5
+
+# solvent/head - full-size and long-range
+
+pair_coeff	1 1 1.0 1.0 2.5
+pair_coeff	2 2 1.0 1.0 2.5
+pair_coeff	1 2 1.0 1.0 2.5
+
+# tail/tail - size-averaged and long-range
+
+pair_coeff	3 3 1.0 0.75 2.5
+pair_coeff	4 4 1.0 0.50 2.5
+pair_coeff	3 4 1.0 0.67 2.5
+
+# solvent/tail - full-size and repulsive
+
+pair_coeff	1 3 1.0 1.0 1.12246
+pair_coeff	1 4 1.0 1.0 1.12246
+
+# head/tail - size-averaged and repulsive
+
+pair_coeff	2 3 1.0 0.88 1.12246
+pair_coeff	2 4 1.0 0.75 1.12246
+
+thermo		50
+
+#dump		1 all atom 2000 dump.micelle
+
+#dump		2 all image 2000 image.*.jpg type type zoom 1.6
+#dump_modify	2 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75
+
+#dump		3 all movie 2000 movie.mpg type type zoom 1.6
+#dump_modify	3 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75
+
+reset_timestep	0
+group solvent   molecule 0
+750 atoms in group solvent
+group solute    subtract all solvent
+450 atoms in group solute
+unfix 1
+unfix 2
+unfix 4
+fix		1 solvent nve
+fix		2 solvent temp/rescale 100 0.45 0.45 0.02 1.0
+fix		5 solute rigid molecule langevin 0.45 0.45 0.5 112211
+  150 rigid bodies with 450 atoms
+fix		4 all enforce2d
+run		500
+Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 5 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 2.8
+  ghost atom cutoff = 2.8
+  binsize = 1.4, bins = 26 26 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/2d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 5.391 | 5.391 | 5.391 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+         0   0.45318168    -1.3753652      0.13695201    -0.8705807      1.975423     
+        50   0.77344732    -1.6944083      0.13695201    -0.92967487     0.58657109   
+       100   0.53530681    -1.7006195      0.13695201    -1.1291768      0.11219772   
+       150   0.60820175    -1.8071581      0.13695201    -1.176549       1.5161796    
+       200   0.49410558    -1.7945459      0.13695201    -1.2565449      4.0469262    
+       250   0.52460847    -1.8528672      0.13695201    -1.290108       2.9929445    
+       300   0.46596803    -1.8680499      0.13695201    -1.3528872      2.7958851    
+       350   0.48831812    -1.8723486      0.13695201    -1.3390451     -4.5106818    
+       400   0.46798432    -1.9008529      0.13695201    -1.3840536     -4.3096566    
+       450   0.46000658    -1.9081144      0.13695201    -1.3977904      3.3360611    
+       500   0.45822409    -1.9077531      0.13695201    -1.3988759      0.45428738   
+Loop time of 0.0650638 on 1 procs for 500 steps with 1200 atoms
+
+Performance: 3319817.322 tau/day, 7684.762 timesteps/s, 9.222 Matom-step/s
+100.0% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.027565   | 0.027565   | 0.027565   |   0.0 | 42.37
+Bond    | 0.0007043  | 0.0007043  | 0.0007043  |   0.0 |  1.08
+Neigh   | 0.012724   | 0.012724   | 0.012724   |   0.0 | 19.56
+Comm    | 0.00091442 | 0.00091442 | 0.00091442 |   0.0 |  1.41
+Output  | 6.004e-05  | 6.004e-05  | 6.004e-05  |   0.0 |  0.09
+Modify  | 0.022329   | 0.022329   | 0.022329   |   0.0 | 34.32
+Other   |            | 0.0007666  |            |       |  1.18
+
+Nlocal:           1200 ave        1200 max        1200 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:            411 ave         411 max         411 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:           8759 ave        8759 max        8759 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 8759
+Ave neighs/atom = 7.2991667
+Ave special neighs/atom = 0.5
+Neighbor list builds = 46
+Dangerous builds = 2
+unfix 2
+unfix 4
+unfix 5
+fix		5 solute rigid/small molecule
+  create bodies CPU = 0.000 seconds
+  150 rigid bodies with 450 atoms
+  1.3043524 = max distance from body owner to body atom
+fix		4 all enforce2d
+run		500
+Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule
+Per MPI rank memory allocation (min/avg/max) = 9.306 | 9.306 | 9.306 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+       500   0.45822409    -1.9077531      0.13695201    -1.3988759      2.4509752    
+       550   0.46736204    -1.9141964      0.13695201    -1.3979022      2.1695662    
+       600   0.47872194    -1.9232781      0.13695201    -1.3977635      2.0058379    
+       650   0.47491575    -1.9224109      0.13695201    -1.3999857      2.0637789    
+       700   0.44714331    -1.8990682      0.13695201    -1.3991848      2.4863082    
+       750   0.49089274    -1.9231004      0.13695201    -1.3877071      2.123147     
+       800   0.4753839     -1.8959698      0.13695201    -1.3731645      2.3030481    
+       850   0.46870816    -1.8972225      0.13695201    -1.3798357      2.2464703    
+       900   0.49610454    -1.9070748      0.13695201    -1.3674513      2.2196388    
+       950   0.4773035     -1.8925765      0.13695201    -1.3682132      2.3534786    
+      1000   0.50413702    -1.9292393      0.13695201    -1.383096       2.1630988    
+Loop time of 0.0592806 on 1 procs for 500 steps with 1200 atoms
+
+Performance: 3643690.276 tau/day, 8434.468 timesteps/s, 10.121 Matom-step/s
+100.0% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.026866   | 0.026866   | 0.026866   |   0.0 | 45.32
+Bond    | 0.00071863 | 0.00071863 | 0.00071863 |   0.0 |  1.21
+Neigh   | 0.010927   | 0.010927   | 0.010927   |   0.0 | 18.43
+Comm    | 0.00084187 | 0.00084187 | 0.00084187 |   0.0 |  1.42
+Output  | 6.8106e-05 | 6.8106e-05 | 6.8106e-05 |   0.0 |  0.11
+Modify  | 0.019075   | 0.019075   | 0.019075   |   0.0 | 32.18
+Other   |            | 0.000783   |            |       |  1.32
+
+Nlocal:           1200 ave        1200 max        1200 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:            417 ave         417 max         417 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:           8654 ave        8654 max        8654 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 8654
+Ave neighs/atom = 7.2116667
+Ave special neighs/atom = 0.5
+Neighbor list builds = 39
+Dangerous builds = 0
+Total wall time: 0:00:00
--- a/examples/micelle/log.4Apr2024.micelle-rigid.g++.4
+++ b/examples/micelle/log.4Apr2024.micelle-rigid.g++.4
@ -0,0 +1,272 @@
+LAMMPS (7 Feb 2024 - Development - patch_7Feb2024_update1-345-g506bf886ee-modified)
+WARNING: Using I/O redirection is unreliable with parallel runs. Better to use the -in switch to read input files. (../lammps.cpp:551)
+# 2d micelle simulation
+
+dimension	2
+
+neighbor	0.3 bin
+neigh_modify	delay 5
+
+atom_style	bond
+
+# Soft potential push-off
+
+read_data	data.micelle
+Reading data file ...
+  orthogonal box = (0 0 -0.1) to (35.85686 35.85686 0.1)
+  2 by 2 by 1 MPI processor grid
+  reading atoms ...
+  1200 atoms
+  scanning bonds ...
+  1 = max bonds/atom
+  reading bonds ...
+  300 bonds
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     2 = max # of 1-2 neighbors
+     1 = max # of 1-3 neighbors
+     1 = max # of 1-4 neighbors
+     2 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.004 seconds
+special_bonds	fene
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        1        1       
+  special bond factors coul:  0        1        1       
+     2 = max # of 1-2 neighbors
+     2 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+
+pair_style	soft 1.12246
+pair_coeff	* * 0.0 1.12246
+
+bond_style 	harmonic
+bond_coeff	1 50.0 0.75
+
+velocity	all create 0.45 2349852
+
+variable	prefactor equal ramp(1.0,20.0)
+
+fix		1 all nve
+fix		2 all temp/rescale 100 0.45 0.45 0.02 1.0
+fix		3 all adapt 1 pair soft a * * v_prefactor
+fix		4 all enforce2d
+
+thermo		50
+run		500
+Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule
+WARNING: Communication cutoff 1.42246 is shorter than a bond length based estimate of 1.425. This may lead to errors. (../comm.cpp:730)
+Neighbor list info ...
+  update: every = 1 steps, delay = 5 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 1.42246
+  ghost atom cutoff = 1.42246
+  binsize = 0.71123, bins = 51 51 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair soft, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/2d
+      bin: standard
+WARNING: Communication cutoff 1.42246 is shorter than a bond length based estimate of 1.425. This may lead to errors. (../comm.cpp:730)
+Per MPI rank memory allocation (min/avg/max) = 4.126 | 4.126 | 4.127 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+         0   0.45           0.40003481     2.2200223e-06  0.84966203     0.78952518   
+        50   0.54981866     0.93548899     0.068440043    1.5532895      1.9232786    
+       100   0.45           0.99659327     0.079228519    1.5254468      3.2135679    
+       150   0.86965411     0.90456016     0.07493355     1.8484231      4.3821925    
+       200   0.45           1.01454        0.10663502     1.5708         4.7598476    
+       250   0.79636561     0.82567712     0.12105337     1.7424325      5.4983899    
+       300   0.45           0.86475538     0.11819875     1.4325791      5.8554758    
+       350   0.72135464     0.70693069     0.10912636     1.5368106      6.0388247    
+       400   0.45           0.75067331     0.14165013     1.3419484      6.3840708    
+       450   0.64839221     0.62402486     0.14173679     1.4136135      6.4791009    
+       500   0.45           0.66669513     0.13695201     1.2532721      6.807146     
+Loop time of 0.0138659 on 4 procs for 500 steps with 1200 atoms
+
+Performance: 15577811.312 tau/day, 36059.748 timesteps/s, 43.272 Matom-step/s
+99.9% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.0053896  | 0.0057144  | 0.0060899  |   0.4 | 41.21
+Bond    | 0.00020074 | 0.00021422 | 0.00022291 |   0.0 |  1.54
+Neigh   | 0.0025301  | 0.0025401  | 0.0025501  |   0.0 | 18.32
+Comm    | 0.0031194  | 0.0035074  | 0.0038196  |   0.4 | 25.30
+Output  | 6.4137e-05 | 6.7743e-05 | 7.7909e-05 |   0.0 |  0.49
+Modify  | 0.0013391  | 0.0013582  | 0.0013972  |   0.1 |  9.80
+Other   |            | 0.0004638  |            |       |  3.34
+
+Nlocal:            300 ave         304 max         292 min
+Histogram: 1 0 0 0 0 0 0 0 2 1
+Nghost:          103.5 ave         108 max          98 min
+Histogram: 1 0 0 1 0 0 0 0 0 2
+Neighs:          773.5 ave         792 max         735 min
+Histogram: 1 0 0 0 0 0 0 0 2 1
+
+Total # of neighbors = 3094
+Ave neighs/atom = 2.5783333
+Ave special neighs/atom = 0.5
+Neighbor list builds = 52
+Dangerous builds = 0
+
+unfix		3
+
+# Main run
+
+pair_style	lj/cut 2.5
+
+# solvent/head - full-size and long-range
+
+pair_coeff	1 1 1.0 1.0 2.5
+pair_coeff	2 2 1.0 1.0 2.5
+pair_coeff	1 2 1.0 1.0 2.5
+
+# tail/tail - size-averaged and long-range
+
+pair_coeff	3 3 1.0 0.75 2.5
+pair_coeff	4 4 1.0 0.50 2.5
+pair_coeff	3 4 1.0 0.67 2.5
+
+# solvent/tail - full-size and repulsive
+
+pair_coeff	1 3 1.0 1.0 1.12246
+pair_coeff	1 4 1.0 1.0 1.12246
+
+# head/tail - size-averaged and repulsive
+
+pair_coeff	2 3 1.0 0.88 1.12246
+pair_coeff	2 4 1.0 0.75 1.12246
+
+thermo		50
+
+#dump		1 all atom 2000 dump.micelle
+
+#dump		2 all image 2000 image.*.jpg type type zoom 1.6
+#dump_modify	2 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75
+
+#dump		3 all movie 2000 movie.mpg type type zoom 1.6
+#dump_modify	3 pad 5 adiam 1 0.5 adiam 2 1.5 adiam 3 1.0 adiam 4 0.75
+
+reset_timestep	0
+group solvent   molecule 0
+750 atoms in group solvent
+group solute    subtract all solvent
+450 atoms in group solute
+unfix 1
+unfix 2
+unfix 4
+fix		1 solvent nve
+fix		2 solvent temp/rescale 100 0.45 0.45 0.02 1.0
+fix		5 solute rigid molecule langevin 0.45 0.45 0.5 112211
+  150 rigid bodies with 450 atoms
+fix		4 all enforce2d
+run		500
+Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 5 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 2.8
+  ghost atom cutoff = 2.8
+  binsize = 1.4, bins = 26 26 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/2d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 5.375 | 5.375 | 5.375 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+         0   0.45318168    -1.3753652      0.13695201    -0.8705807      1.975423     
+        50   0.77344732    -1.6944083      0.13695201    -0.92967487     0.58657109   
+       100   0.53530681    -1.7006195      0.13695201    -1.1291768      0.11219772   
+       150   0.60820175    -1.8071581      0.13695201    -1.176549       1.5161796    
+       200   0.49410558    -1.7945459      0.13695201    -1.2565449      4.0469262    
+       250   0.52460847    -1.8528672      0.13695201    -1.290108       2.9929445    
+       300   0.46596803    -1.8680499      0.13695201    -1.3528872      2.7958851    
+       350   0.48831812    -1.8723486      0.13695201    -1.3390451     -4.5106818    
+       400   0.46798432    -1.9008529      0.13695201    -1.3840536     -4.3096566    
+       450   0.46000658    -1.9081144      0.13695201    -1.3977904      3.3360611    
+       500   0.45822409    -1.9077531      0.13695201    -1.3988759      0.45428738   
+Loop time of 0.0381773 on 4 procs for 500 steps with 1200 atoms
+
+Performance: 5657810.772 tau/day, 13096.784 timesteps/s, 15.716 Matom-step/s
+99.6% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.0059651  | 0.0062314  | 0.0066404  |   0.3 | 16.32
+Bond    | 0.00021057 | 0.00022477 | 0.0002333  |   0.0 |  0.59
+Neigh   | 0.0041424  | 0.0041487  | 0.0041512  |   0.0 | 10.87
+Comm    | 0.004264   | 0.0047244  | 0.0050297  |   0.4 | 12.37
+Output  | 8.2396e-05 | 8.6559e-05 | 9.6749e-05 |   0.0 |  0.23
+Modify  | 0.021833   | 0.021946   | 0.022094   |   0.1 | 57.48
+Other   |            | 0.0008157  |            |       |  2.14
+
+Nlocal:            300 ave         303 max         296 min
+Histogram: 1 0 0 0 1 0 0 0 1 1
+Nghost:         216.25 ave         219 max         214 min
+Histogram: 1 0 1 0 0 0 1 0 0 1
+Neighs:        2189.75 ave        2205 max        2173 min
+Histogram: 1 0 0 0 1 0 1 0 0 1
+
+Total # of neighbors = 8759
+Ave neighs/atom = 7.2991667
+Ave special neighs/atom = 0.5
+Neighbor list builds = 46
+Dangerous builds = 2
+unfix 2
+unfix 4
+unfix 5
+fix		5 solute rigid/small molecule
+  create bodies CPU = 0.000 seconds
+  150 rigid bodies with 450 atoms
+  1.3043524 = max distance from body owner to body atom
+fix		4 all enforce2d
+run		500
+Generated 0 of 6 mixed pair_coeff terms from geometric mixing rule
+Per MPI rank memory allocation (min/avg/max) = 9.233 | 9.233 | 9.234 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+       500   0.45822409    -1.9077531      0.13695201    -1.3988759      2.4509752    
+       550   0.46736204    -1.9141964      0.13695201    -1.3979022      2.1695662    
+       600   0.47872194    -1.9232781      0.13695201    -1.3977635      2.0058379    
+       650   0.47491575    -1.9224109      0.13695201    -1.3999857      2.0637789    
+       700   0.44714331    -1.8990682      0.13695201    -1.3991848      2.4863082    
+       750   0.49089274    -1.9231004      0.13695201    -1.3877071      2.123147     
+       800   0.4753839     -1.8959698      0.13695201    -1.3731645      2.3030481    
+       850   0.46870816    -1.8972225      0.13695201    -1.3798357      2.2464703    
+       900   0.49610454    -1.9070748      0.13695201    -1.3674513      2.2196388    
+       950   0.4773035     -1.8925765      0.13695201    -1.3682132      2.3534786    
+      1000   0.50413702    -1.9292393      0.13695201    -1.383096       2.1630987    
+Loop time of 0.0236819 on 4 procs for 500 steps with 1200 atoms
+
+Performance: 9120883.727 tau/day, 21113.157 timesteps/s, 25.336 Matom-step/s
+99.9% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.0058656  | 0.0059718  | 0.0061355  |   0.1 | 25.22
+Bond    | 0.0002083  | 0.00022447 | 0.00023485 |   0.0 |  0.95
+Neigh   | 0.0035477  | 0.0035644  | 0.0035824  |   0.0 | 15.05
+Comm    | 0.0041037  | 0.0042227  | 0.0043024  |   0.1 | 17.83
+Output  | 7.4355e-05 | 7.8273e-05 | 8.7777e-05 |   0.0 |  0.33
+Modify  | 0.008976   | 0.0090549  | 0.0091663  |   0.1 | 38.24
+Other   |            | 0.0005654  |            |       |  2.39
+
+Nlocal:            300 ave         306 max         295 min
+Histogram: 1 0 1 0 0 1 0 0 0 1
+Nghost:            221 ave         226 max         217 min
+Histogram: 1 0 0 1 1 0 0 0 0 1
+Neighs:         2163.5 ave        2271 max        2100 min
+Histogram: 1 1 0 1 0 0 0 0 0 1
+
+Total # of neighbors = 8654
+Ave neighs/atom = 7.2116667
+Ave special neighs/atom = 0.5
+Neighbor list builds = 39
+Dangerous builds = 0
+Total wall time: 0:00:00
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -33,6 +33,7 @@

 //#define ASYNC_DEVICE_COPY

+#if 0
 #if !defined(USE_OPENCL) && !defined(USE_HIP)
 // temporary workaround for int2 also defined in cufft
 #ifdef int2
@ -40,6 +41,7 @@
 #endif
 #include "cufft.h"
 #endif
+#endif

 namespace LAMMPS_AL {

@ -313,10 +315,11 @@ class BaseAmoeba {
  virtual int fphi_mpole();
  virtual int polar_real(const int eflag, const int vflag) = 0;

-
+#if 0
  #if !defined(USE_OPENCL) && !defined(USE_HIP)
  cufftHandle plan;
  #endif
+#endif
  bool fft_plan_created;
 };

--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@ -1,5 +1,105 @@
 # CHANGELOG

+## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00)
+
+### Features:
+* Add `Experimental::sort_by_key(exec, keys, values)` algorithm [\#6801](https://github.com/kokkos/kokkos/pull/6801)
+
+### Backend and Architecture Enhancements:
+
+#### CUDA:
+* Experimental multi-GPU support (from the same process) [\#6782](https://github.com/kokkos/kokkos/pull/6782)
+* Link against CUDA libraries even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE [\#6701](https://github.com/kokkos/kokkos/pull/6701)
+* Don't use the compiler launcher script if the CMake compile language is CUDA. [\#6704](https://github.com/kokkos/kokkos/pull/6704)
+* nvcc(wrapper): adding "long" and "short" versions for all flags [\#6615](https://github.com/kokkos/kokkos/pull/6615)
+
+#### HIP:
+ * Fix compilation when using amdclang (with ROCm >= 5.7) and RDC [\#6857](https://github.com/kokkos/kokkos/pull/6857)
+ * Use rocthrust for sorting, when available [\#6793](https://github.com/kokkos/kokkos/pull/6793)
+
+#### SYCL:
+* We only support OneAPI SYCL implementation: add check during initialization
+  * Error out on initialization if the backend is different from `ext_oneapi_*` [\#6784](https://github.com/kokkos/kokkos/pull/6784)
+  * Filter GPU devices for `ext_oneapi_*` GPU devices [\#6758](https://github.com/kokkos/kokkos/pull/6758)
+* Performance Improvements
+  * Avoid unnecessary zero-memset of the scratch flags in SYCL [\#6739](https://github.com/kokkos/kokkos/pull/6739)
+  * Use host-pinned memory to copy reduction/scan result [\#6500](https://github.com/kokkos/kokkos/pull/6500)
+* Address deprecations after oneAPI 2023.2.0 [\#6577](https://github.com/kokkos/kokkos/pull/6577)
+* Make sure to call find_dependency for oneDPL if necessary [\#6870](https://github.com/kokkos/kokkos/pull/6870)
+
+#### OpenMPTarget:
+* Use LLVM extensions for dynamic shared memory [\#6380](https://github.com/kokkos/kokkos/pull/6380)
+* Guard scratch memory usage in ParallelReduce [\#6585](https://github.com/kokkos/kokkos/pull/6585)
+* Update linker flags for Intel GPUs [\#6735](https://github.com/kokkos/kokkos/pull/6735)
+* Improve handling of printf on Intel GPUs [\#6652](https://github.com/kokkos/kokkos/pull/6652)
+
+#### OpenACC:
+* Add atomics support [\#6446](https://github.com/kokkos/kokkos/pull/6446)
+* Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772)
+
+#### Threads:
+* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601)
+
+#### OpenMP:
+* Improve performance of view initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573)
+
+### General Enhancements
+
+* Improve performance of random number generation when using a normal distribution on GPUs [\#6556](https://github.com/kokkos/kokkos/pull/6556)
+* Allocate temporary view with the user-provided execution space instance and do not initialize in `unique` algorithm [\#6598](https://github.com/kokkos/kokkos/pull/6598)
+* Add deduction guide for `Kokkos::Array` [\#6373](https://github.com/kokkos/kokkos/pull/6373)
+* Provide new public headers `<Kokkos_Clamp.hpp>` and `<Kokkos_MinMax.hpp>` [\#6687](https://github.com/kokkos/kokkos/pull/6687)
+* Fix/improvement to `remove_if` parallel algorithm: use the provided execution space instance for temporary allocations and drop unnecessary initialization + avoid evaluating the predicate twice during final pass [\#6747](https://github.com/kokkos/kokkos/pull/6747)
+* Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` [\#6713](https://github.com/kokkos/kokkos/pull/6713)
+* simd: support `vector_aligned_tag` [\#6243](https://github.com/kokkos/kokkos/pull/6243)
+* Avoid unnecessary allocation when default constructing Bitset [\#6524](https://github.com/kokkos/kokkos/pull/6524)
+* Fix constness for views in std algorithms [\#6813](https://github.com/kokkos/kokkos/pull/6813)
+* Improve error message on unsafe implicit conversion in MDRangePolicy [\#6855](https://github.com/kokkos/kokkos/pull/6855)
+* CTAD (deduction guides) for RangePolicy [\#6850](https://github.com/kokkos/kokkos/pull/6850)
+* CTAD (deduction guides) for MDRangePolicy [\#5516](https://github.com/kokkos/kokkos/pull/5516)
+
+### Build System Changes
+* Require `Kokkos_ENABLE_ATOMICS_BYPASS` option to bypass atomic operation for Serial backend only builds [\#6692](https://github.com/kokkos/kokkos/pull/6692)
+* Add support for RISCV and the Milk-V's Pioneer [\#6773](https://github.com/kokkos/kokkos/pull/6773)
+* Add C++26 standard to CMake setup [\#6733](https://github.com/kokkos/kokkos/pull/6733)
+* Fix Makefile when using gnu_generate_makefile.sh and make >= 4.3 [\#6606](https://github.com/kokkos/kokkos/pull/6606)
+* Cuda: Fix configuring with CMake >= 3.28.4 - temporary fallback to internal CudaToolkit.cmake [\#6898](https://github.com/kokkos/kokkos/pull/6898)
+
+### Incompatibilities (i.e. breaking changes)
+* Remove the `DEPRECATED_CODE_3` option and all code that was guarded by it [\#6523](https://github.com/kokkos/kokkos/pull/6523)
+* Drop guards to accommodate external code defining `KOKKOS_ASSERT` [\#6665](https://github.com/kokkos/kokkos/pull/6665)
+* `Profiling::ProfilingSection(std::string)` constructor marked explicit and nodiscard [\#6690](https://github.com/kokkos/kokkos/pull/6690)
+* Add bound check preconditions for `RangePolicy` and `MDRangePolicy` [\#6617](https://github.com/kokkos/kokkos/pull/6617) [\#6726](https://github.com/kokkos/kokkos/pull/6726)
+* Add checks for unsafe implicit conversions in RangePolicy [\#6754](https://github.com/kokkos/kokkos/pull/6754)
+* Remove Kokkos::[b]half_t volatile overloads [\#6579](https://github.com/kokkos/kokkos/pull/6579)
+* Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF [\#6593](https://github.com/kokkos/kokkos/pull/6593)
+* Check matching static extents in View constructor [\#5190](https://github.com/kokkos/kokkos/pull/5190)
+* Tools(profiling): fix typo Kokkos_Tools_Optim[i]zationGoal [\#6642](https://github.com/kokkos/kokkos/pull/6642)
+* Remove variadic range policy constructor (disallow passing multiple trailing chunk size arguments) [\#6845](https://github.com/kokkos/kokkos/pull/6845)
+* Improve message on view out of bounds access and always abort [\#6861](https://github.com/kokkos/kokkos/pull/6861)
+* Drop `KOKKOS_ENABLE_INTEL_MM_ALLOC` macro [\#6797](https://github.com/kokkos/kokkos/pull/6797)
+* Remove `Kokkos::Experimental::LogicalMemorySpace` (without going through deprecation) [\#6557](https://github.com/kokkos/kokkos/pull/6557)
+* Remove `Experimental::HBWSpace` and support for linking against memkind [\#6791](https://github.com/kokkos/kokkos/pull/6791)
+* Drop librt TPL and associated `KOKKOS_ENABLE_LIBRT` macro [\#6798](https://github.com/kokkos/kokkos/pull/6798)
+* Drop support for old CPU architectures (`ARCH_BGQ`, `ARCH_POWER7`, `ARCH_WSM` and associated `ARCH_SSE4` macro) [\#6806](https://github.com/kokkos/kokkos/pull/6806)
+* Drop support for deprecated command-line arguments and environment variables [\#6744](https://github.com/kokkos/kokkos/pull/6744)
+
+### Deprecations
+* Provide kokkos_swap as part of Core and deprecate Experimental::swap in Algorithms [\#6697](https://github.com/kokkos/kokkos/pull/6697)
+* Deprecate {Cuda,HIP}::detect_device_count() and Cuda::[detect_]device_arch() [\#6710](https://github.com/kokkos/kokkos/pull/6710)
+* Deprecate `ExecutionSpace::in_parallel()` [\#6582](https://github.com/kokkos/kokkos/pull/6582)
+
+### Bug Fixes
+* Fix team-level MDRange reductions: [\#6511](https://github.com/kokkos/kokkos/pull/6511)
+* Fix CUDA and SYCL small value type (16-bit) team reductions [\#5334](https://github.com/kokkos/kokkos/pull/5334)
+* Enable `{transform_}exclusive_scan` in place [\#6667](https://github.com/kokkos/kokkos/pull/6667)
+* `fill_random` overloads that do not take an execution space instance argument should fence [\#6658](https://github.com/kokkos/kokkos/pull/6658)
+* HIP,Cuda,OpenMPTarget: Fixup use provided execution space when copying host inaccessible reduction result [\#6777](https://github.com/kokkos/kokkos/pull/6777)
+* Fix typo in `cuda_func_set_attribute[s]_wrapper` preventing proper setting of desired occupancy [\#6786](https://github.com/kokkos/kokkos/pull/6786)
+* Avoid undefined behavior due to conversion between signed and unsigned integers in shift_{right, left}_team_impl [\#6821](https://github.com/kokkos/kokkos/pull/6821)
+* Fix a bug in Makefile.kokkos when using AMD GPU architectures as `AMD_GFXYYY` [\#6892](https://github.com/kokkos/kokkos/pull/6892)
+
 ## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01)

@ -999,95 +1099,95 @@
 - Major update for OpenMPTarget: many capabilities now work. For details contact us.
 - Added DPC++/SYCL backend: primary capabilites are working.
 - Added Kokkos Graph API analogous to CUDA Graphs.
- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/#3536)
- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/#3546)
- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/#3439)
- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/#3379)
+- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/3536)
+- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/3546)
+- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/3439)
+- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/3379)

 **Implemented enhancements Backends and Archs:**
- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/#3614)
- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/#3375)
- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/#3583)
- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/#3577)
- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/#3544)
- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/#3550)
- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/#3480)
- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/#3474)
- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/#3451)
- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/#3447)
- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/#3504)
- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/#3411)
- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/#3440)
- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/#3418)
- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/#3366)
+- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/3614)
+- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/3375)
+- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/3583)
+- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/3577)
+- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/3544)
+- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/3550)
+- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/3480)
+- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/3474)
+- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/3451)
+- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/3447)
+- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/3504)
+- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/3411)
+- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/3440)
+- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/3418)
+- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/3366)

 **Implemented enhancements Policies:**
- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/#3494)
- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/#3527)
- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/#3395)
- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/#3362)
- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/#3369)
- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/#3206)
- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/#3509)
+- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/3494)
+- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/3527)
+- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/3395)
+- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/3362)
+- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/3369)
+- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/3206)
+- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/3509)

 **Implemented enhancements BuildSystem:**
- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/#3488)
- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/#3548)
- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/#3136)
- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/#3434)
- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/#3402)
- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/#3457)
+- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/3488)
+- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/3548)
+- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/3136)
+- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/3434)
+- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/3402)
+- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/3457)

 **Implemented enhancements Tools:**
- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/#3455)
- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/#3530)
- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/#3518)
- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/#3459)
- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/#3326)
+- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/3455)
+- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/3530)
+- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/3518)
+- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/3459)
+- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/3326)

 **Implemented enhancements Other:**
- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/#3528)
- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/#3449)
- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/#3436)
- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/#3435)
- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/#3422)
- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/#3416)
- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/#3388)
- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/#3359)
- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/#3357)
- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/#3340)
- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/#3339)
- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/#3338)
- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/#3309)
- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/#3265)
- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/#2941)
+- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/3528)
+- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/3449)
+- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/3436)
+- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/3435)
+- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/3422)
+- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/3416)
+- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/3388)
+- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/3359)
+- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/3357)
+- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/3340)
+- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/3339)
+- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/3338)
+- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/3309)
+- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/3265)
+- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/2941)

 **Fixed bugs:**
- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/#3591)
- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/#3588)
- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/#3566)
- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/#3565)
- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/#3532)
- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/#3529)
- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/#3510)
- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/#3503)
- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/#3467)
- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/#3458)
- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/#3398)
- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/#3393)
- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/#3390)
- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/#3378)
- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/#3348)
- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/#3345)
- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/#3343)
- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/#3260)
+- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/3591)
+- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/3588)
+- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/3566)
+- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/3565)
+- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/3532)
+- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/3529)
+- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/3510)
+- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/3503)
+- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/3467)
+- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/3458)
+- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/3398)
+- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/3393)
+- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/3390)
+- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/3378)
+- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/3348)
+- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/3345)
+- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/3343)
+- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/3260)

 **Incompatibilities:**
- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/#3535)
- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/#3534)
- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/#3301)
- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/#3264)
- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/#3148)
+- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/3535)
+- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/3534)
+- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/3301)
+- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/3264)
+- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/3148)

 ## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01)
--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@ -150,8 +150,8 @@ ENDIF()


 set(Kokkos_VERSION_MAJOR 4)
-set(Kokkos_VERSION_MINOR 2)
-set(Kokkos_VERSION_PATCH 1)
+set(Kokkos_VERSION_MINOR 3)
+set(Kokkos_VERSION_PATCH 0)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 message(STATUS "Kokkos version: ${Kokkos_VERSION}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@ -11,8 +11,8 @@ CXXFLAGS += $(SHFLAGS)
 endif

 KOKKOS_VERSION_MAJOR = 4
-KOKKOS_VERSION_MINOR = 2
-KOKKOS_VERSION_PATCH = 1
+KOKKOS_VERSION_MINOR = 3
+KOKKOS_VERSION_PATCH = 0
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)

 # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
@ -22,14 +22,14 @@ KOKKOS_DEVICES ?= "OpenMP"
 # Intel:    KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
 # NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
-# IBM:      BGQ,Power7,Power8,Power9
-# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100
+# IBM:      Power8,Power9
+# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100
 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC
 KOKKOS_ARCH ?= ""
 # Options: yes,no
 KOKKOS_DEBUG ?= "no"
-# Options: hwloc,librt,experimental_memkind
+# Options: hwloc
 KOKKOS_USE_TPLS ?= ""
 # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b
 KOKKOS_CXX_STANDARD ?= "c++17"
@ -56,7 +56,7 @@ uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$(
 uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT)
 # Return a 1 if a string contains a substring and 0 if not
 # Note the search string should be without '"'
-# Example: $(call kokkos_has_string,"hwloc,librt",hwloc)
+# Example: $(call kokkos_has_string,"hwloc,libdl",hwloc)
 #   Will return a 1
 kokkos_has_string=$(if $(findstring $(call uppercase,$2),$(call uppercase,$1)),1,0)
 # Returns 1 if the path exists, 0 otherwise
@ -73,11 +73,11 @@ KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),
 KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a)
 KOKKOS_INTERNAL_ENABLE_CXX23 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++23)
 KOKKOS_INTERNAL_ENABLE_CXX2B := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2b)
+KOKKOS_INTERNAL_ENABLE_CXX26 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++26)
+KOKKOS_INTERNAL_ENABLE_CXX2C := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2c)

 # Check for external libraries.
 KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc)
-KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt)
-KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind)

 # Check for advanced settings.
 KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings)
@ -318,7 +318,6 @@ endif

 # Intel based.
 KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC)
-KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM)
 KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB)
 KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW)
 KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW)
@ -398,11 +397,9 @@ KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX)
 KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc))

 # IBM based.
-KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ)
-KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7)
 KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8)
 KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9)
-KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc))
+KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc))

 # AMD based.
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
@ -413,22 +410,37 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0)
    KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
  endif
 endif
-KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906))
-KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908))
-KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A))
+
+KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 0)
+  KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906)
+endif
+KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 0)
+  KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908)
+endif
+KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0)
+  KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A)
+endif
 KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940)
 KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942)
-KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030))
-KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100))
+KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0)
+  KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030)
+endif
+KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0)
+  KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100)
+endif

 # Any AVX?
-KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
 KOKKOS_INTERNAL_USE_ARCH_AVX        := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
 KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))

 # Incompatible flags?
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc)
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc)
 KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc)

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@ -573,6 +585,16 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2B), 1)
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2B_FLAG)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX23")
 endif
+ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX26), 1)
+  #I cannot make CMake add this in a good way - so add it here
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX26_FLAG)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26")
+endif
+ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2C), 1)
+  #I cannot make CMake add this in a good way - so add it here
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2C_FLAG)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26")
+endif

 ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
@ -612,27 +634,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC")
 endif

-ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT")
-  KOKKOS_LIBS += -lrt
-  KOKKOS_TPL_LIBRARY_NAMES += rt
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
-  ifneq ($(KOKKOS_CMAKE), yes)
-    ifneq ($(MEMKIND_PATH),)
-      KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
-      KOKKOS_LIBDIRS += -L$(MEMKIND_PATH)/lib
-      KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib
-      KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include
-      KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib
-    endif
-    KOKKOS_LIBS += -lmemkind -lnuma
-    KOKKOS_TPL_LIBRARY_NAMES += memkind numa
-  endif
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE")
-endif
-
 ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS")
 endif
@ -699,10 +700,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    endif
  endif

-  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND")
-  endif
-
  ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC")
  else
@ -827,20 +824,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1)
  endif
 endif

-ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42")
-
-  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
-    KOKKOS_CXXFLAGS += -xSSE4.2
-    KOKKOS_LDFLAGS  += -xSSE4.2
-  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-  else
-    # Assume that this is a really a GNU compiler.
-    KOKKOS_CXXFLAGS += -msse4.2
-    KOKKOS_LDFLAGS  += -msse4.2
-  endif
-endif
-
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX")

@ -1249,7 +1232,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
  tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
  tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
  tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
-  tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")
  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
@ -1289,10 +1271,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
-    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
-    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-  endif
 endif

 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
@ -1403,11 +1381,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  KOKKOS_TPL_LIBRARY_NAMES += hpx
 endif

-# Don't include Kokkos_HBWSpace.cpp if not using MEMKIND to avoid a link warning.
-ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
-  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC))
-endif
-
 # With Cygwin functions such as fdopen and fileno are not defined
 # when strict ansi is enabled. strict ansi gets enabled with -std=c++14
 # though. So we hard undefine it here. Not sure if that has any bad side effects
@ -1461,6 +1434,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 else
  tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENMP */")
 endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1)
+  tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC")
+else
+  tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */")
+endif
 tmp := $(call desul_append_header, "")
 tmp := $(call desul_append_header, "$H""endif")

@ -1493,7 +1472,7 @@ include $(KOKKOS_PATH)/Makefile.targets
 kokkos-clean:
 	rm -f $(KOKKOS_OBJ_LINK) $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \
 	KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \
-        KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp
+        KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_SetupBackend.tmp

 libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS)
 	ar cr libkokkos.a $(KOKKOS_OBJ_LINK)
--- a/lib/kokkos/Makefile.targets
+++ b/lib/kokkos/Makefile.targets
@ -20,8 +20,6 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
 Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
-Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
 Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp
 Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp
@ -30,8 +28,6 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
 Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
-Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp
 Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp 
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
 Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp
@ -82,8 +78,10 @@ Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
-Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
+Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp
+Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@ -123,6 +121,3 @@ Kokkos_OpenACC_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC
 Kokkos_OpenACC_SharedAllocationRecord.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp
 endif
-
-Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
--- a/lib/kokkos/README.md
+++ b/lib/kokkos/README.md
@ -28,7 +28,7 @@ To start learning about Kokkos:

 - [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability.

-For questions find us on Slack: https://kokkosteam.slack.com or open a github issue.
+For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue.

 For non-public questions send an email to: *crtrott(at)sandia.gov*

@ -48,10 +48,10 @@ Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citati

 # License

-[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
+[![License](https://img.shields.io/badge/License-Apache--2.0_WITH_LLVM--exception-blue)](https://spdx.org/licenses/LLVM-exception.html)

 Under the terms of Contract DE-NA0003525 with NTESS,
 the U.S. Government retains certain rights in this software.

-The full license statement used in all headers is available [here](https://kokkos.github.io/kokkos-core-wiki/license.html) or
-[here](https://github.com/kokkos/kokkos/blob/master/LICENSE).
+The full license statement used in all headers is available [here](https://kokkos.org/kokkos-core-wiki/license.html) or
+[here](https://github.com/kokkos/kokkos/blob/develop/LICENSE).
--- a/lib/kokkos/SECURITY.md
+++ b/lib/kokkos/SECURITY.md
@ -0,0 +1,12 @@
+# Reporting Security Issues
+
+To report a security issue, please email
+[lebrungrandt@ornl.gov](mailto:lebrungrandt@ornl.gov)
+and [crtrott@sandia.gov](mailto:crtrott@sandia.gov)
+with a description of the issue, the steps you took to create the issue,
+affected versions, and, if known, mitigations for the issue.
+
+Our vulnerability management team will respond within 5 working days of your
+email. If the issue is confirmed as a vulnerability, we will open a
+Security Advisory and acknowledge your contributions as part of it. This project
+follows a 90-day disclosure timeline.
--- a/lib/kokkos/Spack.md
+++ b/lib/kokkos/Spack.md
@ -159,7 +159,6 @@ If you don't specify a CUDA build variant in a `packages.yaml` and you build you
 > spack install superscience
 ````
 you may end up just getting the default Kokkos (i.e. Serial).
-Some examples are included in the `config/yaml` folder for common platforms.
 Before running `spack install <package>` we recommend running `spack spec <package>` to confirm your dependency tree is correct.
 For example, with Kokkos Kernels:
 ````bash
--- a/lib/kokkos/algorithms/src/CMakeLists.txt
+++ b/lib/kokkos/algorithms/src/CMakeLists.txt
@ -30,5 +30,5 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms
  ${CMAKE_CURRENT_SOURCE_DIR}
 )

-
-
+KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST)
+KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL)
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@ -849,18 +849,17 @@ class Random_XorShift64 {
    return drand(end - start) + start;
  }

-  // Marsaglia polar method for drawing a standard normal distributed random
+  // Box-Muller method for drawing a standard normal distributed random
  // number
  KOKKOS_INLINE_FUNCTION
  double normal() {
-    double S = 2.0;
-    double U;
-    while (S >= 1.0) {
-      U              = 2.0 * drand() - 1.0;
-      const double V = 2.0 * drand() - 1.0;
-      S              = U * U + V * V;
-    }
-    return U * std::sqrt(-2.0 * std::log(S) / S);
+    constexpr auto two_pi = 2 * Kokkos::numbers::pi_v<double>;
+
+    const double u     = drand();
+    const double v     = drand();
+    const double r     = Kokkos::sqrt(-2.0 * Kokkos::log(u));
+    const double theta = v * two_pi;
+    return r * Kokkos::cos(theta);
  }

  KOKKOS_INLINE_FUNCTION
@ -1094,18 +1093,17 @@ class Random_XorShift1024 {
    return drand(end - start) + start;
  }

-  // Marsaglia polar method for drawing a standard normal distributed random
+  // Box-Muller method for drawing a standard normal distributed random
  // number
  KOKKOS_INLINE_FUNCTION
  double normal() {
-    double S = 2.0;
-    double U;
-    while (S >= 1.0) {
-      U              = 2.0 * drand() - 1.0;
-      const double V = 2.0 * drand() - 1.0;
-      S              = U * U + V * V;
-    }
-    return U * std::sqrt(-2.0 * std::log(S) / S);
+    constexpr auto two_pi = 2 * Kokkos::numbers::pi_v<double>;
+
+    const double u     = drand();
+    const double v     = drand();
+    const double r     = Kokkos::sqrt(-2.0 * Kokkos::log(u));
+    const double theta = v * two_pi;
+    return r * Kokkos::cos(theta);
  }

  KOKKOS_INLINE_FUNCTION
@ -1545,13 +1543,23 @@ template <class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g,
                 typename ViewType::const_value_type begin,
                 typename ViewType::const_value_type end) {
-  fill_random(typename ViewType::execution_space{}, a, g, begin, end);
+  Kokkos::fence(
+      "fill_random: fence before since no execution space instance provided");
+  typename ViewType::execution_space exec;
+  fill_random(exec, a, g, begin, end);
+  exec.fence(
+      "fill_random: fence after since no execution space instance provided");
 }

 template <class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g,
                 typename ViewType::const_value_type range) {
-  fill_random(typename ViewType::execution_space{}, a, g, 0, range);
+  Kokkos::fence(
+      "fill_random: fence before since no execution space instance provided");
+  typename ViewType::execution_space exec;
+  fill_random(exec, a, g, 0, range);
+  exec.fence(
+      "fill_random: fence after since no execution space instance provided");
 }

 }  // namespace Kokkos
--- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
@ -23,6 +23,7 @@

 #include "sorting/Kokkos_BinSortPublicAPI.hpp"
 #include "sorting/Kokkos_SortPublicAPI.hpp"
+#include "sorting/Kokkos_SortByKeyPublicAPI.hpp"
 #include "sorting/Kokkos_NestedSortPublicAPI.hpp"

 #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT
--- a/lib/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp
@ -35,7 +35,6 @@
 // following the std classification.

 // modifying ops
-#include "std_algorithms/Kokkos_Swap.hpp"
 #include "std_algorithms/Kokkos_IterSwap.hpp"

 // non-modifying sequence
--- a/lib/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp
+++ b/lib/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp
@ -0,0 +1,117 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_
+#define KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_
+
+#include "./impl/Kokkos_SortByKeyImpl.hpp"
+#include <Kokkos_Core.hpp>
+#include <algorithm>
+
+namespace Kokkos::Experimental {
+
+// ---------------------------------------------------------------
+// basic overloads
+// ---------------------------------------------------------------
+
+template <class ExecutionSpace, class KeysDataType, class... KeysProperties,
+          class ValuesDataType, class... ValuesProperties>
+void sort_by_key(
+    const ExecutionSpace& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values) {
+  // compile-time constraints on the views and their memory spaces
+  using KeysType   = Kokkos::View<KeysDataType, KeysProperties...>;
+  using ValuesType = Kokkos::View<ValuesDataType, ValuesProperties...>;
+  ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys);
+  ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values);
+
+  static_assert(SpaceAccessibility<ExecutionSpace,
+                                   typename KeysType::memory_space>::accessible,
+                "Kokkos::sort_by_key: execution space instance is not able to "
+                "access the memory space of the keys View argument!");
+  static_assert(
+      SpaceAccessibility<ExecutionSpace,
+                         typename ValuesType::memory_space>::accessible,
+      "Kokkos::sort_by_key: execution space instance is not able to access "
+      "the memory space of the values View argument!");
+
+  static_assert(KeysType::static_extent(0) == 0 ||
+                ValuesType::static_extent(0) == 0 ||
+                KeysType::static_extent(0) == ValuesType::static_extent(0));
+  if (values.size() != keys.size())
+    Kokkos::abort((std::string("values and keys extents must be the same. The "
+                               "values extent is ") +
+                   std::to_string(values.size()) + ", and the keys extent is " +
+                   std::to_string(keys.size()) + ".")
+                      .c_str());
+
+  if (keys.extent(0) <= 1) {
+    return;
+  }
+
+  ::Kokkos::Impl::sort_by_key_device_view_without_comparator(exec, keys,
+                                                             values);
+}
+
+// ---------------------------------------------------------------
+// overloads supporting a custom comparator
+// ---------------------------------------------------------------
+
+template <class ExecutionSpace, class ComparatorType, class KeysDataType,
+          class... KeysProperties, class ValuesDataType,
+          class... ValuesProperties>
+void sort_by_key(
+    const ExecutionSpace& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values,
+    const ComparatorType& comparator) {
+  // compile-time constraints on the views and their memory spaces
+  using KeysType   = Kokkos::View<KeysDataType, KeysProperties...>;
+  using ValuesType = Kokkos::View<ValuesDataType, ValuesProperties...>;
+  ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys);
+  ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values);
+
+  static_assert(SpaceAccessibility<ExecutionSpace,
+                                   typename KeysType::memory_space>::accessible,
+                "Kokkos::sort_by_key: execution space instance is not able to "
+                "access the memory space of the keys View argument!");
+  static_assert(
+      SpaceAccessibility<ExecutionSpace,
+                         typename ValuesType::memory_space>::accessible,
+      "Kokkos::sort_by_key: execution space instance is not able to access "
+      "the memory space of the values View argument!");
+
+  static_assert(KeysType::static_extent(0) == 0 ||
+                ValuesType::static_extent(0) == 0 ||
+                KeysType::static_extent(0) == ValuesType::static_extent(0));
+  if (values.size() != keys.size())
+    Kokkos::abort((std::string("values and keys extents must be the same. The "
+                               "values extent is ") +
+                   std::to_string(values.size()) + ", and the keys extent is " +
+                   std::to_string(keys.size()) + ".")
+                      .c_str());
+
+  if (keys.extent(0) <= 1) {
+    return;
+  }
+
+  ::Kokkos::Impl::sort_by_key_device_view_with_comparator(exec, keys, values,
+                                                          comparator);
+}
+
+}  // namespace Kokkos::Experimental
+#endif
--- a/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp
+++ b/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp
@ -29,7 +29,7 @@ namespace Kokkos {
 // ---------------------------------------------------------------

 template <class ExecutionSpace, class DataType, class... Properties>
-void sort([[maybe_unused]] const ExecutionSpace& exec,
+void sort(const ExecutionSpace& exec,
          const Kokkos::View<DataType, Properties...>& view) {
  // constraints
  using ViewType = Kokkos::View<DataType, Properties...>;
@ -52,6 +52,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec,
  }

  if constexpr (Impl::better_off_calling_std_sort_v<ExecutionSpace>) {
+    exec.fence("Kokkos::sort without comparator use std::sort");
    auto first = ::Kokkos::Experimental::begin(view);
    auto last  = ::Kokkos::Experimental::end(view);
    std::sort(first, last);
@ -82,7 +83,7 @@ void sort(const Kokkos::View<DataType, Properties...>& view) {
 // ---------------------------------------------------------------
 template <class ExecutionSpace, class ComparatorType, class DataType,
          class... Properties>
-void sort([[maybe_unused]] const ExecutionSpace& exec,
+void sort(const ExecutionSpace& exec,
          const Kokkos::View<DataType, Properties...>& view,
          const ComparatorType& comparator) {
  // constraints
@ -105,6 +106,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec,
  }

  if constexpr (Impl::better_off_calling_std_sort_v<ExecutionSpace>) {
+    exec.fence("Kokkos::sort with comparator use std::sort");
    auto first = ::Kokkos::Experimental::begin(view);
    auto last  = ::Kokkos::Experimental::end(view);
    std::sort(first, last, comparator);
--- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp
+++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp
@ -18,7 +18,6 @@
 #define KOKKOS_NESTED_SORT_IMPL_HPP_

 #include <Kokkos_Core.hpp>
-#include <std_algorithms/Kokkos_Swap.hpp>

 namespace Kokkos {
 namespace Experimental {
@ -99,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void sort_nested_impl(
            keyView(elem1) = key2;
            keyView(elem2) = key1;
            if constexpr (!std::is_same_v<ValueViewType, std::nullptr_t>) {
-              Kokkos::Experimental::swap(valueView(elem1), valueView(elem2));
+              Kokkos::kokkos_swap(valueView(elem1), valueView(elem2));
            }
          }
        }
--- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp
+++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp
@ -0,0 +1,401 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_
+#define KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_
+
+#include <Kokkos_Core.hpp>
+
+#if defined(KOKKOS_ENABLE_CUDA)
+
+// Workaround for `Instruction 'shfl' without '.sync' is not supported on
+// .target sm_70 and higher from PTX ISA version 6.4`.
+// Also see https://github.com/NVIDIA/cub/pull/170.
+#if !defined(CUB_USE_COOPERATIVE_GROUPS)
+#define CUB_USE_COOPERATIVE_GROUPS
+#endif
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wshadow"
+
+#if defined(KOKKOS_COMPILER_CLANG)
+// Some versions of Clang fail to compile Thrust, failing with errors like
+// this:
+//    <snip>/thrust/system/cuda/detail/core/agent_launcher.h:557:11:
+//    error: use of undeclared identifier 'va_printf'
+// The exact combination of versions for Clang and Thrust (or CUDA) for this
+// failure was not investigated, however even very recent version combination
+// (Clang 10.0.0 and Cuda 10.0) demonstrated failure.
+//
+// Defining _CubLog here locally allows us to avoid that code path, however
+// disabling some debugging diagnostics
+#pragma push_macro("_CubLog")
+#ifdef _CubLog
+#undef _CubLog
+#endif
+#define _CubLog
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#pragma pop_macro("_CubLog")
+#else
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#endif
+
+#pragma GCC diagnostic pop
+
+#endif
+
+#if defined(KOKKOS_ENABLE_ROCTHRUST)
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#endif
+
+#if defined(KOKKOS_ENABLE_ONEDPL) && \
+    (ONEDPL_VERSION_MAJOR > 2022 ||  \
+     (ONEDPL_VERSION_MAJOR == 2022 && ONEDPL_VERSION_MINOR >= 2))
+#define KOKKOS_ONEDPL_HAS_SORT_BY_KEY
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/algorithm>
+#endif
+
+namespace Kokkos::Impl {
+
+// True only for rank-1 Kokkos::View types whose array layout is LayoutLeft,
+// LayoutRight or LayoutStride.
+template <typename T>
+constexpr inline bool is_admissible_to_kokkos_sort_by_key =
+    ::Kokkos::is_view<T>::value && T::rank() == 1 &&
+    (std::is_same_v<typename T::traits::array_layout, Kokkos::LayoutLeft> ||
+     std::is_same_v<typename T::traits::array_layout, Kokkos::LayoutRight> ||
+     std::is_same_v<typename T::traits::array_layout, Kokkos::LayoutStride>);
+
+// Compile-time guard: instantiating this for a ViewType that does not satisfy
+// is_admissible_to_kokkos_sort_by_key stops compilation with the message
+// below. The argument is unused; it only serves to deduce ViewType.
+template <class ViewType>
+KOKKOS_INLINE_FUNCTION constexpr void
+static_assert_is_admissible_to_kokkos_sort_by_key(const ViewType&) {
+  static_assert(is_admissible_to_kokkos_sort_by_key<ViewType>,
+                "Kokkos::sort_by_key only accepts 1D values View with "
+                "LayoutRight, LayoutLeft or LayoutStride.");
+}
+
+// The fallback implementation of sort_by_key is built on Kokkos::sort, so we
+// need to consider if Kokkos::sort defers to its own fallback that copies
+// the array to the host and uses std::sort, see
+// copy_to_host_run_stdsort_copy_back() in impl/Kokkos_SortImpl.hpp. If
+// sort_on_device_v is true, we assume that Kokkos::sort doesn't copy data.
+// Otherwise, we manually copy the keys and the permutation to the host and
+// provide Kokkos::sort with a host execution space. The default is false.
+template <class ExecutionSpace, class Layout>
+inline constexpr bool sort_on_device_v = false;
+
+#if defined(KOKKOS_ENABLE_CUDA)
+// For Kokkos::Cuda, sort_on_device_v is true for every Layout.
+template <class Layout>
+inline constexpr bool sort_on_device_v<Kokkos::Cuda, Layout> = true;
+
+// Sorts keys and reorders values alongside them through thrust::sort_by_key,
+// using an execution policy bound to the CUDA stream of exec. An optional
+// comparator is forwarded unchanged.
+template <class KeysDataType, class... KeysProperties, class ValuesDataType,
+          class... ValuesProperties, class... MaybeComparator>
+void sort_by_key_cudathrust(
+    const Kokkos::Cuda& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values,
+    MaybeComparator&&... maybeComparator) {
+  namespace KE = ::Kokkos::Experimental;
+
+  const auto stream_policy = thrust::cuda::par.on(exec.cuda_stream());
+  auto keys_begin          = KE::begin(keys);
+  auto keys_end            = KE::end(keys);
+  auto values_begin        = KE::begin(values);
+  thrust::sort_by_key(stream_policy, keys_begin, keys_end, values_begin,
+                      std::forward<MaybeComparator>(maybeComparator)...);
+}
+#endif
+
+#if defined(KOKKOS_ENABLE_ROCTHRUST)
+// For Kokkos::HIP, sort_on_device_v is true for every Layout.
+template <class Layout>
+inline constexpr bool sort_on_device_v<Kokkos::HIP, Layout> = true;
+
+// Sorts keys and reorders values alongside them through thrust::sort_by_key,
+// using an execution policy bound to the HIP stream of exec. An optional
+// comparator is forwarded unchanged.
+template <class KeysDataType, class... KeysProperties, class ValuesDataType,
+          class... ValuesProperties, class... MaybeComparator>
+void sort_by_key_rocthrust(
+    const Kokkos::HIP& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values,
+    MaybeComparator&&... maybeComparator) {
+  namespace KE = ::Kokkos::Experimental;
+
+  const auto stream_policy = thrust::hip::par.on(exec.hip_stream());
+  auto keys_begin          = KE::begin(keys);
+  auto keys_end            = KE::end(keys);
+  auto values_begin        = KE::begin(values);
+  thrust::sort_by_key(stream_policy, keys_begin, keys_end, values_begin,
+                      std::forward<MaybeComparator>(maybeComparator)...);
+}
+#endif
+
+#if defined(KOKKOS_ENABLE_ONEDPL)
+// For SYCL, sort_on_device_v is true only when Layout is LayoutLeft or
+// LayoutRight; for any other Layout argument the trait is false.
+template <class Layout>
+inline constexpr bool sort_on_device_v<Kokkos::Experimental::SYCL, Layout> =
+    std::disjunction_v<std::is_same<Layout, Kokkos::LayoutLeft>,
+                       std::is_same<Layout, Kokkos::LayoutRight>>;
+
+#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY
+// Sorts keys (and permutes values identically) with oneapi::dpl::sort_by_key,
+// using a device policy built from the SYCL queue of exec. An optional
+// comparator is forwarded unchanged.
+// Raw data pointers of both Views are handed to oneDPL, so the call aborts
+// unless keys AND values both have stride(0) == 1.
+template <class KeysDataType, class... KeysProperties, class ValuesDataType,
+          class... ValuesProperties, class... MaybeComparator>
+void sort_by_key_onedpl(
+    const Kokkos::Experimental::SYCL& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values,
+    MaybeComparator&&... maybeComparator) {
+  // A single strided View is already enough to make the contiguous pointer
+  // ranges below wrong, hence "||" rather than "&&".
+  if (keys.stride(0) != 1 || values.stride(0) != 1) {
+    Kokkos::abort(
+        "SYCL sort_by_key only supports rank-1 Views with stride(0) = 1.");
+  }
+
+  // Can't use Experimental::begin/end here since the oneDPL then assumes that
+  // the data is on the host.
+  auto queue  = exec.sycl_queue();
+  auto policy = oneapi::dpl::execution::make_device_policy(queue);
+  // Keep the extent in the type returned by extent(); narrowing it to int
+  // would truncate the range for very large Views.
+  const auto n = keys.extent(0);
+  oneapi::dpl::sort_by_key(policy, keys.data(), keys.data() + n, values.data(),
+                           std::forward<MaybeComparator>(maybeComparator)...);
+}
+#endif
+#endif
+
+// Reorders view in place so that, afterwards, view(i) holds the value that
+// was stored at view(permutation(i)) before the call. The old values are read
+// from a scratch copy created with create_mirror in
+// ExecutionSpace::memory_space. The copy and the kernel are submitted to
+// space; this function does not call fence itself.
+template <typename ExecutionSpace, typename PermutationView, typename ViewType>
+void applyPermutation(const ExecutionSpace& space,
+                      const PermutationView& permutation,
+                      const ViewType& view) {
+  using index_type = typename PermutationView::value_type;
+  static_assert(std::is_integral<index_type>::value);
+
+  using memory_space = typename ExecutionSpace::memory_space;
+  const auto scratch_alloc =
+      Kokkos::view_alloc(space, memory_space{}, Kokkos::WithoutInitializing);
+  auto old_values = Kokkos::create_mirror(scratch_alloc, view);
+  Kokkos::deep_copy(space, old_values, view);
+
+  const auto kernel_label =
+      "Kokkos::sort_by_key_via_sort::permute_" + view.label();
+  const Kokkos::RangePolicy<ExecutionSpace> range(space, 0, view.extent(0));
+  Kokkos::parallel_for(
+      kernel_label, range,
+      KOKKOS_LAMBDA(int idx) { view(idx) = old_values(permutation(idx)); });
+}
+
+template <class ExecutionSpace, class KeysDataType, class... KeysProperties,
+          class ValuesDataType, class... ValuesProperties,
+          class... MaybeComparator>
+void sort_by_key_via_sort(  // generic path: sort an index permutation by key, then apply it
+    const ExecutionSpace& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values,
+    MaybeComparator&&... maybeComparator) {
+  static_assert(sizeof...(MaybeComparator) <= 1);  // zero or one comparator
+
+  auto const n = keys.size();
+
+  Kokkos::View<unsigned int*, ExecutionSpace> permute(
+      Kokkos::view_alloc(exec, Kokkos::WithoutInitializing,
+                         "Kokkos::sort_by_key_via_sort::permute"),
+      n);
+
+  // iota: permute(i) = i, i.e. start from the identity permutation
+  Kokkos::parallel_for(
+      "Kokkos::sort_by_key_via_sort::iota",
+      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
+      KOKKOS_LAMBDA(int i) { permute(i) = i; });  // NOTE(review): int index vs. unsigned int element; presumably n <= INT_MAX is assumed
+
+  using Layout =  // layout of the permutation View, not of keys
+      typename Kokkos::View<unsigned int*, ExecutionSpace>::array_layout;
+  if constexpr (!sort_on_device_v<ExecutionSpace, Layout>) {  // host path
+    auto host_keys = Kokkos::create_mirror_view(
+        Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing),
+        keys);
+    auto host_permute = Kokkos::create_mirror_view(
+        Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing),
+        permute);
+    Kokkos::deep_copy(exec, host_keys, keys);
+    Kokkos::deep_copy(exec, host_permute, permute);
+
+    exec.fence("Kokkos::Impl::sort_by_key_via_sort: before host sort");  // copies above must finish before the host reads them
+    Kokkos::DefaultHostExecutionSpace host_exec;
+
+    if constexpr (sizeof...(MaybeComparator) == 0) {  // no comparator: order indices by operator< on their keys
+      Kokkos::sort(
+          host_exec, host_permute,
+          KOKKOS_LAMBDA(int i, int j) { return host_keys(i) < host_keys(j); });
+    } else {
+      auto keys_comparator =  // copy of the single comparator in the pack
+          std::get<0>(std::tuple<MaybeComparator...>(maybeComparator...));
+      Kokkos::sort(
+          host_exec, host_permute, KOKKOS_LAMBDA(int i, int j) {
+            return keys_comparator(host_keys(i), host_keys(j));
+          });
+    }
+    host_exec.fence("Kokkos::Impl::sort_by_key_via_sort: after host sort");
+    Kokkos::deep_copy(exec, permute, host_permute);  // bring the sorted indices back to permute
+  } else {  // device path: sort the indices directly on exec
+#ifdef KOKKOS_ENABLE_SYCL  // SYCL builds: comparators read keys through a raw pointer and stride
+    auto* raw_keys_in_comparator = keys.data();
+    auto stride                  = keys.stride(0);
+    if constexpr (sizeof...(MaybeComparator) == 0) {
+      Kokkos::sort(
+          exec, permute, KOKKOS_LAMBDA(int i, int j) {
+            return raw_keys_in_comparator[i * stride] <
+                   raw_keys_in_comparator[j * stride];
+          });
+    } else {
+      auto keys_comparator =
+          std::get<0>(std::tuple<MaybeComparator...>(maybeComparator...));
+      Kokkos::sort(
+          exec, permute, KOKKOS_LAMBDA(int i, int j) {
+            return keys_comparator(raw_keys_in_comparator[i * stride],
+                                   raw_keys_in_comparator[j * stride]);
+          });
+    }
+#else  // other builds: comparators index the keys View directly
+    if constexpr (sizeof...(MaybeComparator) == 0) {
+      Kokkos::sort(
+          exec, permute,
+          KOKKOS_LAMBDA(int i, int j) { return keys(i) < keys(j); });
+    } else {
+      auto keys_comparator =
+          std::get<0>(std::tuple<MaybeComparator...>(maybeComparator...));
+      Kokkos::sort(
+          exec, permute, KOKKOS_LAMBDA(int i, int j) {
+            return keys_comparator(keys(i), keys(j));
+          });
+    }
+#endif
+  }
+
+  applyPermutation(exec, permute, keys);  // keys and values are reordered with the same permutation
+  applyPermutation(exec, permute, values);
+}
+
+// ------------------------------------------------------
+//
+// overloads for sorting by key without comparator
+//
+// ------------------------------------------------------
+
+#if defined(KOKKOS_ENABLE_CUDA)
+template <class KeysDataType, class... KeysProperties, class ValuesDataType,
+          class... ValuesProperties>
+void sort_by_key_device_view_without_comparator(
+    const Kokkos::Cuda& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values) {
+  sort_by_key_cudathrust(exec, keys, values);  // Cuda: always through Thrust
+}
+#endif
+
+#if defined(KOKKOS_ENABLE_ROCTHRUST)
+template <class KeysDataType, class... KeysProperties, class ValuesDataType,
+          class... ValuesProperties>
+void sort_by_key_device_view_without_comparator(
+    const Kokkos::HIP& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values) {
+  sort_by_key_rocthrust(exec, keys, values);  // HIP with rocThrust: always through Thrust
+}
+#endif
+
+#if defined(KOKKOS_ENABLE_ONEDPL)
+template <class KeysDataType, class... KeysProperties, class ValuesDataType,
+          class... ValuesProperties>
+void sort_by_key_device_view_without_comparator(
+    const Kokkos::Experimental::SYCL& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values) {
+#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY
+  if (keys.stride(0) == 1 && values.stride(0) == 1)  // sort_by_key_onedpl requires unit stride
+    sort_by_key_onedpl(exec, keys, values);
+  else
+#endif
+    sort_by_key_via_sort(exec, keys, values);  // unconditional when the macro above is not defined
+}
+#endif
+
+// fallback case: every other execution space uses sort_by_key_via_sort
+template <class ExecutionSpace, class KeysDataType, class... KeysProperties,
+          class ValuesDataType, class... ValuesProperties>
+std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value>
+sort_by_key_device_view_without_comparator(
+    const ExecutionSpace& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values) {
+  sort_by_key_via_sort(exec, keys, values);
+}
+
+// ---------------------------------------------------
+//
+// overloads for sorting by key with comparator
+//
+// ---------------------------------------------------
+
+#if defined(KOKKOS_ENABLE_CUDA)
+template <class ComparatorType, class KeysDataType, class... KeysProperties,
+          class ValuesDataType, class... ValuesProperties>
+void sort_by_key_device_view_with_comparator(
+    const Kokkos::Cuda& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values,
+    const ComparatorType& comparator) {
+  sort_by_key_cudathrust(exec, keys, values, comparator);  // Cuda: always through Thrust
+}
+#endif
+
+#if defined(KOKKOS_ENABLE_ROCTHRUST)
+template <class ComparatorType, class KeysDataType, class... KeysProperties,
+          class ValuesDataType, class... ValuesProperties>
+void sort_by_key_device_view_with_comparator(
+    const Kokkos::HIP& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values,
+    const ComparatorType& comparator) {
+  sort_by_key_rocthrust(exec, keys, values, comparator);  // HIP with rocThrust: always through Thrust
+}
+#endif
+
+#if defined(KOKKOS_ENABLE_ONEDPL)
+template <class ComparatorType, class KeysDataType, class... KeysProperties,
+          class ValuesDataType, class... ValuesProperties>
+void sort_by_key_device_view_with_comparator(
+    const Kokkos::Experimental::SYCL& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values,
+    const ComparatorType& comparator) {
+#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY
+  if (keys.stride(0) == 1 && values.stride(0) == 1)  // sort_by_key_onedpl requires unit stride
+    sort_by_key_onedpl(exec, keys, values, comparator);
+  else
+#endif
+    sort_by_key_via_sort(exec, keys, values, comparator);  // unconditional when the macro above is not defined
+}
+#endif
+
+// fallback case: every other execution space uses sort_by_key_via_sort
+template <class ComparatorType, class ExecutionSpace, class KeysDataType,
+          class... KeysProperties, class ValuesDataType,
+          class... ValuesProperties>
+std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value>
+sort_by_key_device_view_with_comparator(
+    const ExecutionSpace& exec,
+    const Kokkos::View<KeysDataType, KeysProperties...>& keys,
+    const Kokkos::View<ValuesDataType, ValuesProperties...>& values,
+    const ComparatorType& comparator) {
+  sort_by_key_via_sort(exec, keys, values, comparator);
+}
+
+#undef KOKKOS_ONEDPL_HAS_SORT_BY_KEY
+
+}  // namespace Kokkos::Impl
+#endif
--- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp
+++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp
@ -63,6 +63,11 @@

 #endif

+#if defined(KOKKOS_ENABLE_ROCTHRUST)
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#endif
+
 #if defined(KOKKOS_ENABLE_ONEDPL)
 #include <oneapi/dpl/execution>
 #include <oneapi/dpl/algorithm>
@ -184,6 +189,26 @@ void sort_cudathrust(const Cuda& space,
 }
 #endif

+#if defined(KOKKOS_ENABLE_ROCTHRUST)
+// Sorts a rank-1 View with thrust::sort, using an execution policy bound to
+// the HIP stream of space. An optional comparator is forwarded unchanged.
+// Views with fewer than two elements are left untouched.
+template <class DataType, class... Properties, class... MaybeComparator>
+void sort_rocthrust(const HIP& space,
+                    const Kokkos::View<DataType, Properties...>& view,
+                    MaybeComparator&&... maybeComparator) {
+  static_assert(Kokkos::View<DataType, Properties...>::rank == 1,
+                "Kokkos::sort: currently only supports rank-1 Views.");
+
+  // Nothing to reorder for an empty or single-element View.
+  if (view.extent(0) > 1) {
+    namespace KE = ::Kokkos::Experimental;
+
+    const auto stream_policy = thrust::hip::par.on(space.hip_stream());
+    auto range_begin         = KE::begin(view);
+    auto range_end           = KE::end(view);
+    thrust::sort(stream_policy, range_begin, range_end,
+                 std::forward<MaybeComparator>(maybeComparator)...);
+  }
+}
+#endif
+
 #if defined(KOKKOS_ENABLE_ONEDPL)
 template <class DataType, class... Properties, class... MaybeComparator>
 void sort_onedpl(const Kokkos::Experimental::SYCL& space,
@ -274,6 +299,14 @@ void sort_device_view_without_comparator(
 }
 #endif

+#if defined(KOKKOS_ENABLE_ROCTHRUST)
+template <class DataType, class... Properties>
+void sort_device_view_without_comparator(
+    const HIP& exec, const Kokkos::View<DataType, Properties...>& view) {
+  sort_rocthrust(exec, view);  // HIP overload: forwards to the rocThrust-based sort
+}
+#endif
+
 #if defined(KOKKOS_ENABLE_ONEDPL)
 template <class DataType, class... Properties>
 void sort_device_view_without_comparator(
@ -320,6 +353,15 @@ void sort_device_view_with_comparator(
 }
 #endif

+#if defined(KOKKOS_ENABLE_ROCTHRUST)
+template <class ComparatorType, class DataType, class... Properties>
+void sort_device_view_with_comparator(
+    const HIP& exec, const Kokkos::View<DataType, Properties...>& view,
+    const ComparatorType& comparator) {
+  sort_rocthrust(exec, view, comparator);  // HIP overload: forwards to the rocThrust-based sort, passing the comparator on
+}
+#endif
+
 #if defined(KOKKOS_ENABLE_ONEDPL)
 template <class ComparatorType, class DataType, class... Properties>
 void sort_device_view_with_comparator(
--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp
@ -50,7 +50,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy(const ExecutionSpace& ex,
          const ::Kokkos::View<DataType1, Properties1...>& source,
-          ::Kokkos::View<DataType2, Properties2...>& dest) {
+          const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -66,7 +66,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy(const std::string& label, const ExecutionSpace& ex,
          const ::Kokkos::View<DataType1, Properties1...>& source,
-          ::Kokkos::View<DataType2, Properties2...>& dest) {
+          const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -93,7 +93,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION auto copy(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& source,
-    ::Kokkos::View<DataType2, Properties2...>& dest) {
+    const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp
@ -50,7 +50,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_backward(const ExecutionSpace& ex,
                   const ::Kokkos::View<DataType1, Properties1...>& source,
-                   ::Kokkos::View<DataType2, Properties2...>& dest) {
+                   const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -65,7 +65,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_backward(const std::string& label, const ExecutionSpace& ex,
                   const ::Kokkos::View<DataType1, Properties1...>& source,
-                   ::Kokkos::View<DataType2, Properties2...>& dest) {
+                   const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -92,7 +92,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION auto copy_backward(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& source,
-    ::Kokkos::View<DataType2, Properties2...>& dest) {
+    const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp
@ -54,7 +54,8 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_if(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType1, Properties1...>& source,
-             ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
+             const ::Kokkos::View<DataType2, Properties2...>& dest,
+             Predicate pred) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -69,7 +70,8 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_if(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType1, Properties1...>& source,
-             ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
+             const ::Kokkos::View<DataType2, Properties2...>& dest,
+             Predicate pred) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -96,7 +98,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION auto copy_if(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& source,
-    ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
+    const ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp
@ -51,7 +51,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_n(const ExecutionSpace& ex,
            const ::Kokkos::View<DataType1, Properties1...>& source, Size count,
-            ::Kokkos::View<DataType2, Properties2...>& dest) {
+            const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -66,7 +66,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_n(const std::string& label, const ExecutionSpace& ex,
            const ::Kokkos::View<DataType1, Properties1...>& source, Size count,
-            ::Kokkos::View<DataType2, Properties2...>& dest) {
+            const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -93,7 +93,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION auto copy_n(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& source, Size count,
-    ::Kokkos::View<DataType2, Properties2...>& dest) {
+    const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp
@ -80,7 +80,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool equal(const ExecutionSpace& ex,
           const ::Kokkos::View<DataType1, Properties1...>& view1,
-           ::Kokkos::View<DataType2, Properties2...>& view2) {
+           const ::Kokkos::View<DataType2, Properties2...>& view2) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);

@ -96,7 +96,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool equal(const std::string& label, const ExecutionSpace& ex,
           const ::Kokkos::View<DataType1, Properties1...>& view1,
-           ::Kokkos::View<DataType2, Properties2...>& view2) {
+           const ::Kokkos::View<DataType2, Properties2...>& view2) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);

@ -111,7 +111,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool equal(const ExecutionSpace& ex,
           const ::Kokkos::View<DataType1, Properties1...>& view1,
-           ::Kokkos::View<DataType2, Properties2...>& view2,
+           const ::Kokkos::View<DataType2, Properties2...>& view2,
           BinaryPredicateType predicate) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
@ -128,7 +128,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool equal(const std::string& label, const ExecutionSpace& ex,
           const ::Kokkos::View<DataType1, Properties1...>& view1,
-           ::Kokkos::View<DataType2, Properties2...>& view2,
+           const ::Kokkos::View<DataType2, Properties2...>& view2,
           BinaryPredicateType predicate) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
@ -227,7 +227,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION bool equal(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2) {
+    const ::Kokkos::View<DataType2, Properties2...>& view2) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);

@ -243,7 +243,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION bool equal(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2,
+    const ::Kokkos::View<DataType2, Properties2...>& view2,
    BinaryPredicateType predicate) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp
@ -19,7 +19,6 @@

 #include <Kokkos_Core.hpp>
 #include "impl/Kokkos_Constraints.hpp"
-#include "Kokkos_Swap.hpp"

 namespace Kokkos {
 namespace Experimental {
@ -33,7 +32,7 @@ struct StdIterSwapFunctor {
  KOKKOS_FUNCTION
  void operator()(int i) const {
    (void)i;
-    ::Kokkos::Experimental::swap(*m_a, *m_b);
+    ::Kokkos::kokkos_swap(*m_a, *m_b);
  }

  KOKKOS_FUNCTION
@ -58,6 +57,16 @@ void iter_swap(IteratorType1 a, IteratorType2 b) {
  Impl::iter_swap_impl(a, b);
 }

+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+// Deprecated entry point kept for backward compatibility (compiled only when
+// KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined); forwards to
+// ::Kokkos::kokkos_swap.
+// The exception specification mirrors the one of kokkos_swap for the same
+// argument types. The inner noexcept operator is required: the operand of a
+// noexcept-specifier must be a constant expression convertible to bool, which
+// a call on std::declval operands is not.
+template <class T>
+KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::kokkos_swap instead!")
+KOKKOS_FUNCTION
+    void swap(T& a, T& b) noexcept(
+        noexcept(::Kokkos::kokkos_swap(std::declval<T&>(),
+                                       std::declval<T&>()))) {
+  ::Kokkos::kokkos_swap(a, b);
+}
+#endif
+
 }  // namespace Experimental
 }  // namespace Kokkos

--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp
@ -54,7 +54,7 @@ template <
 bool lexicographical_compare(
    const ExecutionSpace& ex,
    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2) {
+    const ::Kokkos::View<DataType2, Properties2...>& view2) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);

@ -71,7 +71,7 @@ template <
 bool lexicographical_compare(
    const std::string& label, const ExecutionSpace& ex,
    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2) {
+    const ::Kokkos::View<DataType2, Properties2...>& view2) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);

@ -112,7 +112,8 @@ template <
 bool lexicographical_compare(
    const ExecutionSpace& ex,
    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) {
+    const ::Kokkos::View<DataType2, Properties2...>& view2,
+    ComparatorType comp) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);

@ -129,7 +130,8 @@ template <
 bool lexicographical_compare(
    const std::string& label, const ExecutionSpace& ex,
    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) {
+    const ::Kokkos::View<DataType2, Properties2...>& view2,
+    ComparatorType comp) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);

@ -161,7 +163,7 @@ template <class TeamHandleType, class DataType1, class... Properties1,
 KOKKOS_FUNCTION bool lexicographical_compare(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2) {
+    const ::Kokkos::View<DataType2, Properties2...>& view2) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);

@ -187,7 +189,8 @@ template <class TeamHandleType, class DataType1, class... Properties1,
 KOKKOS_FUNCTION bool lexicographical_compare(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) {
+    const ::Kokkos::View<DataType2, Properties2...>& view2,
+    ComparatorType comp) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);

--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp
@ -50,7 +50,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto move(const ExecutionSpace& ex,
          const ::Kokkos::View<DataType1, Properties1...>& source,
-          ::Kokkos::View<DataType2, Properties2...>& dest) {
+          const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -64,7 +64,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto move(const std::string& label, const ExecutionSpace& ex,
          const ::Kokkos::View<DataType1, Properties1...>& source,
-          ::Kokkos::View<DataType2, Properties2...>& dest) {
+          const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -92,7 +92,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION auto move(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& source,
-    ::Kokkos::View<DataType2, Properties2...>& dest) {
+    const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp
@ -41,7 +41,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto move_backward(const ExecutionSpace& ex,
                   const ::Kokkos::View<DataType1, Properties1...>& source,
-                   ::Kokkos::View<DataType2, Properties2...>& dest) {
+                   const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -65,7 +65,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto move_backward(const std::string& label, const ExecutionSpace& ex,
                   const ::Kokkos::View<DataType1, Properties1...>& source,
-                   ::Kokkos::View<DataType2, Properties2...>& dest) {
+                   const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -94,7 +94,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION auto move_backward(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& source,
-    ::Kokkos::View<DataType2, Properties2...>& dest) {
+    const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp
@ -50,7 +50,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto reverse_copy(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& source,
-                  ::Kokkos::View<DataType2, Properties2...>& dest) {
+                  const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -65,7 +65,7 @@ template <
    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto reverse_copy(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& source,
-                  ::Kokkos::View<DataType2, Properties2...>& dest) {
+                  const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -94,7 +94,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION auto reverse_copy(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& source,
-    ::Kokkos::View<DataType2, Properties2...>& dest) {
+    const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp
@ -40,7 +40,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1,
          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto swap_ranges(const ExecutionSpace& ex,
                 const ::Kokkos::View<DataType1, Properties1...>& source,
-                 ::Kokkos::View<DataType2, Properties2...>& dest) {
+                 const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -64,7 +64,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1,
          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto swap_ranges(const std::string& label, const ExecutionSpace& ex,
                 const ::Kokkos::View<DataType1, Properties1...>& source,
-                 ::Kokkos::View<DataType2, Properties2...>& dest) {
+                 const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -94,7 +94,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION auto swap_ranges(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& source,
-    ::Kokkos::View<DataType2, Properties2...>& dest) {
+    const ::Kokkos::View<DataType2, Properties2...>& dest) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

--- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp
@ -58,7 +58,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1,
          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform(const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& source,
-               ::Kokkos::View<DataType2, Properties2...>& dest,
+               const ::Kokkos::View<DataType2, Properties2...>& dest,
               UnaryOperation unary_op) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
@ -73,7 +73,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1,
          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform(const std::string& label, const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& source,
-               ::Kokkos::View<DataType2, Properties2...>& dest,
+               const ::Kokkos::View<DataType2, Properties2...>& dest,
               UnaryOperation unary_op) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
@ -119,7 +119,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1,
 auto transform(const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& source1,
               const ::Kokkos::View<DataType2, Properties2...>& source2,
-               ::Kokkos::View<DataType3, Properties3...>& dest,
+               const ::Kokkos::View<DataType3, Properties3...>& dest,
               BinaryOperation binary_op) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2);
@ -137,7 +137,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1,
 auto transform(const std::string& label, const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& source1,
               const ::Kokkos::View<DataType2, Properties2...>& source2,
-               ::Kokkos::View<DataType3, Properties3...>& dest,
+               const ::Kokkos::View<DataType3, Properties3...>& dest,
               BinaryOperation binary_op) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2);
@ -174,7 +174,8 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1,
 KOKKOS_FUNCTION auto transform(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& source,
-    ::Kokkos::View<DataType2, Properties2...>& dest, UnaryOperation unary_op) {
+    const ::Kokkos::View<DataType2, Properties2...>& dest,
+    UnaryOperation unary_op) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);

@ -207,7 +208,7 @@ KOKKOS_FUNCTION auto transform(
    const TeamHandleType& teamHandle,
    const ::Kokkos::View<DataType1, Properties1...>& source1,
    const ::Kokkos::View<DataType2, Properties2...>& source2,
-    ::Kokkos::View<DataType3, Properties3...>& dest,
+    const ::Kokkos::View<DataType3, Properties3...>& dest,
    BinaryOperation binary_op) {
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1);
  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2);
--- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp
@ -47,8 +47,9 @@ struct ExclusiveScanDefaultFunctorForKnownNeutralElement {
  KOKKOS_FUNCTION
  void operator()(const IndexType i, ValueType& update,
                  const bool final_pass) const {
+    const auto tmp = m_first_from[i];
    if (final_pass) m_first_dest[i] = update + m_init_value;
-    update += m_first_from[i];
+    update += tmp;
  }
 };

@ -73,6 +74,7 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper {
  KOKKOS_FUNCTION
  void operator()(const IndexType i, value_type& update,
                  const bool final_pass) const {
+    const auto tmp = value_type{m_first_from[i], false};
    if (final_pass) {
      if (i == 0) {
        m_first_dest[i] = m_init_value;
@ -81,7 +83,6 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper {
      }
    }

-    const auto tmp = value_type{m_first_from[i], false};
    this->join(update, tmp);
  }

@ -132,6 +133,7 @@ struct TransformExclusiveScanFunctorWithValueWrapper {
  KOKKOS_FUNCTION
  void operator()(const IndexType i, value_type& update,
                  const bool final_pass) const {
+    const auto tmp = value_type{m_unary_op(m_first_from[i]), false};
    if (final_pass) {
      if (i == 0) {
        // for both ExclusiveScan and TransformExclusiveScan,
@ -142,7 +144,6 @@ struct TransformExclusiveScanFunctorWithValueWrapper {
      }
    }

-    const auto tmp = value_type{m_unary_op(m_first_from[i]), false};
    this->join(update, tmp);
  }

@ -190,6 +191,7 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper {
  KOKKOS_FUNCTION
  void operator()(const IndexType i, ValueType& update,
                  const bool final_pass) const {
+    const auto tmp = ValueType{m_unary_op(m_first_from[i])};
    if (final_pass) {
      if (i == 0) {
        // for both ExclusiveScan and TransformExclusiveScan,
@ -200,7 +202,6 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper {
      }
    }

-    const auto tmp = ValueType{m_unary_op(m_first_from[i])};
    this->join(update, tmp);
  }

--- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp
@ -46,15 +46,14 @@ struct StdRemoveIfStage1Functor {
  void operator()(const IndexType i, IndexType& update,
                  const bool final_pass) const {
    auto& myval = m_first_from[i];
-    if (final_pass) {
-      if (!m_must_remove(myval)) {
+
+    if (!m_must_remove(myval)) {
+      if (final_pass) {
        // calling move here is ok because we are inside final pass
        // we are calling move assign as specified by the std
        m_first_dest[update] = std::move(myval);
      }
-    }

-    if (!m_must_remove(myval)) {
      update += 1;
    }
  }
@ -108,7 +107,9 @@ IteratorType remove_if_exespace_impl(const std::string& label,
    // create helper tmp view
    using value_type    = typename IteratorType::value_type;
    using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
-    tmp_view_type tmp_view("std_remove_if_tmp_view", keep_count);
+    tmp_view_type tmp_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, ex,
+                                              "std_remove_if_tmp_view"),
+                           keep_count);
    using tmp_readwrite_iterator_type = decltype(begin(tmp_view));

    // in stage 1, *move* all elements to keep from original range to tmp
--- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp
@ -21,7 +21,6 @@
 #include "Kokkos_Constraints.hpp"
 #include "Kokkos_HelperPredicates.hpp"
 #include <std_algorithms/Kokkos_Distance.hpp>
-#include <std_algorithms/Kokkos_Swap.hpp>
 #include <string>

 namespace Kokkos {
@ -39,7 +38,7 @@ struct StdReverseFunctor {

  KOKKOS_FUNCTION
  void operator()(index_type i) const {
-    ::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]);
+    ::Kokkos::kokkos_swap(m_first[i], m_last[-i - 1]);
  }

  KOKKOS_FUNCTION
--- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp
@ -126,10 +126,11 @@ KOKKOS_FUNCTION IteratorType shift_left_team_impl(
  // execution space impl because for this team impl we are
  // within a parallel region, so for now we solve serially

-  const std::size_t numElementsToMove =
+  using difference_type = typename IteratorType::difference_type;
+  const difference_type numElementsToMove =
      ::Kokkos::Experimental::distance(first + n, last);
  Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() {
-    for (std::size_t i = 0; i < numElementsToMove; ++i) {
+    for (difference_type i = 0; i < numElementsToMove; ++i) {
      first[i] = std::move(first[i + n]);
    }
  });
--- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp
@ -103,26 +103,6 @@ IteratorType shift_right_exespace_impl(
  return first + n;
 }

-template <class Iterator>
-struct StdShiftRightTeamSingleFunctor {
-  Iterator m_first;
-  Iterator m_last;
-  std::size_t m_shift;
-
-  KOKKOS_FUNCTION
-  void operator()() const {
-    // the impl function calling this functor guarantees that
-    // - m_shift is non-negative
-    // - m_first, m_last identify a valid range with m_last > m_first
-    // - m_shift is less than m_last - m_first
-    // so I can safely use std::size_t here
-  }
-
-  KOKKOS_FUNCTION
-  StdShiftRightTeamSingleFunctor(Iterator _first, Iterator _last, std::size_t n)
-      : m_first(std::move(_first)), m_last(std::move(_last)), m_shift(n) {}
-};
-
 template <class TeamHandleType, class IteratorType>
 KOKKOS_FUNCTION IteratorType shift_right_team_impl(
    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
@ -145,10 +125,11 @@ KOKKOS_FUNCTION IteratorType shift_right_team_impl(
  // execution space impl because for this team impl we are
  // within a parallel region, so for now we solve serially

-  const std::size_t numElementsToMove =
+  using difference_type = typename IteratorType::difference_type;
+  const difference_type numElementsToMove =
      ::Kokkos::Experimental::distance(first, last - n);
  Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() {
-    for (std::size_t i = 0; i < numElementsToMove; ++i) {
+    for (difference_type i = 0; i < numElementsToMove; ++i) {
      last[-i - 1] = std::move(last[-n - i - 1]);
    }
  });
--- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp
@ -21,7 +21,6 @@
 #include "Kokkos_Constraints.hpp"
 #include "Kokkos_HelperPredicates.hpp"
 #include <std_algorithms/Kokkos_Distance.hpp>
-#include <std_algorithms/Kokkos_Swap.hpp>
 #include <string>

 namespace Kokkos {
@ -36,7 +35,7 @@ struct StdSwapRangesFunctor {

  KOKKOS_FUNCTION
  void operator()(index_type i) const {
-    ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]);
+    ::Kokkos::kokkos_swap(m_first1[i], m_first2[i]);
  }

  KOKKOS_FUNCTION
--- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp
+++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp
@ -105,7 +105,9 @@ IteratorType unique_exespace_impl(const std::string& label,
      // using the same algorithm used for unique_copy but we now move things
      using value_type    = typename IteratorType::value_type;
      using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
-      tmp_view_type tmp_view("std_unique_tmp_view", num_elements_to_explore);
+      tmp_view_type tmp_view(Kokkos::view_alloc(ex, Kokkos::WithoutInitializing,
+                                                "std_unique_tmp_view"),
+                             num_elements_to_explore);

      // scan extent is: num_elements_to_explore - 1
      // for same reason as the one explained in unique_copy
--- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt
+++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt
@ -25,6 +25,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
    set(ALGO_SORT_SOURCES)
    foreach(SOURCE_Input
 	TestSort
+	TestSortByKey
 	TestSortCustomComp
 	TestBinSortA
 	TestBinSortB
@ -57,35 +58,37 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
      configure_file(${dir}/dummy.cpp ${file})
      list(APPEND ALGO_RANDOM_SOURCES ${file})
    endforeach()
+  endif()
+endforeach()

-    # ------------------------------------------
-    # std set A
-    # ------------------------------------------
-    set(STDALGO_SOURCES_A)
-    foreach(Name
+# ------------------------------------------
+# std set A
+# ------------------------------------------
+set(STDALGO_SOURCES_A)
+foreach(Name
 	StdReducers
 	StdAlgorithmsConstraints
 	RandomAccessIterator
-	)
-      list(APPEND STDALGO_SOURCES_A Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_SOURCES_A Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std set B
-    # ------------------------------------------
-    set(STDALGO_SOURCES_B)
-    foreach(Name
+# ------------------------------------------
+# std set B
+# ------------------------------------------
+set(STDALGO_SOURCES_B)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsMinMaxElementOps
-	)
-      list(APPEND STDALGO_SOURCES_B Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_SOURCES_B Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std set C
-    # ------------------------------------------
-    set(STDALGO_SOURCES_C)
-    foreach(Name
+# ------------------------------------------
+# std set C
+# ------------------------------------------
+set(STDALGO_SOURCES_C)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsLexicographicalCompare
 	StdAlgorithmsForEach
@ -100,15 +103,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
 	StdAlgorithmsSearch_n
 	StdAlgorithmsMismatch
 	StdAlgorithmsMoveBackward
-	)
-      list(APPEND STDALGO_SOURCES_C Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_SOURCES_C Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std set D
-    # ------------------------------------------
-    set(STDALGO_SOURCES_D)
-    foreach(Name
+# ------------------------------------------
+# std set D
+# ------------------------------------------
+set(STDALGO_SOURCES_D)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsModOps
 	StdAlgorithmsModSeqOps
@ -128,15 +131,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
 	StdAlgorithmsReverse
 	StdAlgorithmsShiftLeft
 	StdAlgorithmsShiftRight
-	)
-      list(APPEND STDALGO_SOURCES_D Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_SOURCES_D Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std set E
-    # ------------------------------------------
-    set(STDALGO_SOURCES_E)
-    foreach(Name
+# ------------------------------------------
+# std set E
+# ------------------------------------------
+set(STDALGO_SOURCES_E)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsIsSorted
 	StdAlgorithmsIsSortedUntil
@ -149,83 +152,83 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
 	StdAlgorithmsTransformUnaryOp
 	StdAlgorithmsTransformExclusiveScan
 	StdAlgorithmsTransformInclusiveScan
-	)
-      list(APPEND STDALGO_SOURCES_E Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_SOURCES_E Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team Q
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_Q)
-    foreach(Name
+# ------------------------------------------
+# std team Q
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_Q)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamInclusiveScan
 	StdAlgorithmsTeamTransformInclusiveScan
-      )
-      list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team P
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_P)
-    foreach(Name
+# ------------------------------------------
+# std team P
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_P)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamExclusiveScan
 	StdAlgorithmsTeamTransformExclusiveScan
-      )
-      list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team M
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_M)
-    foreach(Name
+# ------------------------------------------
+# std team M
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_M)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamTransformUnaryOp
 	StdAlgorithmsTeamTransformBinaryOp
 	StdAlgorithmsTeamGenerate
 	StdAlgorithmsTeamGenerate_n
 	StdAlgorithmsTeamSwapRanges
-      )
-      list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team L
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_L)
-    foreach(Name
+# ------------------------------------------
+# std team L
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_L)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamIsSorted
 	StdAlgorithmsTeamIsSortedUntil
 	StdAlgorithmsTeamIsPartitioned
 	StdAlgorithmsTeamPartitionCopy
 	StdAlgorithmsTeamPartitionPoint
-	)
-      list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team I
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_I)
-    foreach(Name
+# ------------------------------------------
+# std team I
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_I)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamUnique
 	StdAlgorithmsTeamAdjacentDifference
 	StdAlgorithmsTeamReduce
 	StdAlgorithmsTeamTransformReduce
-	)
-      list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team H
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_H)
-    foreach(Name
+# ------------------------------------------
+# std team H
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_H)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamCopy
 	StdAlgorithmsTeamCopy_n
@ -236,43 +239,43 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
 	StdAlgorithmsTeamRemoveIf
 	StdAlgorithmsTeamRemoveCopy
 	StdAlgorithmsTeamRemoveCopyIf
-	)
-      list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team G
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_G)
-    foreach(Name
+# ------------------------------------------
+# std team G
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_G)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamMove
 	StdAlgorithmsTeamMoveBackward
 	StdAlgorithmsTeamShiftLeft
 	StdAlgorithmsTeamShiftRight
-	)
-      list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team F
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_F)
-    foreach(Name
+# ------------------------------------------
+# std team F
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_F)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamReverse
 	StdAlgorithmsTeamReverseCopy
 	StdAlgorithmsTeamRotate
 	StdAlgorithmsTeamRotateCopy
-      )
-      list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team E
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_E)
-    foreach(Name
+# ------------------------------------------
+# std team E
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_E)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamFill
 	StdAlgorithmsTeamFill_n
@ -280,28 +283,28 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
 	StdAlgorithmsTeamReplaceIf
 	StdAlgorithmsTeamReplaceCopy
 	StdAlgorithmsTeamReplaceCopyIf
-	)
-      list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team D
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_D)
-    foreach(Name
+# ------------------------------------------
+# std team D
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_D)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamMinElement
 	StdAlgorithmsTeamMaxElement
 	StdAlgorithmsTeamMinMaxElement
-	)
-      list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team C
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_C)
-    foreach(Name
+# ------------------------------------------
+# std team C
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_C)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamFind
 	StdAlgorithmsTeamFindIf
@ -310,29 +313,29 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
 	StdAlgorithmsTeamAnyOf
 	StdAlgorithmsTeamNoneOf
 	StdAlgorithmsTeamSearchN
-	)
-      list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team B
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_B)
-    foreach(Name
+# ------------------------------------------
+# std team B
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_B)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamEqual
 	StdAlgorithmsTeamSearch
 	StdAlgorithmsTeamFindEnd
 	StdAlgorithmsTeamFindFirstOf
-      )
-      list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp)
-    endforeach()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp)
+endforeach()

-    # ------------------------------------------
-    # std team A
-    # ------------------------------------------
-    set(STDALGO_TEAM_SOURCES_A)
-    foreach(Name
+# ------------------------------------------
+# std team A
+# ------------------------------------------
+set(STDALGO_TEAM_SOURCES_A)
+foreach(Name
 	StdAlgorithmsCommon
 	StdAlgorithmsTeamAdjacentFind
 	StdAlgorithmsTeamCount
@ -341,11 +344,8 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
 	StdAlgorithmsTeamForEachN
 	StdAlgorithmsTeamLexicographicalCompare
 	StdAlgorithmsTeamMismatch
-      )
-      list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp)
-    endforeach()
-
-  endif()
+  )
+  list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp)
 endforeach()

 # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time.
--- a/lib/kokkos/algorithms/unit_tests/Makefile
+++ b/lib/kokkos/algorithms/unit_tests/Makefile
@ -27,13 +27,13 @@ TARGETS =

 tmp := $(foreach device, $(KOKKOS_DEVICELIST), \
  $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\
-     $(shell echo "\#include <Test"${device}"_Category.hpp>" > Test$(device).cpp); \
-     $(shell echo "\#include <TestRandom.hpp>" >> Test$(device).cpp); \
-     $(shell echo "\#include <TestSort.hpp>" >> Test$(device).cpp); \
-     $(shell echo "\#include <TestBinSortA.hpp>" >> Test$(device).cpp); \
-     $(shell echo "\#include <TestBinSortB.hpp>" >> Test$(device).cpp); \
-     $(shell echo "\#include <TestNestedSort.hpp>" >> Test$(device).cpp); \
-     $(shell echo "\#include <TestSortCustomComp.hpp>" >> Test$(device).cpp); \
+     $(shell echo "$(H)include <Test"${device}"_Category.hpp>" > Test$(device).cpp); \
+     $(shell echo "$(H)include <TestRandom.hpp>" >> Test$(device).cpp); \
+     $(shell echo "$(H)include <TestSort.hpp>" >> Test$(device).cpp); \
+     $(shell echo "$(H)include <TestBinSortA.hpp>" >> Test$(device).cpp); \
+     $(shell echo "$(H)include <TestBinSortB.hpp>" >> Test$(device).cpp); \
+     $(shell echo "$(H)include <TestNestedSort.hpp>" >> Test$(device).cpp); \
+     $(shell echo "$(H)include <TestSortCustomComp.hpp>" >> Test$(device).cpp); \
   ) \
 )

--- a/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp
+++ b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp
@ -0,0 +1,241 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP
+#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <Kokkos_Sort.hpp>
+
+#include <utility>  // pair
+
+namespace Test {
+namespace SortImpl {
+
+// Comparator functor returning true when lhs is strictly smaller than rhs.
+// It is the default Comparator of is_sorted_by_key_struct below.
+struct Less {
+  template <class ValueType>
+  KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs,
+                                         const ValueType &rhs) const {
+    return lhs < rhs;
+  }
+};
+
+// Comparator functor returning true when lhs is strictly larger than rhs.
+// The tests below pass it to sort_by_key as a custom comparator.
+struct Greater {
+  template <class ValueType>
+  KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs,
+                                         const ValueType &rhs) const {
+    return lhs > rhs;
+  }
+};
+
+// Reduction functor that counts the ways in which a sort-by-key result is
+// wrong.
+//   keys      : the keys after sorting
+//   keys_orig : a copy of the keys taken before sorting
+//   permute   : the values that were permuted alongside the keys; they are
+//               used as indices into keys_orig
+//   comparator: ordering that the sorted keys are checked against
+// NOTE: the ExecutionSpace template parameter is not used inside the struct.
+template <class ExecutionSpace, class Keys, class Permute,
+          class Comparator = Less>
+struct is_sorted_by_key_struct {
+  Keys keys;
+  Keys keys_orig;
+  Permute permute;
+  Comparator comparator;
+
+  is_sorted_by_key_struct(Keys keys_, Keys keys_orig_, Permute permute_,
+                          Comparator comparator_ = Comparator{})
+      : keys(keys_),
+        keys_orig(keys_orig_),
+        permute(permute_),
+        comparator(comparator_) {}
+  // For index i, count is incremented once for each failed check:
+  //  - ordering: element i+1 must not compare "before" element i (skipped for
+  //    the last element, which has no successor);
+  //  - permutation: keys(i) must equal the original key at index permute(i).
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i, unsigned int &count) const {
+    if (i < keys.extent_int(0) - 1 && comparator(keys(i + 1), keys(i))) ++count;
+    if (keys(i) != keys_orig(permute(i))) ++count;
+  }
+};
+
+// Fills the rank-1 view v with consecutive values: v(i) = value + i for every
+// i in [0, v.extent(0)). The kernel is launched on the given execution space
+// instance; value defaults to 0.
+template <typename ExecutionSpace, typename ViewType>
+void iota(ExecutionSpace const &space, ViewType const &v,
+          typename ViewType::value_type value = 0) {
+  using ValueType = typename ViewType::value_type;
+  Kokkos::parallel_for(
+      "ArborX::Algorithms::iota",
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, v.extent(0)),
+      KOKKOS_LAMBDA(int i) { v(i) = value + (ValueType)i; });
+}
+
+}  // namespace SortImpl
+
+// sort_by_key on zero-length key and value views must not throw.
+TEST(TEST_CATEGORY, SortByKeyEmptyView) {
+  using ExecutionSpace = TEST_EXECSPACE;
+
+  // does not matter if we use int or something else
+  Kokkos::View<int *, ExecutionSpace> keys("keys", 0);
+  Kokkos::View<float *, ExecutionSpace> values("values", 0);
+
+  ASSERT_NO_THROW(
+      Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values));
+}
+
+// Runs sort_by_key without a comparator on several small key sets, using an
+// iota-initialized index view as the values. The result is checked with
+// is_sorted_by_key_struct and its default Less comparator, and no violation
+// is expected.
+TEST(TEST_CATEGORY, SortByKey) {
+  using ExecutionSpace = TEST_EXECSPACE;
+  using MemorySpace    = typename ExecutionSpace::memory_space;
+
+  ExecutionSpace space{};
+
+  for (auto keys_vector : {std::vector<int>{36, 19, 25, 17, 3, 7, 1, 2, 9},
+                           std::vector<int>{36, 19, 25, 17, 3, 9, 1, 2, 7},
+                           std::vector<int>{100, 19, 36, 17, 3, 25, 1, 2, 7},
+                           std::vector<int>{15, 5, 11, 3, 4, 8}}) {
+    auto const n = keys_vector.size();
+
+    // Wrap the host vector in an unmanaged view and copy it into MemorySpace.
+    auto keys = Kokkos::create_mirror_view_and_copy(
+        MemorySpace{},
+        Kokkos::View<int *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>(
+            keys_vector.data(), n));
+
+    // Keep a copy of the unsorted keys for the permutation check below.
+    auto keys_orig = Kokkos::create_mirror(space, keys);
+    Kokkos::deep_copy(space, keys_orig, keys);
+
+    // permute starts as 0, 1, ..., n-1 and is reordered alongside the keys.
+    Kokkos::View<int *, ExecutionSpace> permute("permute", n);
+    SortImpl::iota(space, permute);
+
+    Kokkos::Experimental::sort_by_key(space, keys, permute);
+
+    // Count ordering and permutation violations over all n entries.
+    unsigned int sort_fails = 0;
+    Kokkos::parallel_reduce(
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, n),
+        SortImpl::is_sorted_by_key_struct<ExecutionSpace, decltype(keys),
+                                          decltype(permute)>(keys, keys_orig,
+                                                             permute),
+        sort_fails);
+
+    ASSERT_EQ(sort_fails, 0u);
+  }
+}
+
+// Same scenario as the SortByKey test, but sort_by_key receives a
+// SortImpl::Greater comparator and the result is checked against that same
+// comparator.
+TEST(TEST_CATEGORY, SortByKeyWithComparator) {
+  using ExecutionSpace = TEST_EXECSPACE;
+  using MemorySpace    = typename ExecutionSpace::memory_space;
+
+  ExecutionSpace space{};
+
+  SortImpl::Greater comparator;
+
+  for (auto keys_vector : {std::vector<int>{36, 19, 25, 17, 3, 7, 1, 2, 9},
+                           std::vector<int>{36, 19, 25, 17, 3, 9, 1, 2, 7},
+                           std::vector<int>{100, 19, 36, 17, 3, 25, 1, 2, 7},
+                           std::vector<int>{15, 5, 11, 3, 4, 8}}) {
+    auto const n = keys_vector.size();
+
+    // Wrap the host vector in an unmanaged view and copy it into MemorySpace.
+    auto keys = Kokkos::create_mirror_view_and_copy(
+        MemorySpace{},
+        Kokkos::View<int *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>(
+            keys_vector.data(), n));
+
+    // Keep a copy of the unsorted keys for the permutation check below.
+    auto keys_orig = Kokkos::create_mirror(space, keys);
+    Kokkos::deep_copy(space, keys_orig, keys);
+
+    // permute starts as 0, 1, ..., n-1 and is reordered alongside the keys.
+    Kokkos::View<int *, ExecutionSpace> permute("permute", n);
+    SortImpl::iota(space, permute);
+
+    Kokkos::Experimental::sort_by_key(space, keys, permute, comparator);
+
+    // Count ordering (w.r.t. Greater) and permutation violations.
+    unsigned int sort_fails = 0;
+    Kokkos::parallel_reduce(
+        Kokkos::RangePolicy<ExecutionSpace>(space, 0, n),
+        SortImpl::is_sorted_by_key_struct<ExecutionSpace, decltype(keys),
+                                          decltype(permute), SortImpl::Greater>(
+            keys, keys_orig, permute, comparator),
+        sort_fails);
+
+    ASSERT_EQ(sort_fails, 0u);
+  }
+}
+
+// sort_by_key must accept keys with a compile-time extent, combined with
+// values that have either a compile-time or a run-time extent of the same
+// length (10). Only the absence of exceptions is checked, not the result.
+TEST(TEST_CATEGORY, SortByKeyStaticExtents) {
+  using ExecutionSpace = TEST_EXECSPACE;
+
+  ExecutionSpace space{};
+
+  Kokkos::View<int[10], ExecutionSpace> keys("keys");
+
+  Kokkos::View<int[10], ExecutionSpace> values_static("values_static");
+  ASSERT_NO_THROW(
+      Kokkos::Experimental::sort_by_key(space, keys, values_static));
+
+  Kokkos::View<int *, ExecutionSpace> values_dynamic("values_dynamic", 10);
+  ASSERT_NO_THROW(
+      Kokkos::Experimental::sort_by_key(space, keys, values_dynamic));
+}
+
+// Fills two rank-3 views over the index range [0, n)^3:
+//   keys(i, j, k)   = n - i  (decreasing along the first dimension)
+//   values(i, j, k) = j      (equal to the second index)
+// It is a free function rather than code inside the test body; presumably
+// this is because of restrictions on device lambdas inside TEST bodies --
+// TODO confirm.
+template <typename ExecutionSpace, typename Keys, typename Values>
+void buildViewsForStrided(ExecutionSpace const &space, int n, Keys &keys,
+                          Values &values) {
+  Kokkos::parallel_for(
+      "create_data",
+      Kokkos::MDRangePolicy<Kokkos::Rank<3>, ExecutionSpace>(space, {0, 0, 0},
+                                                             {n, n, n}),
+      KOKKOS_LAMBDA(int i, int j, int k) {
+        keys(i, j, k)   = n - i;
+        values(i, j, k) = j;
+      });
+}
+
+// Runs sort_by_key on rank-1 subviews taken along different dimensions of
+// rank-3 views, so keys and values are slices of larger allocations rather
+// than standalone rank-1 views.
+TEST(TEST_CATEGORY, SortByKeyWithStrides) {
+  using ExecutionSpace = TEST_EXECSPACE;
+
+  ExecutionSpace space{};
+
+  auto const n = 10;
+
+  Kokkos::View<int ***, ExecutionSpace> keys("keys", n, n, n);
+  Kokkos::View<int ***, ExecutionSpace> values("values", n, n, n);
+  buildViewsForStrided(space, n, keys, values);
+
+  // keys_sub(i) = n - i (slice along the first dimension);
+  // values_sub(j) = j (slice along the second dimension), so values_sub
+  // initially holds 0..n-1 and serves as the permutation.
+  auto keys_sub   = Kokkos::subview(keys, Kokkos::ALL(), 1, 2);
+  auto values_sub = Kokkos::subview(values, 4, Kokkos::ALL(), 6);
+
+  // Keep a copy of the unsorted keys for the permutation check below.
+  auto keys_orig = Kokkos::create_mirror(space, keys_sub);
+  Kokkos::deep_copy(space, keys_orig, keys_sub);
+
+  Kokkos::Experimental::sort_by_key(space, keys_sub, values_sub);
+
+  // Count ordering and permutation violations over all n entries.
+  unsigned int sort_fails = 0;
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<ExecutionSpace>(space, 0, n),
+      SortImpl::is_sorted_by_key_struct<ExecutionSpace, decltype(keys_sub),
+                                        decltype(values_sub)>(
+          keys_sub, keys_orig, values_sub),
+      sort_fails);
+
+  ASSERT_EQ(sort_fails, 0u);
+}
+
+// Calling sort_by_key with keys and values of different lengths (3 vs. 1) is
+// expected to terminate the process with the extent-mismatch message, both
+// with and without a custom comparator.
+TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) {
+  using ExecutionSpace = TEST_EXECSPACE;
+
+  // does not matter if we use int or something else
+  Kokkos::View<int *, ExecutionSpace> keys("keys", 3);
+  Kokkos::View<float *, ExecutionSpace> values("values", 1);
+
+  ASSERT_DEATH(
+      Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values),
+      "values and keys extents must be the same");
+  ASSERT_DEATH(Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values,
+                                                 SortImpl::Greater{}),
+               "values and keys extents must be the same");
+}
+
+}  // namespace Test
+#endif
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp
@ -239,16 +239,8 @@ KOKKOS_FUNCTION bool team_members_have_matching_result(
  // set accum to 1 if a mismach is found
  const bool mismatch = memberValue != target;
  int accum           = static_cast<int>(mismatch);
-  // FIXME_OPENMPTARGET: team API does not meet the TeamHandle concept and
-  // ignores the reducer passed
-#if defined KOKKOS_ENABLE_OPENMPTARGET
-  Kokkos::Sum<int> dummyReducer(accum);
-  const auto result = teamHandle.team_reduce(accum, dummyReducer);
-  return (result == 0);
-#else
  teamHandle.team_reduce(Kokkos::Sum<int>(accum));
  return (accum == 0);
-#endif
 }

 template <class ValueType1, class ValueType2>
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp
@ -16,6 +16,7 @@

 #include <TestStdAlgorithmsCommon.hpp>
 #include <utility>
+#include <iomanip>

 namespace Test {
 namespace stdalgos {
@ -132,47 +133,6 @@ void my_host_exclusive_scan(it1 first, it1 last, it2 dest, ValType init,
  }
 }

-template <class ViewType1, class ViewType2, class ValueType, class BinaryOp>
-void verify_data(ViewType1 data_view,  // contains data
-                 ViewType2 test_view,  // the view to test
-                 ValueType init_value, BinaryOp bop) {
-  //! always careful because views might not be deep copyable
-
-  auto data_view_dc = create_deep_copyable_compatible_clone(data_view);
-  auto data_view_h =
-      create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc);
-
-  using gold_view_value_type = typename ViewType2::value_type;
-  Kokkos::View<gold_view_value_type*, Kokkos::HostSpace> gold_h(
-      "goldh", data_view.extent(0));
-  my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h),
-                         KE::begin(gold_h), init_value, bop);
-
-  auto test_view_dc = create_deep_copyable_compatible_clone(test_view);
-  auto test_view_h =
-      create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc);
-  if (test_view_h.extent(0) > 0) {
-    for (std::size_t i = 0; i < test_view_h.extent(0); ++i) {
-      // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " "
-      //           << gold_h(i) << " " << test_view_h(i) << " "
-      //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
-      if (std::is_same<gold_view_value_type, int>::value) {
-        ASSERT_EQ(gold_h(i), test_view_h(i));
-      } else {
-        const auto error =
-            std::abs(static_cast<double>(gold_h(i) - test_view_h(i)));
-        if (error > 1e-10) {
-          std::cout << i << " " << std::setprecision(15) << data_view_h(i)
-                    << " " << gold_h(i) << " " << test_view_h(i) << " "
-                    << std::abs(static_cast<double>(gold_h(i) - test_view_h(i)))
-                    << std::endl;
-        }
-        EXPECT_LT(error, 1e-10);
-      }
-    }
-  }
-}
-
 template <class ValueType>
 struct MultiplyFunctor {
  KOKKOS_INLINE_FUNCTION
@ -189,107 +149,153 @@ struct SumFunctor {
  }
 };

+// Host-side checker for the result of an exclusive scan. It is a functor with
+// two call operators so that callers can forward an optional binary op.
+struct VerifyData {
+  // Computes a reference ("gold") result on the host by running
+  // my_host_exclusive_scan over a host copy of data_view with init_value and
+  // bop, then compares it element by element with a host copy of test_view.
+  // When the value type of test_view is int the comparison is exact;
+  // otherwise the absolute difference must be below 1e-10.
+  template <class ViewType1, class ViewType2, class ValueType, class BinaryOp>
+  void operator()(ViewType1 data_view,  // contains data
+                  ViewType2 test_view,  // the view to test
+                  ValueType init_value, BinaryOp bop) {
+    //! always careful because views might not be deep copyable
+
+    auto data_view_dc = create_deep_copyable_compatible_clone(data_view);
+    auto data_view_h =
+        create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc);
+
+    using gold_view_value_type = typename ViewType2::value_type;
+    Kokkos::View<gold_view_value_type*, Kokkos::HostSpace> gold_h(
+        "goldh", data_view.extent(0));
+    my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h),
+                           KE::begin(gold_h), init_value, bop);
+
+    auto test_view_dc = create_deep_copyable_compatible_clone(test_view);
+    auto test_view_h =
+        create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc);
+    if (test_view_h.extent(0) > 0) {
+      for (std::size_t i = 0; i < test_view_h.extent(0); ++i) {
+        if (std::is_same<gold_view_value_type, int>::value) {
+          ASSERT_EQ(gold_h(i), test_view_h(i));
+        } else {
+          const auto error =
+              std::abs(static_cast<double>(gold_h(i) - test_view_h(i)));
+          // Failure message: index, error, tested value, gold value, each
+          // separated by a blank so that the numbers do not run together.
+          ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error
+                                  << " " << static_cast<double>(test_view_h(i))
+                                  << " " << static_cast<double>(gold_h(i));
+        }
+      }
+    }
+  }
+
+  // Overload for callers that did not supply a binary op: verifies against a
+  // scan that uses SumFunctor<ValueType>.
+  template <class ViewType1, class ViewType2, class ValueType>
+  void operator()(ViewType1 data_view,  // contains data
+                  ViewType2 test_view,  // the view to test
+                  ValueType init_value) {
+    (*this)(data_view, test_view, init_value, SumFunctor<ValueType>());
+  }
+};
+
 std::string value_type_to_string(int) { return "int"; }

 std::string value_type_to_string(double) { return "double"; }

-template <class Tag, class ValueType, class InfoType>
-void run_single_scenario_default_op(const InfoType& scenario_info,
-                                    ValueType init_value) {
-  using default_op           = SumFunctor<ValueType>;
+template <class Tag, class ValueType, class InfoType, class... OpOrEmpty>
+void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
+                         OpOrEmpty... empty_or_op) {
  const auto name            = std::get<0>(scenario_info);
  const std::size_t view_ext = std::get<1>(scenario_info);
-  // std::cout << "exclusive_scan default op: " << name << ", "
-  //           << view_tag_to_string(Tag{}) << ", "
-  //           << value_type_to_string(ValueType()) << ", "
-  //           << "init = " << init_value << std::endl;

  auto view_dest = create_view<ValueType>(Tag{}, view_ext, "exclusive_scan");
  auto view_from = create_view<ValueType>(Tag{}, view_ext, "exclusive_scan");
  fill_view(view_from, name);
+  // view_dest is filled with zeros every time before calling the algorithm,
+  // to ensure the algorithm does something meaningful

  {
    fill_zero(view_dest);
    auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from),
                                KE::cend(view_from), KE::begin(view_dest),
-                                init_value);
+                                init_value, empty_or_op...);
    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, init_value, default_op());
+    VerifyData()(view_from, view_dest, init_value, empty_or_op...);
  }

  {
    fill_zero(view_dest);
    auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from),
                                KE::cend(view_from), KE::begin(view_dest),
-                                init_value);
+                                init_value, empty_or_op...);
    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, init_value, default_op());
+    VerifyData()(view_from, view_dest, init_value, empty_or_op...);
  }

  {
    fill_zero(view_dest);
-    auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value);
+    auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value,
+                                empty_or_op...);
    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, init_value, default_op());
+    VerifyData()(view_from, view_dest, init_value, empty_or_op...);
  }

  {
    fill_zero(view_dest);
    auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest,
-                                init_value);
+                                init_value, empty_or_op...);
    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, init_value, default_op());
+    VerifyData()(view_from, view_dest, init_value, empty_or_op...);
  }

  Kokkos::fence();
 }

-template <class Tag, class ValueType, class InfoType, class BinaryOp>
-void run_single_scenario_custom_op(const InfoType& scenario_info,
-                                   ValueType init_value, BinaryOp bop) {
+template <class Tag, class ValueType, class InfoType, class... OpOrEmpty>
+void run_single_scenario_inplace(const InfoType& scenario_info,
+                                 ValueType init_value,
+                                 OpOrEmpty... empty_or_op) {
  const auto name            = std::get<0>(scenario_info);
  const std::size_t view_ext = std::get<1>(scenario_info);
-  // std::cout << "exclusive_scan custom op: " << name << ", "
-  //           << view_tag_to_string(Tag{}) << ", "
-  //           << value_type_to_string(ValueType()) << ", "
-  //           << "init = " << init_value << std::endl;

-  auto view_dest = create_view<ValueType>(Tag{}, view_ext, "exclusive_scan");
-  auto view_from = create_view<ValueType>(Tag{}, view_ext, "exclusive_scan");
-  fill_view(view_from, name);
+  // Since here we call the in-place operation, we need to use two views:
+  // view1: filled according to what the scenario asks for and is not modified
+  // view2: filled according to what the scenario asks for and used for the
+  //        in-place op.
+  // Therefore, after the op is done, view2 should contain the result of doing
+  // the exclusive scan.
+  // NOTE: view2 is filled below every time because the algorithm acts in place

+  auto view1 =
+      create_view<ValueType>(Tag{}, view_ext, "exclusive_scan_inplace_view1");
+  fill_view(view1, name);
+
+  auto view2 =
+      create_view<ValueType>(Tag{}, view_ext, "exclusive_scan_inplace_view2");
  {
-    fill_zero(view_dest);
-    auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from),
-                                KE::cend(view_from), KE::begin(view_dest),
-                                init_value, bop);
-    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, init_value, bop);
+    fill_view(view2, name);
+    auto r = KE::exclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2),
+                                KE::begin(view2), init_value, empty_or_op...);
+    ASSERT_EQ(r, KE::end(view2));
+    VerifyData()(view1, view2, init_value, empty_or_op...);
  }

  {
-    fill_zero(view_dest);
-    auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from),
-                                KE::cend(view_from), KE::begin(view_dest),
-                                init_value, bop);
-    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, init_value, bop);
+    fill_view(view2, name);
+    auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view2),
+                                KE::cend(view2), KE::begin(view2), init_value,
+                                empty_or_op...);
+    ASSERT_EQ(r, KE::end(view2));
+    VerifyData()(view1, view2, init_value, empty_or_op...);
  }

  {
-    fill_zero(view_dest);
-    auto r =
-        KE::exclusive_scan(exespace(), view_from, view_dest, init_value, bop);
-    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, init_value, bop);
+    fill_view(view2, name);
+    auto r = KE::exclusive_scan(exespace(), view2, view2, init_value,
+                                empty_or_op...);
+    ASSERT_EQ(r, KE::end(view2));
+    VerifyData()(view1, view2, init_value, empty_or_op...);
  }

  {
-    fill_zero(view_dest);
-    auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest,
-                                init_value, bop);
-    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, init_value, bop);
+    fill_view(view2, name);
+    auto r = KE::exclusive_scan("label", exespace(), view2, view2, init_value,
+                                empty_or_op...);
+    ASSERT_EQ(r, KE::end(view2));
+    VerifyData()(view1, view2, init_value, empty_or_op...);
  }

  Kokkos::fence();
@ -303,34 +309,39 @@ void run_exclusive_scan_all_scenarios() {
      {"medium", 1103},      {"large", 10513}};

  for (const auto& it : scenarios) {
-    run_single_scenario_default_op<Tag, ValueType>(it, ValueType{0});
-    run_single_scenario_default_op<Tag, ValueType>(it, ValueType{1});
-    run_single_scenario_default_op<Tag, ValueType>(it, ValueType{-2});
-    run_single_scenario_default_op<Tag, ValueType>(it, ValueType{3});
+    run_single_scenario<Tag, ValueType>(it, ValueType{0});
+    run_single_scenario<Tag, ValueType>(it, ValueType{1});
+    run_single_scenario<Tag, ValueType>(it, ValueType{-2});
+    run_single_scenario<Tag, ValueType>(it, ValueType{3});
+
+    run_single_scenario_inplace<Tag, ValueType>(it, ValueType{0});
+    run_single_scenario_inplace<Tag, ValueType>(it, ValueType{-2});

 #if !defined KOKKOS_ENABLE_OPENMPTARGET
    // custom multiply op is only run for small views otherwise it overflows
    if (it.first == "small-a" || it.first == "small-b") {
      using custom_bop_t = MultiplyFunctor<ValueType>;
-      run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{0},
-                                                    custom_bop_t());
-      run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{1},
-                                                    custom_bop_t());
-      run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{-2},
-                                                    custom_bop_t());
-      run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{3},
-                                                    custom_bop_t());
+      run_single_scenario<Tag, ValueType>(it, ValueType{0}, custom_bop_t());
+      run_single_scenario<Tag, ValueType>(it, ValueType{1}, custom_bop_t());
+      run_single_scenario<Tag, ValueType>(it, ValueType{-2}, custom_bop_t());
+      run_single_scenario<Tag, ValueType>(it, ValueType{3}, custom_bop_t());
+
+      run_single_scenario_inplace<Tag, ValueType>(it, ValueType{0},
+                                                  custom_bop_t());
+      run_single_scenario_inplace<Tag, ValueType>(it, ValueType{-2},
+                                                  custom_bop_t());
    }

    using custom_bop_t = SumFunctor<ValueType>;
-    run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{0},
-                                                  custom_bop_t());
-    run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{1},
-                                                  custom_bop_t());
-    run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{-2},
-                                                  custom_bop_t());
-    run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{3},
-                                                  custom_bop_t());
+    run_single_scenario<Tag, ValueType>(it, ValueType{0}, custom_bop_t());
+    run_single_scenario<Tag, ValueType>(it, ValueType{1}, custom_bop_t());
+    run_single_scenario<Tag, ValueType>(it, ValueType{-2}, custom_bop_t());
+    run_single_scenario<Tag, ValueType>(it, ValueType{3}, custom_bop_t());
+
+    run_single_scenario_inplace<Tag, ValueType>(it, ValueType{0},
+                                                custom_bop_t());
+    run_single_scenario_inplace<Tag, ValueType>(it, ValueType{-2},
+                                                custom_bop_t());
 #endif
  }
 }
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp
@ -16,6 +16,7 @@

 #include <TestStdAlgorithmsCommon.hpp>
 #include <utility>
+#include <iomanip>

 namespace Test {
 namespace stdalgos {
@ -143,51 +144,6 @@ void my_host_inclusive_scan(it1 first, it1 last, it2 dest, BinOp bop,
  }
 }

-template <class ViewType1, class ViewType2, class BinaryOp, class... Args>
-void verify_data(ViewType1 data_view,  // contains data
-                 ViewType2 test_view,  // the view to test
-                 BinaryOp bop, Args... args /* copy on purpose */) {
-  //! always careful because views might not be deep copyable
-
-  auto data_view_dc = create_deep_copyable_compatible_clone(data_view);
-  auto data_view_h =
-      create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc);
-
-  using gold_view_value_type = typename ViewType2::value_type;
-  Kokkos::View<gold_view_value_type*, Kokkos::HostSpace> gold_h(
-      "goldh", data_view.extent(0));
-  my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h),
-                         KE::begin(gold_h), bop, args...);
-
-  auto test_view_dc = create_deep_copyable_compatible_clone(test_view);
-  auto test_view_h =
-      create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc);
-
-  const auto ext = test_view_h.extent(0);
-  if (ext > 0) {
-    for (std::size_t i = 0; i < ext; ++i) {
-      // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " "
-      //           << gold_h(i) << " " << test_view_h(i) << " "
-      //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
-
-      if (std::is_same<gold_view_value_type, int>::value) {
-        ASSERT_EQ(gold_h(i), test_view_h(i));
-      } else {
-        const auto error =
-            std::abs(static_cast<double>(gold_h(i) - test_view_h(i)));
-        if (error > 1e-10) {
-          std::cout << i << " " << std::setprecision(15) << data_view_h(i)
-                    << " " << gold_h(i) << " " << test_view_h(i) << " "
-                    << std::abs(static_cast<double>(gold_h(i) - test_view_h(i)))
-                    << std::endl;
-        }
-        EXPECT_LT(error, 1e-10);
-      }
-    }
-    // std::cout << " last el: " << test_view_h(ext-1) << std::endl;
-  }
-}
-
 template <class ValueType>
 struct MultiplyFunctor {
  KOKKOS_INLINE_FUNCTION
@ -204,107 +160,151 @@ struct SumFunctor {
  }
 };

+// Host-side checker for the result of an inclusive scan. It is a functor with
+// two call operators so that callers can forward an optional binary op and an
+// optional init value.
+struct VerifyData {
+  // Computes a reference ("gold") result on the host by running
+  // my_host_inclusive_scan over a host copy of data_view with bop and the
+  // optional trailing args, then compares it element by element with a host
+  // copy of test_view. When the value type of test_view is int the comparison
+  // is exact; otherwise the absolute difference must be below 1e-10.
+  template <class ViewType1, class ViewType2, class BinaryOp, class... Args>
+  void operator()(ViewType1 data_view,  // contains data
+                  ViewType2 test_view,  // the view to test
+                  BinaryOp bop, Args... args /* copy on purpose */) {
+    //! always careful because views might not be deep copyable
+
+    auto data_view_dc = create_deep_copyable_compatible_clone(data_view);
+    auto data_view_h =
+        create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc);
+
+    using gold_view_value_type = typename ViewType2::value_type;
+    Kokkos::View<gold_view_value_type*, Kokkos::HostSpace> gold_h(
+        "goldh", data_view.extent(0));
+    my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h),
+                           KE::begin(gold_h), bop, args...);
+
+    auto test_view_dc = create_deep_copyable_compatible_clone(test_view);
+    auto test_view_h =
+        create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc);
+
+    const auto ext = test_view_h.extent(0);
+    if (ext > 0) {
+      for (std::size_t i = 0; i < ext; ++i) {
+        if (std::is_same<gold_view_value_type, int>::value) {
+          ASSERT_EQ(gold_h(i), test_view_h(i));
+        } else {
+          const auto error =
+              std::abs(static_cast<double>(gold_h(i) - test_view_h(i)));
+          // Failure message: index, error, tested value, gold value, each
+          // separated by a blank so that the numbers do not run together.
+          ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error
+                                  << " " << static_cast<double>(test_view_h(i))
+                                  << " " << static_cast<double>(gold_h(i));
+        }
+      }
+    }
+  }
+
+  // Overload for callers that did not supply a binary op: verifies against a
+  // scan that uses SumFunctor over the non-const value type of data_view.
+  template <class ViewType1, class ViewType2>
+  void operator()(ViewType1 data_view,  // contains data
+                  ViewType2 test_view)  // the view to test
+  {
+    using value_type = typename ViewType1::non_const_value_type;
+    (*this)(data_view, test_view, SumFunctor<value_type>());
+  }
+};
+
 std::string value_type_to_string(int) { return "int"; }
 std::string value_type_to_string(double) { return "double"; }

-template <class Tag, class ValueType, class InfoType>
-void run_single_scenario_default_op(const InfoType& scenario_info) {
-  using default_op           = SumFunctor<ValueType>;
+template <class Tag, class ValueType, class InfoType, class... Args>
+void run_single_scenario(const InfoType& scenario_info,
+                         Args... args /* copy on purpose */) {
  const auto name            = std::get<0>(scenario_info);
  const std::size_t view_ext = std::get<1>(scenario_info);
-  // std::cout << "inclusive_scan default op: " << name << ", "
-  //           << view_tag_to_string(Tag{}) << ", "
-  //           << value_type_to_string(ValueType()) << std::endl;

  auto view_dest = create_view<ValueType>(Tag{}, view_ext, "inclusive_scan");
  auto view_from = create_view<ValueType>(Tag{}, view_ext, "inclusive_scan");
  fill_view(view_from, name);
+  // view_dest is filled with zeros every time before calling the algorithm,
+  // to ensure the algorithm does something meaningful

  {
    fill_zero(view_dest);
-    auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from),
-                                KE::cend(view_from), KE::begin(view_dest));
+    auto r =
+        KE::inclusive_scan(exespace(), KE::cbegin(view_from),
+                           KE::cend(view_from), KE::begin(view_dest), args...);
    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, default_op());
+    VerifyData()(view_from, view_dest, args...);
  }

  {
    fill_zero(view_dest);
-    auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from),
-                                KE::cend(view_from), KE::begin(view_dest));
+    auto r =
+        KE::inclusive_scan("label", exespace(), KE::cbegin(view_from),
+                           KE::cend(view_from), KE::begin(view_dest), args...);
    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, default_op());
+    VerifyData()(view_from, view_dest, args...);
  }

  {
    fill_zero(view_dest);
-    auto r = KE::inclusive_scan(exespace(), view_from, view_dest);
+    auto r = KE::inclusive_scan(exespace(), view_from, view_dest, args...);
    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, default_op());
+    VerifyData()(view_from, view_dest, args...);
  }

  {
    fill_zero(view_dest);
-    auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest);
+    auto r =
+        KE::inclusive_scan("label", exespace(), view_from, view_dest, args...);
    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, default_op());
+    VerifyData()(view_from, view_dest, args...);
  }

  Kokkos::fence();
 }

-template <class Tag, class ValueType, class InfoType, class BinaryOp,
-          class... Args>
-void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop,
-                                   Args... args /* copy on purpose */) {
+template <class Tag, class ValueType, class InfoType, class... Args>
+void run_single_scenario_inplace(const InfoType& scenario_info,
+                                 Args... args /* copy on purpose */) {
  const auto name            = std::get<0>(scenario_info);
  const std::size_t view_ext = std::get<1>(scenario_info);

-  // if (1 == sizeof...(Args)) {
-  //   std::cout << "inclusive_scan custom op and init value: " << name << ", "
-  //             << view_tag_to_string(Tag{}) << ", "
-  //             << value_type_to_string(ValueType()) << ", " << std::endl;
-  // } else {
-  //   std::cout << "inclusive_scan custom op: " << name << ", "
-  //             << view_tag_to_string(Tag{}) << ", "
-  //             << value_type_to_string(ValueType()) << ", " << std::endl;
-  // }
+  // Since here we call the in-place operation, we need to use two views:
+  // view1: filled according to what the scenario asks for and is not modified
+  // view2: filled according to what the scenario asks for and used for the
+  //        in-place op.
+  // Therefore, after the op is done, view2 should contain the result of doing
+  // the inclusive scan.
+  // NOTE: view2 is filled below every time because the algorithm acts in place

-  auto view_dest = create_view<ValueType>(Tag{}, view_ext, "inclusive_scan");
-  auto view_from = create_view<ValueType>(Tag{}, view_ext, "inclusive_scan");
-  fill_view(view_from, name);
+  auto view1 =
+      create_view<ValueType>(Tag{}, view_ext, "inclusive_scan_inplace_view1");
+  fill_view(view1, name);
+
+  auto view2 =
+      create_view<ValueType>(Tag{}, view_ext, "inclusive_scan_inplace_view2");

  {
-    fill_zero(view_dest);
-    auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from),
-                                KE::cend(view_from), KE::begin(view_dest), bop,
-                                args...);
-    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, bop, args...);
+    fill_view(view2, name);
+    auto r = KE::inclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2),
+                                KE::begin(view2), args...);
+    ASSERT_EQ(r, KE::end(view2));
+    VerifyData()(view1, view2, args...);
  }

  {
-    fill_zero(view_dest);
-    auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from),
-                                KE::cend(view_from), KE::begin(view_dest), bop,
-                                args...);
-    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, bop, args...);
+    fill_view(view2, name);
+    auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view2),
+                                KE::cend(view2), KE::begin(view2), args...);
+    ASSERT_EQ(r, KE::end(view2));
+    VerifyData()(view1, view2, args...);
  }

  {
-    fill_zero(view_dest);
-    auto r = KE::inclusive_scan(exespace(), view_from, view_dest, bop, args...);
-    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, bop, args...);
+    fill_view(view2, name);
+    auto r = KE::inclusive_scan(exespace(), view2, view2, args...);
+    ASSERT_EQ(r, KE::end(view2));
+    VerifyData()(view1, view2, args...);
  }

  {
-    fill_zero(view_dest);
-    auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest, bop,
-                                args...);
-    ASSERT_EQ(r, KE::end(view_dest));
-    verify_data(view_from, view_dest, bop, args...);
+    fill_view(view2, name);
+    auto r = KE::inclusive_scan("label", exespace(), view2, view2, args...);
+    ASSERT_EQ(r, KE::end(view2));
+    VerifyData()(view1, view2, args...);
  }

  Kokkos::fence();
@ -318,27 +318,35 @@ void run_inclusive_scan_all_scenarios() {
      {"medium-a", 313},     {"medium-b", 1103}, {"large", 10513}};

  for (const auto& it : scenarios) {
-    run_single_scenario_default_op<Tag, ValueType>(it);
+    run_single_scenario<Tag, ValueType>(it);
+    run_single_scenario_inplace<Tag, ValueType>(it);

 #if !defined KOKKOS_ENABLE_OPENMPTARGET
    // the sum custom op is always run
    using sum_binary_op = SumFunctor<ValueType>;
    sum_binary_op sbop;
-    run_single_scenario_custom_op<Tag, ValueType>(it, sbop);
-    run_single_scenario_custom_op<Tag, ValueType>(it, sbop, ValueType{0});
-    run_single_scenario_custom_op<Tag, ValueType>(it, sbop, ValueType{1});
-    run_single_scenario_custom_op<Tag, ValueType>(it, sbop, ValueType{-2});
-    run_single_scenario_custom_op<Tag, ValueType>(it, sbop, ValueType{3});
+    run_single_scenario<Tag, ValueType>(it, sbop);
+    run_single_scenario<Tag, ValueType>(it, sbop, ValueType{0});
+    run_single_scenario<Tag, ValueType>(it, sbop, ValueType{1});
+    run_single_scenario<Tag, ValueType>(it, sbop, ValueType{-2});
+    run_single_scenario<Tag, ValueType>(it, sbop, ValueType{3});
+
+    run_single_scenario_inplace<Tag, ValueType>(it, sbop, ValueType{0});
+    run_single_scenario_inplace<Tag, ValueType>(it, sbop, ValueType{-2});

    // custom multiply only for small views to avoid overflows
    if (it.first == "small-a" || it.first == "small-b") {
      using mult_binary_op = MultiplyFunctor<ValueType>;
      mult_binary_op mbop;
-      run_single_scenario_custom_op<Tag, ValueType>(it, mbop);
-      run_single_scenario_custom_op<Tag, ValueType>(it, mbop, ValueType{0});
-      run_single_scenario_custom_op<Tag, ValueType>(it, mbop, ValueType{1});
-      run_single_scenario_custom_op<Tag, ValueType>(it, mbop, ValueType{-2});
-      run_single_scenario_custom_op<Tag, ValueType>(it, mbop, ValueType{3});
+      run_single_scenario<Tag, ValueType>(it, mbop);
+      run_single_scenario<Tag, ValueType>(it, mbop, ValueType{0});
+      run_single_scenario<Tag, ValueType>(it, mbop, ValueType{1});
+      run_single_scenario<Tag, ValueType>(it, mbop, ValueType{-2});
+      run_single_scenario<Tag, ValueType>(it, mbop, ValueType{3});
+
+      run_single_scenario_inplace<Tag, ValueType>(it, mbop);
+      run_single_scenario_inplace<Tag, ValueType>(it, mbop, ValueType{0});
+      run_single_scenario_inplace<Tag, ValueType>(it, mbop, ValueType{-2});
    }
 #endif
  }
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp
@ -146,7 +146,7 @@ void run_single_scenario(const InfoType& scenario_info) {
  resultsA[3]     = KE::is_sorted("label", exespace(), view);
  const auto allA = std::all_of(resultsA.cbegin(), resultsA.cend(),
                                [=](bool v) { return v == gold; });
-  EXPECT_TRUE(allA);
+  EXPECT_TRUE(allA) << name << ", " << view_tag_to_string(Tag{});

 #if !defined KOKKOS_ENABLE_OPENMPTARGET
  CustomLessThanComparator<ValueType, ValueType> comp;
@ -159,7 +159,7 @@ void run_single_scenario(const InfoType& scenario_info) {
  resultsB[3]     = KE::is_sorted("label", exespace(), view, comp);
  const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(),
                                [=](bool v) { return v == gold; });
-  EXPECT_TRUE(allB);
+  EXPECT_TRUE(allB) << name << ", " << view_tag_to_string(Tag{});
 #endif

  Kokkos::fence();
@ -173,9 +173,6 @@ void run_is_sorted_all_scenarios() {
      {"medium-a", 1003},    {"medium-b", 1003}, {"large-a", 101513},
      {"large-b", 101513}};

-  std::cout << "is_sorted: " << view_tag_to_string(Tag{})
-            << ", all overloads \n";
-
  for (const auto& it : scenarios) {
    run_single_scenario<Tag, ValueType>(it);
  }
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp
@ -145,10 +145,10 @@ void run_single_scenario(const InfoType& scenario_info) {
      KE::is_sorted_until("label", exespace(), KE::begin(view), KE::end(view));
  auto r3 = KE::is_sorted_until(exespace(), view);
  auto r4 = KE::is_sorted_until("label", exespace(), view);
-  ASSERT_EQ(r1, gold);
-  ASSERT_EQ(r2, gold);
-  ASSERT_EQ(r3, gold);
-  ASSERT_EQ(r4, gold);
+  ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{});
+  ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{});
+  ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{});
+  ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{});

 #if !defined KOKKOS_ENABLE_OPENMPTARGET
  CustomLessThanComparator<ValueType, ValueType> comp;
@ -160,10 +160,10 @@ void run_single_scenario(const InfoType& scenario_info) {
  auto r8 = KE::is_sorted_until("label", exespace(), view, comp);
 #endif

-  ASSERT_EQ(r1, gold);
-  ASSERT_EQ(r2, gold);
-  ASSERT_EQ(r3, gold);
-  ASSERT_EQ(r4, gold);
+  ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{});
+  ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{});
+  ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{});
+  ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{});

  Kokkos::fence();
 }
@ -176,9 +176,6 @@ void run_is_sorted_until_all_scenarios() {
      {"medium-a", 1003},    {"medium-b", 1003}, {"large-a", 101513},
      {"large-b", 101513}};

-  std::cout << "is_sorted_until: " << view_tag_to_string(Tag{})
-            << ", all overloads \n";
-
  for (const auto& it : scenarios) {
    run_single_scenario<Tag, ValueType>(it);
  }
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp
@ -48,7 +48,7 @@ struct MyMovableType {
 TEST(std_algorithms_mod_ops_test, move) {
  MyMovableType a;
  using move_t = decltype(std::move(a));
-  static_assert(std::is_rvalue_reference<move_t>::value, "");
+  static_assert(std::is_rvalue_reference<move_t>::value);

  // move constr
  MyMovableType b(std::move(a));
@ -70,7 +70,7 @@ struct StdAlgoModSeqOpsTestMove {
  void operator()(const int index) const {
    typename ViewType::value_type a{11};
    using move_t = decltype(std::move(a));
-    static_assert(std::is_rvalue_reference<move_t>::value, "");
+    static_assert(std::is_rvalue_reference<move_t>::value);
    m_view(index) = std::move(a);
  }

@ -89,50 +89,6 @@ TEST(std_algorithms_mod_ops_test, move_within_parfor) {
  }
 }

-// ------------
-// swap
-// ------------
-TEST(std_algorithms_mod_ops_test, swap) {
-  {
-    int a = 1;
-    int b = 2;
-    KE::swap(a, b);
-    ASSERT_EQ(a, 2);
-    ASSERT_EQ(b, 1);
-  }
-
-  {
-    double a = 3.;
-    double b = 1.;
-    KE::swap(a, b);
-    EXPECT_DOUBLE_EQ(a, 1.);
-    EXPECT_DOUBLE_EQ(b, 3.);
-  }
-}
-
-template <class ViewType>
-struct StdAlgoModSeqOpsTestSwap {
-  ViewType m_view;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const int index) const {
-    typename ViewType::value_type newval{11};
-    KE::swap(m_view(index), newval);
-  }
-
-  StdAlgoModSeqOpsTestSwap(ViewType aIn) : m_view(aIn) {}
-};
-
-TEST(std_algorithms_mod_ops_test, swap_within_parfor) {
-  auto a = create_view<double>(stdalgos::DynamicTag{}, 10, "a");
-  StdAlgoModSeqOpsTestSwap<decltype(a)> fnc(a);
-  Kokkos::parallel_for(a.extent(0), fnc);
-  auto a_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a);
-  for (std::size_t i = 0; i < a.extent(0); ++i) {
-    EXPECT_DOUBLE_EQ(a_h(0), 11.);
-  }
-}
-
 // ------------
 // iter_swap
 // ------------
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp
@ -110,11 +110,9 @@ void verify_data(const std::string& name, ResultType my_result,
                 ViewTypeDestFalse view_dest_false, PredType pred) {
  using value_type = typename ViewTypeFrom::value_type;
  static_assert(
-      std::is_same<value_type, typename ViewTypeDestTrue::value_type>::value,
-      "");
+      std::is_same<value_type, typename ViewTypeDestTrue::value_type>::value);
  static_assert(
-      std::is_same<value_type, typename ViewTypeDestFalse::value_type>::value,
-      "");
+      std::is_same<value_type, typename ViewTypeDestFalse::value_type>::value);

  const std::size_t ext = view_from.extent(0);

--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp
@ -166,6 +166,10 @@ void run_all_scenarios() {
 }

 TEST(std_algorithms_copy_if_team_test, test) {
+// FIXME_OPENMPTARGET
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)
+  GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs";
+#endif
  run_all_scenarios<DynamicTag, double>();
  run_all_scenarios<StridedTwoRowsTag, int>();
  run_all_scenarios<StridedThreeRowsTag, unsigned>();
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp
@ -121,7 +121,9 @@ struct TestFunctorA {
  }
 };

-template <class LayoutTag, class ValueType>
+struct InPlace {};
+
+template <class LayoutTag, class ValueType, class InPlaceOrVoid = void>
 void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  /* description:
     use a rank-2 view randomly filled with values,
@ -147,9 +149,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  using space_t = Kokkos::DefaultExecutionSpace;
  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());

-  // create the destination view
-  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
-
  // exclusive_scan returns an iterator so to verify that it is correct
  // each team stores the distance of the returned iterator from the beginning
  // of the interval that team operates on and then we check that these
@ -168,12 +167,19 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  rand_pool pool(lowerBound * upperBound);
  Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound);

-  // use CTAD for functor
  auto initValuesView =
      Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h);
-  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
-                   initValuesView, binaryOp, apiId);
-  Kokkos::parallel_for(policy, fnc);
+
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+  if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) {
+    TestFunctorA fnc(sourceView, sourceView, distancesView,
+                     intraTeamSentinelView, initValuesView, binaryOp, apiId);
+    Kokkos::parallel_for(policy, fnc);
+  } else {
+    TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                     initValuesView, binaryOp, apiId);
+    Kokkos::parallel_for(policy, fnc);
+  }

  // -----------------------------------------------
  // run cpp-std kernel and check
@ -223,11 +229,16 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
 #undef exclusive_scan
  }

-  auto dataViewAfterOp_h = create_host_space_copy(destView);
-  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) {
+    auto dataViewAfterOp_h = create_host_space_copy(sourceView);
+    expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  } else {
+    auto dataViewAfterOp_h = create_host_space_copy(destView);
+    expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  }
 }

-template <class LayoutTag, class ValueType>
+template <class LayoutTag, class ValueType, class InPlaceOrVoid = void>
 void run_all_scenarios() {
  for (int numTeams : teamSizesToTest) {
    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
@ -236,16 +247,24 @@ void run_all_scenarios() {
 #else
      for (int apiId : {0, 1}) {
 #endif
-        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+        test_A<LayoutTag, ValueType, InPlaceOrVoid>(numTeams, numCols, apiId);
      }
    }
  }
 }

 TEST(std_algorithms_exclusive_scan_team_test, test) {
+// FIXME_OPENMPTARGET
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)
+  GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs";
+#endif
  run_all_scenarios<DynamicTag, double>();
  run_all_scenarios<StridedTwoRowsTag, int>();
  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+
+  run_all_scenarios<DynamicTag, double, InPlace>();
+  run_all_scenarios<StridedTwoRowsTag, int, InPlace>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned, InPlace>();
 }

 }  // namespace TeamExclusiveScan
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp
@ -139,7 +139,9 @@ struct TestFunctorA {
  }
 };

-template <class LayoutTag, class ValueType>
+struct InPlace {};
+
+template <class LayoutTag, class ValueType, class InPlaceOrVoid = void>
 void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  /* description:
     use a rank-2 view randomly filled with values,
@ -165,9 +167,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  using space_t = Kokkos::DefaultExecutionSpace;
  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());

-  // create the destination view
-  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
-
  // inclusive_scan returns an iterator so to verify that it is correct
  // each team stores the distance of the returned iterator from the beginning
  // of the interval that team operates on and then we check that these
@ -186,12 +185,20 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  rand_pool pool(lowerBound * upperBound);
  Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound);

-  // use CTAD for functor
  auto initValuesView =
      Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h);
-  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
-                   initValuesView, binaryOp, apiId);
-  Kokkos::parallel_for(policy, fnc);
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+  if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) {
+    TestFunctorA fnc(sourceView, sourceView, distancesView,
+                     intraTeamSentinelView, initValuesView, binaryOp, apiId);
+    Kokkos::parallel_for(policy, fnc);
+  } else {
+    TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                     initValuesView, binaryOp, apiId);
+    Kokkos::parallel_for(policy, fnc);
+  }

  // -----------------------------------------------
  // run cpp-std kernel and check
@ -251,25 +258,38 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
 #undef inclusive_scan
  }

-  auto dataViewAfterOp_h = create_host_space_copy(destView);
-  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) {
+    auto dataViewAfterOp_h = create_host_space_copy(sourceView);
+    expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  } else {
+    auto dataViewAfterOp_h = create_host_space_copy(destView);
+    expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  }
 }

-template <class LayoutTag, class ValueType>
+template <class LayoutTag, class ValueType, class InPlaceOrVoid = void>
 void run_all_scenarios() {
  for (int numTeams : teamSizesToTest) {
    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
      for (int apiId : {0, 1, 2, 3, 4, 5}) {
-        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+        test_A<LayoutTag, ValueType, InPlaceOrVoid>(numTeams, numCols, apiId);
      }
    }
  }
 }

 TEST(std_algorithms_inclusive_scan_team_test, test) {
+// FIXME_OPENMPTARGET
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)
+  GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs";
+#endif
  run_all_scenarios<DynamicTag, double>();
  run_all_scenarios<StridedTwoRowsTag, int>();
  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+
+  run_all_scenarios<DynamicTag, double, InPlace>();
+  run_all_scenarios<StridedTwoRowsTag, int, InPlace>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned, InPlace>();
 }

 }  // namespace TeamInclusiveScan
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp
@ -212,6 +212,10 @@ void run_all_scenarios() {
 }

 TEST(std_algorithms_remove_copy_team_test, test) {
+// FIXME_OPENMPTARGET
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)
+  GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs";
+#endif
  run_all_scenarios<DynamicTag, double>();
  run_all_scenarios<StridedTwoRowsTag, int>();
  run_all_scenarios<StridedThreeRowsTag, unsigned>();
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp
@ -168,6 +168,10 @@ void run_all_scenarios() {
 }

 TEST(std_algorithms_remove_copy_if_team_test, test) {
+// FIXME_OPENMPTARGET
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)
+  GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs";
+#endif
  run_all_scenarios<DynamicTag, double>();
  run_all_scenarios<StridedTwoRowsTag, int>();
  run_all_scenarios<StridedThreeRowsTag, unsigned>();
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp
@ -108,7 +108,9 @@ struct TestFunctorA {
  }
 };

-template <class LayoutTag, class ValueType>
+struct InPlace {};
+
+template <class LayoutTag, class ValueType, class InPlaceOrVoid = void>
 void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  /* description:
     use a rank-2 view randomly filled with values,
@ -134,9 +136,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  using space_t = Kokkos::DefaultExecutionSpace;
  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());

-  // create the destination view
-  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
-
  // tranform_exclusive_scan returns an iterator so to verify that it is correct
  // each team stores the distance of the returned iterator from the beginning
  // of the interval that team operates on and then we check that these
@ -156,12 +155,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  rand_pool pool(lowerBound * upperBound);
  Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound);

-  // use CTAD for functor
  auto initValuesView =
      Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h);
-  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
-                   initValuesView, binaryOp, unaryOp, apiId);
-  Kokkos::parallel_for(policy, fnc);
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+  if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) {
+    TestFunctorA fnc(sourceView, sourceView, distancesView,
+                     intraTeamSentinelView, initValuesView, binaryOp, unaryOp,
+                     apiId);
+    Kokkos::parallel_for(policy, fnc);
+  } else {
+    TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                     initValuesView, binaryOp, unaryOp, apiId);
+    Kokkos::parallel_for(policy, fnc);
+  }

  // -----------------------------------------------
  // run cpp-std kernel and check
@ -200,16 +208,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
 #undef transform_exclusive_scan
  }

-  auto dataViewAfterOp_h = create_host_space_copy(destView);
-  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) {
+    auto dataViewAfterOp_h = create_host_space_copy(sourceView);
+    expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  } else {
+    auto dataViewAfterOp_h = create_host_space_copy(destView);
+    expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  }
 }

-template <class LayoutTag, class ValueType>
+template <class LayoutTag, class ValueType, class InPlaceOrVoid = void>
 void run_all_scenarios() {
  for (int numTeams : teamSizesToTest) {
    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
      for (int apiId : {0, 1}) {
-        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+        test_A<LayoutTag, ValueType, InPlaceOrVoid>(numTeams, numCols, apiId);
      }
    }
  }
@ -219,6 +232,10 @@ TEST(std_algorithms_transform_exclusive_scan_team_test, test) {
  run_all_scenarios<DynamicTag, double>();
  run_all_scenarios<StridedTwoRowsTag, int>();
  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+
+  run_all_scenarios<DynamicTag, double, InPlace>();
+  run_all_scenarios<StridedTwoRowsTag, int, InPlace>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned, InPlace>();
 }

 }  // namespace TeamTransformExclusiveScan
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp
@ -131,7 +131,9 @@ struct TestFunctorA {
  }
 };

-template <class LayoutTag, class ValueType>
+struct InPlace {};
+
+template <class LayoutTag, class ValueType, class InPlaceOrVoid = void>
 void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  /* description:
     use a rank-2 view randomly filled with values,
@ -157,9 +159,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  using space_t = Kokkos::DefaultExecutionSpace;
  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());

-  // create the destination view
-  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
-
  // tranform_inclusive_scan returns an iterator so to verify that it is correct
  // each team stores the distance of the returned iterator from the beginning
  // of the interval that team operates on and then we check that these
@ -179,12 +178,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  rand_pool pool(lowerBound * upperBound);
  Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound);

-  // use CTAD for functor
  auto initValuesView =
      Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h);
-  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
-                   initValuesView, binaryOp, unaryOp, apiId);
-  Kokkos::parallel_for(policy, fnc);
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+  if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) {
+    TestFunctorA fnc(sourceView, sourceView, distancesView,
+                     intraTeamSentinelView, initValuesView, binaryOp, unaryOp,
+                     apiId);
+    Kokkos::parallel_for(policy, fnc);
+  } else {
+    TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                     initValuesView, binaryOp, unaryOp, apiId);
+    Kokkos::parallel_for(policy, fnc);
+  }

  // -----------------------------------------------
  // run cpp-std kernel and check
@ -236,16 +244,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
  }
 #undef transform_inclusive_scan

-  auto dataViewAfterOp_h = create_host_space_copy(destView);
-  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) {
+    auto dataViewAfterOp_h = create_host_space_copy(sourceView);
+    expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  } else {
+    auto dataViewAfterOp_h = create_host_space_copy(destView);
+    expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+  }
 }

-template <class LayoutTag, class ValueType>
+template <class LayoutTag, class ValueType, class InPlaceOrVoid = void>
 void run_all_scenarios() {
  for (int numTeams : teamSizesToTest) {
    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
      for (int apiId : {0, 1, 2, 3}) {
-        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+        test_A<LayoutTag, ValueType, InPlaceOrVoid>(numTeams, numCols, apiId);
      }
    }
  }
@ -255,6 +268,10 @@ TEST(std_algorithms_transform_inclusive_scan_team_test, test) {
  run_all_scenarios<DynamicTag, double>();
  run_all_scenarios<StridedTwoRowsTag, int>();
  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+
+  run_all_scenarios<DynamicTag, double, InPlace>();
+  run_all_scenarios<StridedTwoRowsTag, int, InPlace>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned, InPlace>();
 }

 }  // namespace TeamTransformInclusiveScan
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp
@ -186,6 +186,10 @@ void run_all_scenarios() {
 }

 TEST(std_algorithms_unique_copy_team_test, test) {
+  // FIXME_OPENMPTARGET
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)
+  GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs";
+#endif
  run_all_scenarios<DynamicTag, int>();
  run_all_scenarios<StridedTwoRowsTag, int>();
  run_all_scenarios<StridedThreeRowsTag, int>();
--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp
@ -16,6 +16,7 @@

 #include <TestStdAlgorithmsCommon.hpp>
 #include <utility>
+#include <iomanip>

 namespace Test {
 namespace stdalgos {
@ -160,24 +161,15 @@ void verify_data(ViewType1 data_view,  // contains data
      create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc);
  if (test_view_h.extent(0) > 0) {
    for (std::size_t i = 0; i < test_view_h.extent(0); ++i) {
-      // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " "
-      //           << gold_h(i) << " " << test_view_h(i) << " "
-      //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
-
      if (std::is_same<gold_view_value_type, int>::value) {
        ASSERT_EQ(gold_h(i), test_view_h(i));
      } else {
        const auto error = std::abs(gold_h(i) - test_view_h(i));
-        if (error > 1e-10) {
-          std::cout << i << " " << std::setprecision(15) << data_view_h(i)
-                    << " " << gold_h(i) << " " << test_view_h(i) << " "
-                    << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
-        }
-        EXPECT_LT(error, 1e-10);
+        ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error
+                                << static_cast<double>(test_view_h(i)) << " "
+                                << static_cast<double>(gold_h(i));
      }
    }
-    // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) <<
-    // std::endl;
  }
 }

@ -205,17 +197,13 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
                         BinaryOp bop, UnaryOp uop) {
  const auto name            = std::get<0>(scenario_info);
  const std::size_t view_ext = std::get<1>(scenario_info);
-  // std::cout << "transform_exclusive_scan custom op: " << name << ", "
-  //           << view_tag_to_string(Tag{}) << ", "
-  //           << value_type_to_string(ValueType()) << ", "
-  //           << "init = " << init_value << std::endl;

-  auto view_dest =
-      create_view<ValueType>(Tag{}, view_ext, "transform_exclusive_scan");
-  auto view_from =
-      create_view<ValueType>(Tag{}, view_ext, "transform_exclusive_scan");
+  auto view_from = create_view<ValueType>(Tag{}, view_ext,
+                                          "transform_exclusive_scan_view_from");
  fill_view(view_from, name);

+  auto view_dest = create_view<ValueType>(Tag{}, view_ext,
+                                          "transform_exclusive_scan_view_dest");
  {
    fill_zero(view_dest);
    auto r = KE::transform_exclusive_scan(
@ -253,6 +241,65 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
  Kokkos::fence();
 }

+template <class Tag, class ValueType, class InfoType, class BinaryOp,
+          class UnaryOp>
+void run_single_scenario_inplace(const InfoType& scenario_info,
+                                 ValueType init_value, BinaryOp bop,
+                                 UnaryOp uop) {
+  const auto name            = std::get<0>(scenario_info);
+  const std::size_t view_ext = std::get<1>(scenario_info);
+
+  // since here we call the in-place operation, we need to use two views:
+  // view1: filled according to what the scenario asks for and is not modified
+  // view2: filled according to what the scenario asks for and used for the
+  // in-place op Therefore, after the op is done, view2 should contain the
+  // result of doing exclusive scan NOTE: view2 is filled below every time
+  // because the algorithm acts in place
+
+  auto view1 =
+      create_view<ValueType>(Tag{}, view_ext, "transform_exclusive_scan_view1");
+  fill_view(view1, name);
+
+  auto view2 =
+      create_view<ValueType>(Tag{}, view_ext, "transform_exclusive_scan_view2");
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan(exespace(), KE::cbegin(view2),
+                                          KE::cend(view2), KE::begin(view2),
+                                          init_value, bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan(
+        "label", exespace(), KE::cbegin(view2), KE::cend(view2),
+        KE::begin(view2), init_value, bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan(exespace(), view2, view2, init_value,
+                                          bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan("label", exespace(), view2, view2,
+                                          init_value, bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  Kokkos::fence();
+}
+
 template <class Tag, class ValueType>
 void run_all_scenarios() {
  const std::map<std::string, std::size_t> scenarios = {
@ -267,6 +314,11 @@ void run_all_scenarios() {
    run_single_scenario<Tag, ValueType>(it, ValueType{1}, bop_t(), uop_t());
    run_single_scenario<Tag, ValueType>(it, ValueType{-2}, bop_t(), uop_t());
    run_single_scenario<Tag, ValueType>(it, ValueType{3}, bop_t(), uop_t());
+
+    run_single_scenario_inplace<Tag, ValueType>(it, ValueType{0}, bop_t(),
+                                                uop_t());
+    run_single_scenario_inplace<Tag, ValueType>(it, ValueType{-2}, bop_t(),
+                                                uop_t());
  }
 }

--- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
@ -16,6 +16,7 @@

 #include <TestStdAlgorithmsCommon.hpp>
 #include <utility>
+#include <iomanip>

 namespace Test {
 namespace stdalgos {
@ -172,24 +173,15 @@ void verify_data(ViewType1 data_view,  // contains data
      create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc);
  if (test_view_h.extent(0) > 0) {
    for (std::size_t i = 0; i < test_view_h.extent(0); ++i) {
-      // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " "
-      //           << gold_h(i) << " " << test_view_h(i) << " "
-      //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
-
      if (std::is_same<gold_view_value_type, int>::value) {
        ASSERT_EQ(gold_h(i), test_view_h(i));
      } else {
        const auto error = std::abs(gold_h(i) - test_view_h(i));
-        if (error > 1e-10) {
-          std::cout << i << " " << std::setprecision(15) << data_view_h(i)
-                    << " " << gold_h(i) << " " << test_view_h(i) << " "
-                    << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
-        }
-        EXPECT_LT(error, 1e-10);
+        ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error
+                                << static_cast<double>(test_view_h(i)) << " "
+                                << static_cast<double>(gold_h(i));
      }
    }
-    // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) <<
-    // std::endl;
  }
 }

@ -210,30 +202,11 @@ struct SumBinaryFunctor {
 std::string value_type_to_string(int) { return "int"; }
 std::string value_type_to_string(double) { return "double"; }

-template <class Tag, class BopT, class UopT>
-void print_scenario_details(const std::string& name, BopT bop, UopT uop) {
-  (void)bop;
-  (void)uop;
-  std::cout << "transform_inclusive_scan: " << name << ", "
-            << view_tag_to_string(Tag{}) << std::endl;
-}
-
-template <class Tag, class BopT, class UopT, class ValueType>
-void print_scenario_details(const std::string& name, BopT bop, UopT uop,
-                            ValueType init_value) {
-  (void)bop;
-  (void)uop;
-  std::cout << "transform_inclusive_scan: " << name << ", "
-            << view_tag_to_string(Tag{}) << ", "
-            << "init = " << init_value << std::endl;
-}
-
 template <class Tag, class ValueType, class InfoType, class... Args>
 void run_single_scenario(const InfoType& scenario_info,
                         Args... args /* by value on purpose*/) {
  const auto name            = std::get<0>(scenario_info);
  const std::size_t view_ext = std::get<1>(scenario_info);
-  // print_scenario_details<Tag>(name, args...);

  auto view_dest =
      create_view<ValueType>(Tag{}, view_ext, "transform_inclusive_scan");
@ -278,6 +251,63 @@ void run_single_scenario(const InfoType& scenario_info,
  Kokkos::fence();
 }

+// Exercises the in-place form of KE::transform_inclusive_scan, i.e. the calls
+// where the source range and the destination are the same view. Four overloads
+// are called (iterators or views, each with and without a label) and every
+// result is checked with verify_data().
+//
+// scenario_info: element 0 is the scenario name handed to fill_view(),
+//                element 1 is the extent used to create the views
+// args:          forwarded unchanged to the algorithm and to verify_data()
+template <class Tag, class ValueType, class InfoType, class... Args>
+void run_single_scenario_inplace(const InfoType& scenario_info,
+                                 Args... args /* by value on purpose*/) {
+  const auto name            = std::get<0>(scenario_info);
+  const std::size_t view_ext = std::get<1>(scenario_info);
+
+  // since here we call the in-place operation, we need to use two views:
+  // view_1: filled according to scenario and is not modified
+  // view_2: filled according to scenario and used for the in-place op
+  // Therefore, after the op is done, view_2 should contain the
+  // result of doing the transform inclusive scan of view_1.
+  // NOTE: view_2 must be filled before every call to the algorithm
+  // because the algorithm acts in place
+
+  auto view_1 = create_view<ValueType>(Tag{}, view_ext,
+                                       "transform_inclusive_scan_view_1");
+  fill_view(view_1, name);
+
+  auto view_2 = create_view<ValueType>(Tag{}, view_ext,
+                                       "transform_inclusive_scan_view_2");
+
+  // iterator overload, no label
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan(exespace(), KE::cbegin(view_2),
+                                          KE::cend(view_2), KE::begin(view_2),
+                                          args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  // iterator overload, with label
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan("label", exespace(),
+                                          KE::cbegin(view_2), KE::cend(view_2),
+                                          KE::begin(view_2), args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  // view overload, no label
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan(exespace(), view_2, view_2, args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  // view overload, with label
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan("label", exespace(), view_2, view_2,
+                                          args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  Kokkos::fence();
+}
+
 template <class Tag, class ValueType>
 void run_all_scenarios() {
  const std::map<std::string, std::size_t> scenarios = {
@ -294,15 +324,23 @@ void run_all_scenarios() {
    run_single_scenario<Tag, ValueType>(it, bop_t(), uop_t(), ValueType{2});
    run_single_scenario<Tag, ValueType>(it, bop_t(), uop_t(), ValueType{-1});
    run_single_scenario<Tag, ValueType>(it, bop_t(), uop_t(), ValueType{-2});
+
+    run_single_scenario_inplace<Tag, ValueType>(it, bop_t(), uop_t());
+    run_single_scenario_inplace<Tag, ValueType>(it, bop_t(), uop_t(),
+                                                ValueType{0});
+    run_single_scenario_inplace<Tag, ValueType>(it, bop_t(), uop_t(),
+                                                ValueType{2});
+    run_single_scenario_inplace<Tag, ValueType>(it, bop_t(), uop_t(),
+                                                ValueType{-2});
  }
 }

 #if !defined KOKKOS_ENABLE_OPENMPTARGET
 TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) {
  run_all_scenarios<DynamicTag, double>();
-  // run_all_scenarios<StridedThreeTag, double>();
-  // run_all_scenarios<DynamicTag, int>();
-  // run_all_scenarios<StridedThreeTag, int>();
+  run_all_scenarios<StridedThreeTag, double>();
+  run_all_scenarios<DynamicTag, int>();
+  run_all_scenarios<StridedThreeTag, int>();
 }
 #endif

--- a/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp
@ -83,9 +83,6 @@ auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) {
  static_assert(std::is_same<ExeSpace, Kokkos::HostSpace>::value,
                "test is only enabled for HostSpace");

-  std::cout << "checking reduction with order: " << order_to_string(enValue)
-            << "\n";
-
  using view_value_type = typename ViewType::value_type;
  using reducer_type    = std::conditional_t<
      (flag == 0), Kokkos::MaxFirstLoc<view_value_type, IndexType, ExeSpace>,
@ -132,18 +129,24 @@ TEST(std_algorithms_reducers, max_first_loc) {

  const auto pair1 = run_min_or_max_test<0, hostspace, index_type>(
      view_h, StdReducersTestEnumOrder::LeftToRight);
-  ASSERT_EQ(pair1.first, gold_value);
-  ASSERT_EQ(pair1.second, gold_location);
+  ASSERT_EQ(pair1.first, gold_value)
+      << order_to_string(StdReducersTestEnumOrder::LeftToRight);
+  ASSERT_EQ(pair1.second, gold_location)
+      << order_to_string(StdReducersTestEnumOrder::LeftToRight);

  const auto pair2 = run_min_or_max_test<0, hostspace, index_type>(
      view_h, StdReducersTestEnumOrder::RightToLeft);
-  ASSERT_EQ(pair2.first, gold_value);
-  ASSERT_EQ(pair2.second, gold_location);
+  ASSERT_EQ(pair2.first, gold_value)
+      << order_to_string(StdReducersTestEnumOrder::RightToLeft);
+  ASSERT_EQ(pair2.second, gold_location)
+      << order_to_string(StdReducersTestEnumOrder::RightToLeft);

  const auto pair3 = run_min_or_max_test<0, hostspace, index_type>(
      view_h, StdReducersTestEnumOrder::Random);
-  ASSERT_EQ(pair3.first, gold_value);
-  ASSERT_EQ(pair3.second, gold_location);
+  ASSERT_EQ(pair3.first, gold_value)
+      << order_to_string(StdReducersTestEnumOrder::Random);
+  ASSERT_EQ(pair3.second, gold_location)
+      << order_to_string(StdReducersTestEnumOrder::Random);
 }

 TEST(std_algorithms_reducers, min_first_loc) {
@ -191,9 +194,6 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue,
  static_assert(std::is_same<ExeSpace, Kokkos::HostSpace>::value,
                "test is only enabled for HostSpace");

-  std::cout << "checking reduction with order: " << order_to_string(enValue)
-            << "\n";
-
  using view_value_type = typename ViewType::value_type;
  using reducer_type =
      Kokkos::MinMaxFirstLastLoc<view_value_type, IndexType, ExeSpace>;
@ -212,10 +212,10 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue,
                 reduction_value_type{view(index), view(index), index, index});
  }

-  ASSERT_EQ(red_result.min_val, gold_values.first);
-  ASSERT_EQ(red_result.max_val, gold_values.second);
-  ASSERT_EQ(red_result.min_loc, gold_locs.first);
-  ASSERT_EQ(red_result.max_loc, gold_locs.second);
+  ASSERT_EQ(red_result.min_val, gold_values.first) << order_to_string(enValue);
+  ASSERT_EQ(red_result.max_val, gold_values.second) << order_to_string(enValue);
+  ASSERT_EQ(red_result.min_loc, gold_locs.first) << order_to_string(enValue);
+  ASSERT_EQ(red_result.max_loc, gold_locs.second) << order_to_string(enValue);
 }

 TEST(std_algorithms_reducers, min_max_first_last_loc) {
--- a/lib/kokkos/benchmarks/CMakeLists.txt
+++ b/lib/kokkos/benchmarks/CMakeLists.txt
@ -1 +1,12 @@
+#FIXME_OPENMPTARGET - compiling in debug mode causes ICE.
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic)
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather)
 KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups)
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency)
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream)
+
+#FIXME_OPENMPTARGET - These two benchmarks cause an ICE (internal compiler error). They are disabled for OpenMPTarget builds for now; a deeper analysis of the cause and a possible fix will follow.
+IF(NOT Kokkos_ENABLE_OPENMPTARGET)
+    KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance)
+    KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops)
+ENDIF()
--- a/lib/kokkos/benchmarks/atomic/CMakeLists.txt
+++ b/lib/kokkos/benchmarks/atomic/CMakeLists.txt
@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  atomic
+  SOURCES main.cpp
+)
--- a/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt
+++ b/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt
@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  bytes_and_flops
+  SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp
+)
--- a/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp
@ -37,22 +37,22 @@ struct RunStride {
 };

 #define STRIDE 1
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 2
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 4
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 8
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 16
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 32
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE

 template <class Scalar>
--- a/lib/kokkos/benchmarks/bytes_and_flops/bench_double.cpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_double.cpp
@ -14,7 +14,7 @@
 //
 //@HEADER

-#include <bench.hpp>
+#include "bench.hpp"

 template void run_stride_unroll<double>(int N, int K, int R, int D, int U,
                                        int F, int T, int S, int B, int I);
--- a/lib/kokkos/benchmarks/bytes_and_flops/bench_float.cpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_float.cpp
@ -14,7 +14,7 @@
 //
 //@HEADER

-#include <bench.hpp>
+#include "bench.hpp"

 template void run_stride_unroll<float>(int N, int K, int R, int D, int U, int F,
                                       int T, int S, int B, int I);
--- a/lib/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp
@ -14,7 +14,7 @@
 //
 //@HEADER

-#include <bench.hpp>
+#include "bench.hpp"

 template void run_stride_unroll<int32_t>(int N, int K, int R, int D, int U,
                                         int F, int T, int S, int B, int I);
--- a/lib/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp
@ -14,7 +14,7 @@
 //
 //@HEADER

-#include <bench.hpp>
+#include "bench.hpp"

 template void run_stride_unroll<int64_t>(int N, int K, int R, int D, int U,
                                         int F, int T, int S, int B, int I);
--- a/lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp
@ -15,28 +15,28 @@
 //@HEADER

 #define UNROLL 1
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 2
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 3
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 4
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 5
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 6
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 7
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 8
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL

 template <class Scalar>
--- a/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
@ -26,7 +26,7 @@ struct Run<Scalar, UNROLL, STRIDE> {
    Kokkos::deep_copy(C, Scalar(3.5));

    Kokkos::Timer timer;
-    for (int i = 0; i < I; ++i) {
+    for (int iter = 0; iter < I; ++iter) {
      Kokkos::parallel_for(
          "BenchmarkKernel",
          Kokkos::TeamPolicy<>(N, T).set_scratch_size(0, Kokkos::PerTeam(S)),
--- a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp
@ -16,7 +16,7 @@

 #include <Kokkos_Core.hpp>
 #include <Kokkos_Timer.hpp>
-#include <bench.hpp>
+#include "bench.hpp"
 #include <cstdlib>

 extern template void run_stride_unroll<float>(int, int, int, int, int, int, int,
@ -86,7 +86,7 @@ int main(int argc, char* argv[]) {
    printf("D must be one of 1,2,4,8,16,32\n");
    return 0;
  }
-  if ((P < 1) && (P > 2)) {
+  if ((P < 1) || (P > 4)) {
    printf("P must be one of 1,2,3,4\n");
    return 0;
  }
--- a/lib/kokkos/benchmarks/gather/CMakeLists.txt
+++ b/lib/kokkos/benchmarks/gather/CMakeLists.txt
@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  gather
+  SOURCES main.cpp
+)
--- a/lib/kokkos/benchmarks/gather/gather.hpp
+++ b/lib/kokkos/benchmarks/gather/gather.hpp
@ -20,28 +20,28 @@ struct RunGather {
 };

 #define UNROLL 1
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 2
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 3
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 4
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 5
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 6
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 7
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 8
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL

 template <class Scalar>
--- a/lib/kokkos/benchmarks/gather/gather_unroll.hpp
+++ b/lib/kokkos/benchmarks/gather/gather_unroll.hpp
@ -138,7 +138,7 @@ struct RunGather<Scalar, UNROLL> {
    printf(
        "SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: "
        "%lf GGather/s: %lf\n",
-        sizeof(Scalar) / 4, N, K, D, R, UNROLL, F, seconds,
+        static_cast<int>(sizeof(Scalar) / 4), N, K, D, R, UNROLL, F, seconds,
        1.0 * bytes / seconds / 1024 / 1024 / 1024, 1.e-9 * flops / seconds,
        1.e-9 * gather_ops / seconds);
  }
--- a/lib/kokkos/benchmarks/gather/main.cpp
+++ b/lib/kokkos/benchmarks/gather/main.cpp
@ -16,7 +16,7 @@

 #include <Kokkos_Core.hpp>
 #include <Kokkos_Timer.hpp>
-#include <gather.hpp>
+#include "gather.hpp"
 #include <cstdlib>

 int main(int argc, char* argv[]) {
--- a/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt
+++ b/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt
@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  launch_latency
+  SOURCES launch_latency.cpp
+)
--- a/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp
+++ b/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp
@ -0,0 +1,283 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+/*! \file launch_latency.cpp
+
+    Tests of parallel_for and parallel_reduce latency for different
+   circumstances.
+
+    Three launch kinds are tested: parallel_for, parallel_reduce into scalar,
+   and parallel_reduce into view
+
+   N controls how large the parallel loop is
+   V controls how large the functor is
+   M controls across how many launches the latency is averaged
+   K controls how large the nested loop is (no larger than V)
+
+    For each launch kind,
+    1. Avg functor dispatch latency: (time to do M launches) / M
+    2. Avg functor completion throughput: (M launches + sync) / M
+    3. Avg functor completion latency: (M (launch + sync)) / M
+*/
+
+#include <Kokkos_Core.hpp>
+
+// Functor used for the parallel_for measurements.
+//
+// V is the number of doubles carried inside the functor and therefore sets
+// the functor's size. K is the length of the inner loop executed per index;
+// it indexes into values, so it must not exceed V.
+template <int V>
+struct TestFunctor {
+  // Zero-initialised: operator() reads these entries, and reading
+  // indeterminate values would be undefined behaviour.
+  double values[V] = {};
+  Kokkos::View<double*> a;
+  int K;
+  TestFunctor(Kokkos::View<double*> a_, int K_) : a(a_), K(K_) {}
+
+  // Adds K products to a(i).
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j];
+  }
+};
+
+// Functor used for the parallel_reduce measurements.
+//
+// V is the number of doubles carried inside the functor and therefore sets
+// the functor's size. K is the length of the inner loop executed per index;
+// it indexes into values, so it must not exceed V.
+template <int V>
+struct TestRFunctor {
+  // Zero-initialised: operator() reads these entries, and reading
+  // indeterminate values would be undefined behaviour.
+  double values[V] = {};
+  Kokkos::View<double*> a;
+  int K;
+  TestRFunctor(Kokkos::View<double*> a_, int K_) : a(a_), K(K_) {}
+
+  // Adds K products to a(i), then accumulates the updated a(i) into lsum.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i, double& lsum) const {
+    for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j];
+    lsum += a(i);
+  }
+};
+
+// Selects which measurements run() performs. Everything is enabled by default;
+// main() clears a flag when the matching --no-* option is given.
+struct Opts {
+  bool par_for         = true;  // measure parallel_for
+  bool par_reduce      = true;  // measure parallel_reduce into a scalar
+  bool par_reduce_view = true;  // measure parallel_reduce into a view
+};
+
+// Measures launch latency for a functor that carries V doubles and prints one
+// result line.
+//
+// N    : iteration count of every kernel
+// M    : number of launches each measurement is averaged over
+// K    : inner loop length handed to the functors (must not exceed V)
+// opts : selects which of the three launch kinds are measured
+//
+// For every enabled launch kind three times are printed, in microseconds per
+// launch: the launch loop alone, the launch+fence loop, and (in parentheses)
+// the launch loop followed by a single fence.
+template <int V>
+void run(int N, int M, int K, const Opts& opts) {
+  // Kernel labels encode the problem size. They are built without a trailing
+  // newline so that each label stays a single clean token.
+  const auto make_label = [N, K](const char* prefix) {
+    std::ostringstream ostream;
+    ostream << prefix << N << "_" << K;
+    return ostream.str();
+  };
+  const std::string l_no_fence          = make_label("RunNoFence_");
+  const std::string l_fence             = make_label("RunFence_");
+  const std::string l_red_no_fence      = make_label("RunReduceNoFence_");
+  const std::string l_red_fence         = make_label("RunReduceFence_");
+  const std::string l_red_view_no_fence = make_label("RunReduceViewNoFence_");
+  const std::string l_red_view_fence    = make_label("RunReduceViewFence_");
+
+  double result = 0;
+  Kokkos::View<double*> a("A", N);
+  Kokkos::View<double> v_result("result");
+  TestFunctor<V> f(a, K);
+  TestRFunctor<V> rf(a, K);
+  Kokkos::Timer timer;
+
+  // initialize to an obviously wrong value
+  double time_no_fence        = -1;  // launch loop
+  double time_no_fence_fenced = -1;  // launch loop then fence
+  double time_fence           = -1;  // launch&fence loop
+
+  double time_red_no_fence        = -1;
+  double time_red_no_fence_fenced = -1;
+  double time_red_fence           = -1;
+
+  double time_red_view_no_fence        = -1;
+  double time_red_view_no_fence_fenced = -1;
+  double time_red_view_fence           = -1;
+
+  if (opts.par_for) {
+    // warmup
+    for (int i = 0; i < 4; ++i) {
+      Kokkos::parallel_for(l_no_fence, N, f);
+    }
+    Kokkos::fence();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_for(l_no_fence, N, f);
+    }
+    time_no_fence = timer.seconds();
+    Kokkos::fence();
+    time_no_fence_fenced = timer.seconds();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_for(l_fence, N, f);
+      Kokkos::fence();
+    }
+    time_fence = timer.seconds();
+  }
+
+  if (opts.par_reduce) {
+    // warmup
+    for (int i = 0; i < 4; ++i) {
+      Kokkos::parallel_reduce(l_red_no_fence, N, rf, result);
+    }
+    Kokkos::fence();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_reduce(l_red_no_fence, N, rf, result);
+    }
+    time_red_no_fence = timer.seconds();
+    Kokkos::fence();
+    time_red_no_fence_fenced = timer.seconds();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_reduce(l_red_fence, N, rf, result);
+      Kokkos::fence();
+    }
+    time_red_fence = timer.seconds();
+    Kokkos::fence();
+  }
+
+  if (opts.par_reduce_view) {
+    // warmup
+    for (int i = 0; i < 4; ++i) {
+      Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result);
+    }
+    Kokkos::fence();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result);
+    }
+    time_red_view_no_fence = timer.seconds();
+    Kokkos::fence();
+    time_red_view_no_fence_fenced = timer.seconds();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_reduce(l_red_view_fence, N, rf, v_result);
+      Kokkos::fence();
+    }
+    time_red_view_fence = timer.seconds();
+    Kokkos::fence();
+    timer.reset();
+  }
+
+  // converts a total time in seconds into microseconds per launch
+  const double x = 1.e6 / M;
+  printf("%i %i %i %i", N, V, K, M);
+  if (opts.par_for) {
+    printf(" parallel_for: %lf %lf ( %lf )", x * time_no_fence, x * time_fence,
+           x * time_no_fence_fenced);
+  }
+  if (opts.par_reduce) {
+    printf(" parallel_reduce: %lf %lf ( %lf )", x * time_red_no_fence,
+           x * time_red_fence, x * time_red_no_fence_fenced);
+  }
+  if (opts.par_reduce_view) {
+    printf(" parallel_reduce(view): %lf %lf ( %lf )",
+           x * time_red_view_no_fence, x * time_red_view_fence,
+           x * time_red_view_no_fence_fenced);
+  }
+  printf("\n");
+}
+// Benchmark driver. Prints the usage text, reads the positional arguments
+// N, M and K (their meaning is taken from their position on the command line,
+// so they have to come before any option) and the --no-* options, then runs
+// the benchmark for a range of functor sizes.
+//
+// Throws std::runtime_error for an unknown option, for a positional argument
+// that is not a non-negative integer, for more than three positional
+// arguments, and for M < 1.
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  {
+    int N = 10000;
+    int M = 20;
+    int K = 1;
+
+    Opts opts;
+
+    printf("==========================\n");
+    printf("Kokkos Launch Latency Test\n");
+    printf("==========================\n");
+    printf("\n");
+    printf("Usage: %s ARGUMENTS [OPTIONS...]\n\n", argv[0]);
+    printf("Arguments: N M K\n");
+    printf("  N: loop length\n");
+    printf("  M: how many kernels to dispatch\n");
+    printf(
+        "  K: nested loop length (capped by size of functor member array\n\n");
+    printf("Options:\n");
+    printf("  --no-parallel-for:         skip parallel_for benchmark\n");
+    printf("  --no-parallel-reduce:      skip parallel_reduce benchmark\n");
+    printf(
+        "  --no-parallel-reduce-view: skip parallel_reduce into view "
+        "benchmark\n");
+    printf("\n\n");
+    printf("  Output V is the size of the functor member array\n");
+    printf("\n\n");
+
+    for (int i = 1; i < argc; ++i) {
+      const std::string_view arg(argv[i]);
+
+      // anything that doesn't start with -- is a positional argument
+      if (arg.substr(0, 2) != "--") {
+        if (i > 3) {
+          throw std::runtime_error("unexpected argument!");
+        }
+
+        // strtol, unlike atoi, reports where parsing stopped, so text such as
+        // "-h" or "12abc" is rejected instead of silently becoming a number
+        char* end        = nullptr;
+        const long value = strtol(argv[i], &end, 10);
+        const int parsed = static_cast<int>(value);
+        // parsed != value catches numbers that do not fit into an int
+        if (end == argv[i] || *end != '\0' || value < 0 || parsed != value) {
+          std::stringstream ss;
+          ss << "invalid numeric argument \"" << arg << "\" at position " << i;
+          throw std::runtime_error(ss.str());
+        }
+
+        if (i == 1)
+          N = parsed;
+        else if (i == 2)
+          M = parsed;
+        else
+          K = parsed;
+      } else if (arg == "--no-parallel-for") {
+        opts.par_for = false;
+      } else if (arg == "--no-parallel-reduce") {
+        opts.par_reduce = false;
+      } else if (arg == "--no-parallel-reduce-view") {
+        opts.par_reduce_view = false;
+      } else {
+        std::stringstream ss;
+        ss << "unexpected argument \"" << arg << "\" at position " << i;
+        throw std::runtime_error(ss.str());
+      }
+    }
+
+    // run() averages over M launches (it divides by M), so M = 0 is useless
+    if (M < 1) {
+      throw std::runtime_error("M must be at least 1");
+    }
+
+    printf("N V K M time_no_fence time_fence (time_no_fence_fenced)\n");
+
+    /* A backend may have different launch strategies for functors of different
+     * sizes: test a variety of functor sizes.*/
+    // K is capped at V because the functors index their V-element array with
+    // the inner loop counter.
+    run<1>(N, M, K <= 1 ? K : 1, opts);
+    run<16>(N, M, K <= 16 ? K : 16, opts);
+    run<200>(N, M, K <= 200 ? K : 200, opts);
+    run<3000>(N, M, K <= 3000 ? K : 3000, opts);
+    run<30000>(N, M, K <= 30000 ? K : 30000, opts);
+  }
+  Kokkos::finalize();
+}
--- a/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt
+++ b/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt
@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  policy_performance
+  SOURCES main.cpp
+)
--- a/lib/kokkos/benchmarks/policy_performance/main.cpp
+++ b/lib/kokkos/benchmarks/policy_performance/main.cpp
@ -106,8 +106,9 @@ int main(int argc, char* argv[]) {

  Kokkos::parallel_reduce(
      "parallel_reduce warmup", Kokkos::TeamPolicy<>(10, 1),
-      KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team,
-                    double& lval) { lval += 1; },
+      KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type&, double& lval) {
+        lval += 1;
+      },
      result);

  using view_type_1d = Kokkos::View<double*, Kokkos::LayoutRight>;
--- a/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp
+++ b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp
@ -21,13 +21,13 @@ struct ParallelScanFunctor {
  using value_type = double;
  ViewType v;

-  ParallelScanFunctor(const ViewType& v_) : v(v_) {}
+  explicit ParallelScanFunctor(const ViewType& v_) : v(v_) {}

  KOKKOS_INLINE_FUNCTION
-  void operator()(const int idx, value_type& val, const bool& final) const {
+  void operator()(const int idx, value_type& val, const bool& is_final) const {
    // inclusive scan
    val += v(idx);
-    if (final) {
+    if (is_final) {
      v(idx) = val;
    }
  }
@ -109,7 +109,7 @@ void test_policy(int team_range, int thread_range, int vector_range,
                      vector_result = 0.0;
                      Kokkos::parallel_reduce(
                          Kokkos::ThreadVectorRange(team, vector_range),
-                          [&](const int vi, double& vval) { vval += 1; },
+                          [&](const int, double& vval) { vval += 1; },
                          vector_result);
                    }
                    v2(idx, t) = vector_result;
@ -128,7 +128,7 @@ void test_policy(int team_range, int thread_range, int vector_range,
              team_result = 0.0;
              Kokkos::parallel_reduce(
                  Kokkos::TeamThreadRange(team, thread_range),
-                  [&](const int t, double& lval) { lval += 1; }, team_result);
+                  [&](const int, double& lval) { lval += 1; }, team_result);
            }
            v1(idx) = team_result;
            // prevent compiler optimizing loop away
@ -170,13 +170,13 @@ void test_policy(int team_range, int thread_range, int vector_range,
            for (int tr = 0; tr < thread_repeat; ++tr) {
              Kokkos::parallel_reduce(
                  Kokkos::TeamThreadRange(team, thread_range),
-                  [&](const int t, double& lval) {
+                  [&](const int, double& lval) {
                    double vector_result = 0.0;
                    for (int vr = 0; vr < inner_repeat; ++vr) {
                      vector_result = 0.0;
                      Kokkos::parallel_reduce(
                          Kokkos::ThreadVectorRange(team, vector_range),
-                          [&](const int vi, double& vval) { vval += 1; },
+                          [&](const int, double& vval) { vval += 1; },
                          vector_result);
                      lval += vector_result;
                    }
--- a/Show More
+++ b/Show More