diff --git a/SECURITY.md b/SECURITY.md
index 9f65e2f88e..f06b781d11 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -23,6 +23,10 @@ either a user mistake or a bug in the code.  Bugs can be reported in
 the LAMMPS project
 [issue tracker on GitHub](https://github.com/lammps/lammps/issues).
 
+To mitigate issues with using homoglyphs or bidirectional reordering in
+unicode, which have been demonstrated as a vector to obfuscate and hide
+malicious changes to the source code, all LAMMPS submissions are checked
+for unicode characters and only all-ASCII source code is accepted.
 
 # Version Updates
 
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 101e0e13d3..b5f8db93d2 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -133,10 +133,7 @@ endif()
 set(LAMMPS_BINARY lmp${LAMMPS_MACHINE})
 
 option(BUILD_SHARED_LIBS "Build shared library" OFF)
-if(BUILD_SHARED_LIBS) # for all pkg libs, mpi_stubs and linalg
-  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-endif()
-
+option(CMAKE_POSITION_INDEPENDENT_CODE "Create object compatible with shared libraries" ON)
 option(BUILD_TOOLS "Build and install LAMMPS tools (msi2lmp, binary2txt, chain)" OFF)
 option(BUILD_LAMMPS_SHELL "Build and install the LAMMPS shell" OFF)
 
@@ -304,10 +301,12 @@ else()
       target_link_libraries(lmp PRIVATE mpi_stubs)
       target_include_directories(lmp INTERFACE $<BUILD_INTERFACE:${LAMMPS_SOURCE_DIR}/STUBS>)
       target_compile_definitions(lmp INTERFACE $<INSTALL_INTERFACE:LAMMPS_LIB_NO_MPI>)
-    endif(MSVC)
+    endif()
     target_include_directories(lammps INTERFACE $<BUILD_INTERFACE:${LAMMPS_SOURCE_DIR}/STUBS>)
     target_compile_definitions(lammps INTERFACE $<INSTALL_INTERFACE:LAMMPS_LIB_NO_MPI>)
   else()
+    target_include_directories(lammps INTERFACE $<BUILD_INTERFACE:${LAMMPS_SOURCE_DIR}/STUBS>)
+    target_compile_definitions(lammps INTERFACE $<INSTALL_INTERFACE:LAMMPS_LIB_NO_MPI>)
     target_link_libraries(lammps PUBLIC mpi_stubs)
   endif()
   add_library(MPI::MPI_CXX ALIAS mpi_stubs)
@@ -341,7 +340,6 @@ pkg_depends(ML-IAP ML-SNAP)
 pkg_depends(MPIIO MPI)
 pkg_depends(ATC MANYBODY)
 pkg_depends(LATBOLTZ MPI)
-pkg_depends(PHONON KSPACE)
 pkg_depends(SCAFACOS MPI)
 pkg_depends(DIELECTRIC KSPACE)
 pkg_depends(DIELECTRIC EXTRA-PAIR)
@@ -611,7 +609,7 @@ endif()
 # packages which selectively include variants based on enabled styles
 # e.g. accelerator packages
 ######################################################################
-foreach(PKG_WITH_INCL CORESHELL QEQ OPENMP DPD-SMOOTH KOKKOS OPT INTEL GPU)
+foreach(PKG_WITH_INCL CORESHELL DPD-SMOOTH PHONON QEQ OPENMP KOKKOS OPT INTEL GPU)
   if(PKG_${PKG_WITH_INCL})
     include(Packages/${PKG_WITH_INCL})
   endif()
@@ -810,11 +808,17 @@ if(ClangFormat_FOUND)
 endif()
 
 get_target_property(DEFINES lammps COMPILE_DEFINITIONS)
+get_property(BUILD_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+if(BUILD_IS_MULTI_CONFIG)
+  set(LAMMPS_BUILD_TYPE "Multi-Config")
+else()
+  set(LAMMPS_BUILD_TYPE ${CMAKE_BUILD_TYPE})
+endif()
 include(FeatureSummary)
 feature_summary(DESCRIPTION "The following tools and libraries have been found and configured:" WHAT PACKAGES_FOUND)
 message(STATUS "<<< Build configuration >>>
    Operating System: ${CMAKE_SYSTEM_NAME} ${CMAKE_LINUX_DISTRO} ${CMAKE_DISTRO_VERSION}
-   Build type:       ${CMAKE_BUILD_TYPE}
+   Build type:       ${LAMMPS_BUILD_TYPE}
    Install path:     ${CMAKE_INSTALL_PREFIX}
    Generator:        ${CMAKE_GENERATOR} using ${CMAKE_MAKE_PROGRAM}")
 ###############################################################################
diff --git a/cmake/CMakeSettings.json b/cmake/CMakeSettings.json
index dada2f6752..ee4b3c46d5 100644
--- a/cmake/CMakeSettings.json
+++ b/cmake/CMakeSettings.json
@@ -1,55 +1,111 @@
 ﻿{
-    "configurations": [
+  "configurations": [
+    {
+      "name": "x64-Debug-MSVC",
+      "generator": "Ninja",
+      "configurationType": "Debug",
+      "buildRoot": "${workspaceRoot}\\build\\${name}",
+      "installRoot": "${workspaceRoot}\\install\\${name}",
+      "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake -DENABLE_TESTING=on",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "variables": [
         {
-            "name": "x64-Debug-MSVC",
-            "generator": "Ninja",
-            "configurationType": "Debug",
-            "buildRoot": "${workspaceRoot}\\build\\${name}",
-            "installRoot": "${workspaceRoot}\\install\\${name}",
-            "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake",
-            "buildCommandArgs": "",
-            "ctestCommandArgs": "",
-            "inheritEnvironments": [ "msvc_x64_x64" ],
-            "variables": [
-                {
-                    "name": "BUILD_SHARED_LIBS",
-                    "value": "True",
-                    "type": "BOOL"
-                },
-                {
-                    "name": "BUILD_TOOLS",
-                    "value": "True",
-                    "type": "BOOL"
-                },
-                {
-                    "name": "LAMMPS_EXCEPTIONS",
-                    "value": "True",
-                    "type": "BOOL"
-                }
-            ]
+          "name": "BUILD_SHARED_LIBS",
+          "value": "True",
+          "type": "BOOL"
         },
         {
-            "name": "x64-Debug-Clang",
-            "generator": "Ninja",
-            "configurationType": "Debug",
-            "buildRoot": "${workspaceRoot}\\build\\${name}",
-            "installRoot": "${workspaceRoot}\\install\\${name}",
-            "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake",
-            "buildCommandArgs": "",
-            "ctestCommandArgs": "",
-            "inheritEnvironments": [ "clang_cl_x64" ],
-            "variables": [
-                {
-                    "name": "BUILD_TOOLS",
-                    "value": "True",
-                    "type": "BOOL"
-                },
-                {
-                    "name": "LAMMPS_EXCEPTIONS",
-                    "value": "True",
-                    "type": "BOOL"
-                }
-            ]
+          "name": "BUILD_TOOLS",
+          "value": "True",
+          "type": "BOOL"
+        },
+        {
+          "name": "LAMMPS_EXCEPTIONS",
+          "value": "True",
+          "type": "BOOL"
         }
-    ]
+      ]
+    },
+    {
+      "name": "x64-Debug-Clang",
+      "generator": "Ninja",
+      "configurationType": "Debug",
+      "buildRoot": "${workspaceRoot}\\build\\${name}",
+      "installRoot": "${workspaceRoot}\\install\\${name}",
+      "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake -DENABLE_TESTING=on",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "clang_cl_x64" ],
+      "variables": [
+        {
+          "name": "BUILD_TOOLS",
+          "value": "True",
+          "type": "BOOL"
+        },
+        {
+          "name": "LAMMPS_EXCEPTIONS",
+          "value": "True",
+          "type": "BOOL"
+        }
+      ]
+    },
+    {
+      "name": "x64-Debug-OneAPI",
+      "generator": "Ninja",
+      "configurationType": "Debug",
+      "buildRoot": "${workspaceRoot}\\build\\${name}",
+      "installRoot": "${workspaceRoot}\\install\\${name}",
+      "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake -DENABLE_TESTING=on -DCMAKE_CXX_COMPILER=icx -DCMAKE_C_COMPILER=icx -DBUILD_MPI=off",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "variables": [
+        {
+          "name": "BUILD_SHARED_LIBS",
+          "value": "True",
+          "type": "BOOL"
+        },
+        {
+          "name": "BUILD_TOOLS",
+          "value": "True",
+          "type": "BOOL"
+        },
+        {
+          "name": "LAMMPS_EXCEPTIONS",
+          "value": "True",
+          "type": "BOOL"
+        }
+      ]
+    },
+    {
+      "name": "x64-Debug-Intel",
+      "generator": "Ninja",
+      "configurationType": "Debug",
+      "buildRoot": "${workspaceRoot}\\build\\${name}",
+      "installRoot": "${workspaceRoot}\\install\\${name}",
+      "cmakeCommandArgs": "-S ${workspaceRoot}\\cmake -C ${workspaceRoot}\\cmake\\presets\\windows.cmake -DENABLE_TESTING=off -DCMAKE_CXX_COMPILER=icl -DCMAKE_C_COMPILER=icl -DCMAKE_Fortran_COMPILER=ifort -DBUILD_MPI=off",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": "",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "variables": [
+        {
+          "name": "BUILD_SHARED_LIBS",
+          "value": "True",
+          "type": "BOOL"
+        },
+        {
+          "name": "BUILD_TOOLS",
+          "value": "True",
+          "type": "BOOL"
+        },
+        {
+          "name": "LAMMPS_EXCEPTIONS",
+          "value": "True",
+          "type": "BOOL"
+        }
+      ]
+    }
+  ]
 }
\ No newline at end of file
diff --git a/cmake/Modules/ExternalCMakeProject.cmake b/cmake/Modules/ExternalCMakeProject.cmake
new file mode 100644
index 0000000000..855ce254c9
--- /dev/null
+++ b/cmake/Modules/ExternalCMakeProject.cmake
@@ -0,0 +1,33 @@
+# Build a CMake based external library as subdirectory.
+# The sources will be unpacked to ${CMAKE_BINARY_DIR}/_deps/${target}-src
+# The binaries will be built in ${CMAKE_BINARY_DIR}/_deps/${target}-build
+#
+function(ExternalCMakeProject target url hash basedir cmakedir cmakefile)
+  # change settings locally
+  set(BUILD_SHARED_LIBS OFF)
+  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+  get_filename_component(archive ${url} NAME)
+  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/_deps/src)
+  message(STATUS "Downloading ${url}")
+  file(DOWNLOAD ${url} ${CMAKE_BINARY_DIR}/_deps/${archive} EXPECTED_HASH MD5=${hash} SHOW_PROGRESS)
+  message(STATUS "Unpacking and configuring ${archive}")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_BINARY_DIR}/_deps/${archive}
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/_deps/src)
+  file(GLOB TARGET_SOURCE "${CMAKE_BINARY_DIR}/_deps/src/${basedir}*")
+  list(LENGTH TARGET_SOURCE _num)
+  if(_num GREATER 1)
+    message(FATAL_ERROR "Inconsistent ${target} library sources. "
+      "Please delete ${CMAKE_BINARY_DIR}/_deps/src and re-run cmake")
+  endif()
+  file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/_deps/${target}-src)
+  file(RENAME ${TARGET_SOURCE} ${CMAKE_BINARY_DIR}/_deps/${target}-src)
+  if(NOT (cmakefile STREQUAL ""))
+    file(COPY ${cmakefile} DESTINATION ${CMAKE_BINARY_DIR}/_deps/${target}-src/${cmakedir}/)
+    get_filename_component(_cmakefile ${cmakefile} NAME)
+    file(RENAME "${CMAKE_BINARY_DIR}/_deps/${target}-src/${cmakedir}/${_cmakefile}"
+      "${CMAKE_BINARY_DIR}/_deps/${target}-src/${cmakedir}/CMakeLists.txt")
+  endif()
+  add_subdirectory("${CMAKE_BINARY_DIR}/_deps/${target}-src/${cmakedir}"
+    "${CMAKE_BINARY_DIR}/_deps/${target}-build")
+endfunction(ExternalCMakeProject)
diff --git a/cmake/Modules/GTest.cmake b/cmake/Modules/GTest.cmake
deleted file mode 100644
index e012e61ea9..0000000000
--- a/cmake/Modules/GTest.cmake
+++ /dev/null
@@ -1,81 +0,0 @@
-message(STATUS "Downloading and building Google Test library")
-
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  set(GTEST_LIB_POSTFIX d)
-else()
-  set(GTEST_LIB_POSTFIX)
-endif()
-
-include(ExternalProject)
-set(GTEST_URL "https://github.com/google/googletest/archive/release-1.11.0.tar.gz" CACHE STRING "URL of googletest source")
-set(GTEST_MD5 "e8a8df240b6938bb6384155d4c37d937" CACHE STRING "MD5 sum for googletest source")
-mark_as_advanced(GTEST_URL)
-mark_as_advanced(GTEST_MD5)
-ExternalProject_Add(googletest
-                    URL             ${GTEST_URL}
-                    URL_MD5         ${GTEST_MD5}
-                    SOURCE_DIR      "${CMAKE_BINARY_DIR}/gtest-src"
-                    BINARY_DIR      "${CMAKE_BINARY_DIR}/gtest-build"
-                    CMAKE_ARGS      ${CMAKE_REQUEST_PIC} ${CMAKE_EXTRA_GTEST_OPTS}
-                                    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                                    -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
-                                    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-                                    -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
-                                    -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}
-                    BUILD_BYPRODUCTS <BINARY_DIR>/lib/libgtest${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}
-                                     <BINARY_DIR>/lib/libgmock${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}
-                                     <BINARY_DIR>/lib/libgtest_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}
-                                     <BINARY_DIR>/lib/libgmock_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}
-                    LOG_DOWNLOAD ON
-                    LOG_CONFIGURE ON
-                    LOG_BUILD ON
-                    INSTALL_COMMAND ""
-                    TEST_COMMAND    "")
-
-ExternalProject_Get_Property(googletest SOURCE_DIR)
-set(GTEST_INCLUDE_DIR ${SOURCE_DIR}/googletest/include)
-set(GMOCK_INCLUDE_DIR ${SOURCE_DIR}/googlemock/include)
-
-# workaround for CMake 3.10 on ubuntu 18.04
-file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIR})
-file(MAKE_DIRECTORY ${GMOCK_INCLUDE_DIR})
-
-ExternalProject_Get_Property(googletest BINARY_DIR)
-set(GTEST_LIBRARY_PATH ${BINARY_DIR}/lib/libgtest${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX})
-set(GMOCK_LIBRARY_PATH ${BINARY_DIR}/lib/libgmock${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX})
-set(GTEST_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/libgtest_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX})
-set(GMOCK_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/libgmock_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX})
-
-# Prevent GoogleTest from overriding our compiler/linker options
-# when building with Visual Studio
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-
-find_package(Threads QUIET)
-
-add_library(GTest::GTest UNKNOWN IMPORTED)
-set_target_properties(GTest::GTest PROPERTIES
-        IMPORTED_LOCATION ${GTEST_LIBRARY_PATH}
-        INTERFACE_INCLUDE_DIRECTORIES ${GTEST_INCLUDE_DIR}
-        INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
-add_dependencies(GTest::GTest googletest)
-
-add_library(GTest::GMock UNKNOWN IMPORTED)
-set_target_properties(GTest::GMock PROPERTIES
-        IMPORTED_LOCATION ${GMOCK_LIBRARY_PATH}
-        INTERFACE_INCLUDE_DIRECTORIES ${GMOCK_INCLUDE_DIR}
-        INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
-add_dependencies(GTest::GMock googletest)
-
-add_library(GTest::GTestMain UNKNOWN IMPORTED)
-set_target_properties(GTest::GTestMain PROPERTIES
-        IMPORTED_LOCATION ${GTEST_MAIN_LIBRARY_PATH}
-        INTERFACE_INCLUDE_DIRECTORIES ${GTEST_INCLUDE_DIR}
-        INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
-add_dependencies(GTest::GTestMain googletest)
-
-add_library(GTest::GMockMain UNKNOWN IMPORTED)
-set_target_properties(GTest::GMockMain PROPERTIES
-        IMPORTED_LOCATION ${GMOCK_MAIN_LIBRARY_PATH}
-        INTERFACE_INCLUDE_DIRECTORIES ${GMOCK_INCLUDE_DIR}
-        INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
-add_dependencies(GTest::GMockMain googletest)
diff --git a/cmake/Modules/LAMMPSUtils.cmake b/cmake/Modules/LAMMPSUtils.cmake
index 28ad99fa31..943c3d851e 100644
--- a/cmake/Modules/LAMMPSUtils.cmake
+++ b/cmake/Modules/LAMMPSUtils.cmake
@@ -25,7 +25,7 @@ function(validate_option name values)
 endfunction(validate_option)
 
 function(get_lammps_version version_header variable)
-    file(READ ${version_header} line)
+    file(STRINGS ${version_header} line REGEX LAMMPS_VERSION)
     set(MONTHS x Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)
     string(REGEX REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" "\\1" day "${line}")
     string(REGEX REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" "\\2" month "${line}")
diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake
index a57715d294..048c0ed473 100644
--- a/cmake/Modules/Packages/GPU.cmake
+++ b/cmake/Modules/Packages/GPU.cmake
@@ -306,12 +306,12 @@ elseif(GPU_API STREQUAL "HIP")
 
         if(HIP_COMPILER STREQUAL "clang")
           add_custom_command(OUTPUT ${CUBIN_FILE}
-            VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco --offload-arch=${HIP_ARCH} -O3 -ffast-math -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu -o ${CUBIN_FILE} ${CU_CPP_FILE}
+            VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco --offload-arch=${HIP_ARCH} -O3 -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu -o ${CUBIN_FILE} ${CU_CPP_FILE}
             DEPENDS ${CU_CPP_FILE}
             COMMENT "Generating ${CU_NAME}.cubin")
         else()
           add_custom_command(OUTPUT ${CUBIN_FILE}
-            VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco -t="${HIP_ARCH}" -f=\"-O3 -ffast-math -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu\" -o ${CUBIN_FILE} ${CU_CPP_FILE}
+            VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco -t="${HIP_ARCH}" -f=\"-O3 -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu\" -o ${CUBIN_FILE} ${CU_CPP_FILE}
             DEPENDS ${CU_CPP_FILE}
             COMMENT "Generating ${CU_NAME}.cubin")
         endif()
diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake
index fe6c17801e..25211268e9 100644
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@@ -39,8 +39,8 @@ if(DOWNLOAD_KOKKOS)
   list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
   list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
   include(ExternalProject)
-  set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.4.01.tar.gz" CACHE STRING "URL for KOKKOS tarball")
-  set(KOKKOS_MD5 "4c84698917c93a18985b311bb6caf84f" CACHE STRING "MD5 checksum of KOKKOS tarball")
+  set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.5.00.tar.gz" CACHE STRING "URL for KOKKOS tarball")
+  set(KOKKOS_MD5 "079323d973ae0e1c38c0a54a150c674e" CACHE STRING "MD5 checksum of KOKKOS tarball")
   mark_as_advanced(KOKKOS_URL)
   mark_as_advanced(KOKKOS_MD5)
   ExternalProject_Add(kokkos_build
@@ -60,7 +60,7 @@ if(DOWNLOAD_KOKKOS)
   target_link_libraries(lmp PRIVATE LAMMPS::KOKKOS)
   add_dependencies(LAMMPS::KOKKOS kokkos_build)
 elseif(EXTERNAL_KOKKOS)
-  find_package(Kokkos 3.4.01 REQUIRED CONFIG)
+  find_package(Kokkos 3.5.00 REQUIRED CONFIG)
   target_link_libraries(lammps PRIVATE Kokkos::kokkos)
   target_link_libraries(lmp PRIVATE Kokkos::kokkos)
 else()
diff --git a/cmake/Modules/Packages/ML-QUIP.cmake b/cmake/Modules/Packages/ML-QUIP.cmake
index 92418e8939..947c555842 100644
--- a/cmake/Modules/Packages/ML-QUIP.cmake
+++ b/cmake/Modules/Packages/ML-QUIP.cmake
@@ -32,7 +32,8 @@ if(DOWNLOAD_QUIP)
   foreach(flag ${LAPACK_LIBRARIES})
     set(temp "${temp} ${flag}")
   endforeach()
-  set(temp "${temp}\n")
+  # Fix cmake crashing when MATH_LINKOPTS not set, required for e.g. recent Cray Programming Environment
+  set(temp "${temp} -L/_DUMMY_PATH_\n")
   set(temp "${temp}PYTHON=python\nPIP=pip\nEXTRA_LINKOPTS=\n")
   set(temp "${temp}HAVE_CP2K=0\nHAVE_VASP=0\nHAVE_TB=0\nHAVE_PRECON=1\nHAVE_LOTF=0\nHAVE_ONIOM=0\n")
   set(temp "${temp}HAVE_LOCAL_E_MIX=0\nHAVE_QC=0\nHAVE_GAP=1\nHAVE_DESCRIPTORS_NONCOMMERCIAL=1\n")
diff --git a/cmake/Modules/Packages/MSCG.cmake b/cmake/Modules/Packages/MSCG.cmake
index cf3d506c82..e4260e059e 100644
--- a/cmake/Modules/Packages/MSCG.cmake
+++ b/cmake/Modules/Packages/MSCG.cmake
@@ -12,41 +12,12 @@ if(DOWNLOAD_MSCG)
   mark_as_advanced(MSCG_URL)
   mark_as_advanced(MSCG_MD5)
 
-  # CMake cannot pass BLAS or LAPACK library variable to external project if they are a list
-  list(LENGTH BLAS_LIBRARIES} NUM_BLAS)
-  list(LENGTH LAPACK_LIBRARIES NUM_LAPACK)
-  if((NUM_BLAS GREATER 1) OR (NUM_LAPACK GREATER 1))
-    message(FATAL_ERROR "Cannot compile downloaded MSCG library due to a technical limitation")
-  endif()
+  include(ExternalCMakeProject)
+  ExternalCMakeProject(mscg ${MSCG_URL} ${MSCG_MD5} MSCG-release src/CMake "")
 
-  include(ExternalProject)
-  ExternalProject_Add(mscg_build
-    URL     ${MSCG_URL}
-    URL_MD5 ${MSCG_MD5}
-    SOURCE_SUBDIR src/CMake
-    CMAKE_ARGS ${CMAKE_REQUEST_PIC} ${EXTRA_MSCG_OPTS}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}
-               -DBLAS_LIBRARIES=${BLAS_LIBRARIES} -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES}
-               -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
-               -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-               -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
-               -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}
-    BUILD_COMMAND ${CMAKE_COMMAND} --build . --target mscg
-    INSTALL_COMMAND ""
-    BUILD_BYPRODUCTS <BINARY_DIR>/libmscg.a
-    )
-  ExternalProject_get_property(mscg_build BINARY_DIR)
-  ExternalProject_get_property(mscg_build SOURCE_DIR)
-  file(MAKE_DIRECTORY ${SOURCE_DIR}/src)
-  add_library(LAMMPS::MSCG UNKNOWN IMPORTED)
-  set_target_properties(LAMMPS::MSCG PROPERTIES
-    IMPORTED_LOCATION "${BINARY_DIR}/libmscg.a"
-    INTERFACE_INCLUDE_DIRECTORIES "${SOURCE_DIR}/src"
-    INTERFACE_LINK_LIBRARIES "${LAPACK_LIBRARIES}")
-  target_link_libraries(lammps PRIVATE LAMMPS::MSCG)
-  add_dependencies(LAMMPS::MSCG mscg_build)
+  # set include and link library
+  target_include_directories(lammps PRIVATE "${CMAKE_BINARY_DIR}/_deps/mscg-src/src")
+  target_link_libraries(lammps PRIVATE mscg)
 else()
   find_package(MSCG)
   if(NOT MSCG_FOUND)
diff --git a/cmake/Modules/Packages/PHONON.cmake b/cmake/Modules/Packages/PHONON.cmake
new file mode 100644
index 0000000000..3021868f68
--- /dev/null
+++ b/cmake/Modules/Packages/PHONON.cmake
@@ -0,0 +1,9 @@
+# fix phonon may only be installed if also the FFT wrappers from KSPACE are installed
+if(NOT PKG_KSPACE)
+  get_property(LAMMPS_FIX_HEADERS GLOBAL PROPERTY FIX)
+  list(REMOVE_ITEM LAMMPS_FIX_HEADERS ${LAMMPS_SOURCE_DIR}/PHONON/fix_phonon.h)
+  set_property(GLOBAL PROPERTY FIX "${LAMMPS_FIX_HEADERS}")
+  get_target_property(LAMMPS_SOURCES lammps SOURCES)
+  list(REMOVE_ITEM LAMMPS_SOURCES ${LAMMPS_SOURCE_DIR}/PHONON/fix_phonon.cpp)
+  set_property(TARGET lammps PROPERTY SOURCES "${LAMMPS_SOURCES}")
+endif()
diff --git a/cmake/Modules/Packages/PLUMED.cmake b/cmake/Modules/Packages/PLUMED.cmake
index 0f063f3e14..6b832574ca 100644
--- a/cmake/Modules/Packages/PLUMED.cmake
+++ b/cmake/Modules/Packages/PLUMED.cmake
@@ -54,8 +54,8 @@ if(DOWNLOAD_PLUMED)
     set(PLUMED_BUILD_BYPRODUCTS "<INSTALL_DIR>/lib/libplumedWrapper.a")
   endif()
 
-  set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.7.2/plumed-src-2.7.2.tgz" CACHE STRING "URL for PLUMED tarball")
-  set(PLUMED_MD5 "cfa0b4dd90a81c25d3302e8d97bfeaea" CACHE STRING "MD5 checksum of PLUMED tarball")
+  set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.7.3/plumed-src-2.7.3.tgz" CACHE STRING "URL for PLUMED tarball")
+  set(PLUMED_MD5 "f00cc82edfefe6bb3df934911dbe32fb" CACHE STRING "MD5 checksum of PLUMED tarball")
 
   mark_as_advanced(PLUMED_URL)
   mark_as_advanced(PLUMED_MD5)
diff --git a/cmake/Modules/YAML.cmake b/cmake/Modules/YAML.cmake
deleted file mode 100644
index 77ee804111..0000000000
--- a/cmake/Modules/YAML.cmake
+++ /dev/null
@@ -1,47 +0,0 @@
-message(STATUS "Downloading and building YAML library")
-
-include(ExternalProject)
-set(YAML_URL "https://pyyaml.org/download/libyaml/yaml-0.2.5.tar.gz" CACHE STRING "URL for libyaml tarball")
-set(YAML_MD5 "bb15429d8fb787e7d3f1c83ae129a999" CACHE STRING "MD5 checksum of libyaml tarball")
-mark_as_advanced(YAML_URL)
-mark_as_advanced(YAML_MD5)
-
-# support cross-compilation to windows
-if(CMAKE_CROSSCOMPILING AND (CMAKE_SYSTEM_NAME STREQUAL "Windows"))
-  if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86")
-    set(YAML_CROSS_HOST --host=i686-mingw64)
-  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
-    set(YAML_CROSS_HOST --host=x86_64-mingw64)
-  else()
-    message(FATAL_ERROR "Unsupported cross-compilation "
-      " for ${CMAKE_SYSTEM_NAME}/${CMAKE_SYSTEM_PROCESSOR}"
-      " on ${CMAKE_HOST_SYSTEM}/${CMAKE_HOST_SYSTEM_PROCESSOR}")
-  endif()
-endif()
-
-ExternalProject_Add(libyaml
-                    URL               ${YAML_URL}
-                    URL_MD5           ${YAML_MD5}
-                    SOURCE_DIR        "${CMAKE_BINARY_DIR}/yaml-src"
-                    BINARY_DIR        "${CMAKE_BINARY_DIR}/yaml-build"
-                    CONFIGURE_COMMAND <SOURCE_DIR>/configure ${CONFIGURE_REQUEST_PIC}
-                                      CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER}
-                                      --prefix=<INSTALL_DIR> --disable-shared ${YAML_CROSS_HOST}
-                    BUILD_BYPRODUCTS  <INSTALL_DIR>/lib/libyaml${CMAKE_STATIC_LIBRARY_SUFFIX}
-                    TEST_COMMAND      "")
-
-ExternalProject_Get_Property(libyaml INSTALL_DIR)
-set(YAML_INCLUDE_DIR ${INSTALL_DIR}/include)
-set(YAML_LIBRARY_DIR ${INSTALL_DIR}/lib)
-
-# workaround for CMake 3.10 on ubuntu 18.04
-file(MAKE_DIRECTORY ${YAML_INCLUDE_DIR})
-file(MAKE_DIRECTORY ${YAML_LIBRARY_DIR})
-
-set(YAML_LIBRARY_PATH ${INSTALL_DIR}/lib/libyaml${CMAKE_STATIC_LIBRARY_SUFFIX})
-
-add_library(Yaml::Yaml UNKNOWN IMPORTED)
-set_target_properties(Yaml::Yaml PROPERTIES
-        IMPORTED_LOCATION ${YAML_LIBRARY_PATH}
-        INTERFACE_INCLUDE_DIRECTORIES ${YAML_INCLUDE_DIR})
-add_dependencies(Yaml::Yaml libyaml)
diff --git a/cmake/presets/most.cmake b/cmake/presets/most.cmake
index eb26b38928..27ce57621c 100644
--- a/cmake/presets/most.cmake
+++ b/cmake/presets/most.cmake
@@ -48,7 +48,6 @@ set(ALL_PACKAGES
   PHONON
   PLUGIN
   POEMS
-  PYTHON
   QEQ
   REACTION
   REAXFF
diff --git a/doc/Makefile b/doc/Makefile
index d61f844a1b..a082018dfb 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -230,7 +230,7 @@ $(VENV):
 	)
 
 $(MATHJAX):
-	@git clone -b 3.2.0 -c advice.detachedHead=0 --depth 1 git://github.com/mathjax/MathJax.git $@
+	@git clone -b 3.2.0 -c advice.detachedHead=0 --depth 1 https://github.com/mathjax/MathJax.git $@
 
 $(ANCHORCHECK): $(VENV)
 	@( \
diff --git a/doc/lammps.1 b/doc/lammps.1
index 78b6c9fd67..58086b1fae 100644
--- a/doc/lammps.1
+++ b/doc/lammps.1
@@ -1,4 +1,4 @@
-.TH LAMMPS "1" "27 October 2021" "2021-10-27"
+.TH LAMMPS "1" "7 January 2022" "2022-1-7"
 .SH NAME
 .B LAMMPS
 \- Molecular Dynamics Simulator.
diff --git a/doc/src/Bibliography.rst b/doc/src/Bibliography.rst
index 0256552332..9f3591dcde 100644
--- a/doc/src/Bibliography.rst
+++ b/doc/src/Bibliography.rst
@@ -1123,9 +1123,12 @@ Bibliography
 **(Sun)**
    Sun, J. Phys. Chem. B, 102, 7338-7364 (1998).
 
-**(Surblys)**
+**(Surblys2019)**
    Surblys, Matsubara, Kikugawa, Ohara, Phys Rev E, 99, 051301(R) (2019).
 
+**(Surblys2021)**
+   Surblys, Matsubara, Kikugawa, Ohara, J Appl Phys 130, 215104 (2021).
+
 **(Sutmann)**
    Sutmann, Arnold, Fahrenberger, et. al., Physical review / E 88(6), 063308 (2013)
 
diff --git a/doc/src/Build_cmake.rst b/doc/src/Build_cmake.rst
index 2a64bc3240..9bee18146c 100644
--- a/doc/src/Build_cmake.rst
+++ b/doc/src/Build_cmake.rst
@@ -150,6 +150,42 @@ for IDEs like Eclipse, CodeBlocks, or Kate can be selected using the *-G*
 command line flag.  A list of available generator settings for your
 specific CMake version is given when running ``cmake --help``.
 
+.. _cmake_multiconfig:
+
+Multi-configuration build systems
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Throughout this manual it is mostly assumed that LAMMPS is being built
+on a Unix-like operating system with "make" as the underlying "builder",
+since this is the most common case.  In this case the build "configuration"
+is chose using ``-D CMAKE_BUILD_TYPE=<configuration>`` with ``<configuration>``
+being one of "Release", "Debug", "RelWithDebInfo", or "MinSizeRel".
+Some build tools, however, can also use or even require to have a so-called
+multi-configuration build system setup.  For those the built type (or
+configuration) is chosen at compile time using the same build files. E.g.
+with:
+
+.. code-block:: bash
+
+   cmake --build build-multi --config Release
+
+In that case the resulting binaries are not in the build folder directly
+but in sub-directories corresponding to the build type (i.e. Release in
+the example from above).  Similarly, for running unit tests the
+configuration is selected with the *-C* flag:
+
+.. code-block:: bash
+
+   ctest -C Debug
+
+The CMake scripts in LAMMPS have basic support for being compiled using a
+multi-config build system, but not all of it has been ported.  This is in
+particular applicable to compiling packages that require additional libraries
+that would be downloaded and compiled by CMake.  The "windows" preset file
+tries to keep track of which packages can be compiled natively with the
+MSVC compilers out-of-the box.  Not all of those external libraries are
+portable to Windows either.
+
 
 Installing CMake
 ^^^^^^^^^^^^^^^^
diff --git a/doc/src/Build_development.rst b/doc/src/Build_development.rst
index 3c2acbaa7e..5492a1e536 100644
--- a/doc/src/Build_development.rst
+++ b/doc/src/Build_development.rst
@@ -185,6 +185,10 @@ The ``ctest`` command has many options, the most important ones are:
      - run subset of tests matching the regular expression <regex>
    * - -E <regex>
      - exclude subset of tests matching the regular expression <regex>
+   * - -L <regex>
+     - run subset of tests with a label matching the regular expression <regex>
+   * - -LE <regex>
+     - exclude subset of tests with a label matching the regular expression <regex>
    * - -N
      - dry-run: display list of tests without running them
    * - -T memcheck
@@ -299,6 +303,12 @@ will destroy the original file, if the generation run does not complete,
 so using *-g* is recommended unless the YAML file is fully tested
 and working.
 
+Some of the force style tests are rather slow to run and some are very
+sensitive to small differences like CPU architecture, compiler
+toolchain, compiler optimization. Those tests are flagged with a "slow"
+and/or "unstable" label, and thus those tests can be selectively
+excluded with the ``-LE`` flag or selected with the ``-L`` flag.
+
 .. admonition:: Recommendations and notes for YAML files
    :class: note
 
diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst
index 2157fe86c8..9648df402f 100644
--- a/doc/src/Build_extras.rst
+++ b/doc/src/Build_extras.rst
@@ -341,6 +341,18 @@ minutes to hours) to build.  Of course you only need to do that once.)
          $ make lib-kim args="-p /usr/local" # use an existing KIM API installation at the provided location
          $ make lib-kim args="-p /usr/local -a EAM_Dynamo_Ackland_W__MO_141627196590_002" # ditto but add one model or driver
 
+      When using the "-b " option, the KIM library is built using its native
+      cmake build system.  The ``lib/kim/Install.py`` script supports a
+      ``CMAKE`` environment variable if the cmake executable is named other
+      than ``cmake`` on your system.  Additional environment variables may be
+      provided on the command line for use by cmake.  For example, to use the
+      ``cmake3`` executable and tell it to use the gnu version 11 compilers
+      to build KIM, one could use the following command line.
+
+      .. code-block:: bash
+
+         $ CMAKE=cmake3 CXX=g++-11 CC=gcc-11 FC=gfortran-11 make lib-kim args="-b "  # (re-)install KIM API lib using cmake3 and gnu v11 compilers with only example models
+
       Settings for debugging OpenKIM web queries discussed below need to
       be applied by adding them to the ``LMP_INC`` variable through
       editing the ``Makefile.machine`` you are using.  For example:
@@ -560,11 +572,26 @@ They must be specified in uppercase.
    *  - VEGA908
       - GPU
       - AMD GPU MI100 GFX908
-   *  - INTEL_GEN
+   *  - VEGA90A
       - GPU
-      - Intel GPUs Gen9+
+      - AMD GPU
+   *  - INTEL_DG1
+      - GPU
+      - Intel Iris XeMAX GPU
+   *  - INTEL_GEN9
+      - GPU
+      - Intel GPU Gen9
+   *  - INTEL_GEN11
+      - GPU
+      - Intel GPU Gen11
+   *  - INTEL_GEN12LP
+      - GPU
+      - Intel GPU Gen12LP
+   *  - INTEL_XEHP
+      - GPU
+      - Intel GPUs Xe-HP
 
-This list was last updated for version 3.4.1 of the Kokkos library.
+This list was last updated for version 3.5.0 of the Kokkos library.
 
 .. tabs::
 
diff --git a/doc/src/Build_windows.rst b/doc/src/Build_windows.rst
index fa2296d302..4bb5cfec27 100644
--- a/doc/src/Build_windows.rst
+++ b/doc/src/Build_windows.rst
@@ -89,6 +89,11 @@ miss the correct master ``CMakeLists.txt``.  Try to open the
 starting point.  It is also possible to configure and compile LAMMPS
 from the command line with a CMake binary from `cmake.org <https://cmake.org>`_.
 
+Please note, that for either approach CMake will create a so-called
+:ref:`"multi-configuration" build environment <cmake_multiconfig>`, and
+the command lines for building and testing LAMMPS must be adjusted
+accordingly.
+
 To support running in parallel you can compile with OpenMP enabled using
 the OPENMP package or install Microsoft MPI (including the SDK) and compile
 LAMMPS with MPI enabled.
diff --git a/doc/src/Commands_bond.rst b/doc/src/Commands_bond.rst
index c28c9db864..40b99a5fb8 100644
--- a/doc/src/Commands_bond.rst
+++ b/doc/src/Commands_bond.rst
@@ -37,6 +37,7 @@ OPT.
    * :doc:`class2 (ko) <bond_class2>`
    * :doc:`fene (iko) <bond_fene>`
    * :doc:`fene/expand (o) <bond_fene_expand>`
+   * :doc:`fene/nm <bond_fene>`
    * :doc:`gaussian <bond_gaussian>`
    * :doc:`gromos (o) <bond_gromos>`
    * :doc:`harmonic (iko) <bond_harmonic>`
diff --git a/doc/src/Commands_compute.rst b/doc/src/Commands_compute.rst
index 0c60883314..f1f0597f30 100644
--- a/doc/src/Commands_compute.rst
+++ b/doc/src/Commands_compute.rst
@@ -28,6 +28,7 @@ KOKKOS, o = OPENMP, t = OPT.
    * :doc:`angle <compute_angle>`
    * :doc:`angle/local <compute_angle_local>`
    * :doc:`angmom/chunk <compute_angmom_chunk>`
+   * :doc:`ave/sphere/atom (k) <compute_ave_sphere_atom>`
    * :doc:`basal/atom <compute_basal_atom>`
    * :doc:`body/local <compute_body_local>`
    * :doc:`bond <compute_bond>`
diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst
index 7cf4e7635b..9ac4fc851c 100644
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@@ -210,6 +210,7 @@ OPT.
    * :doc:`nm/cut (o) <pair_nm>`
    * :doc:`nm/cut/coul/cut (o) <pair_nm>`
    * :doc:`nm/cut/coul/long (o) <pair_nm>`
+   * :doc:`nm/cut/split <pair_nm>`
    * :doc:`oxdna/coaxstk <pair_oxdna>`
    * :doc:`oxdna/excv <pair_oxdna>`
    * :doc:`oxdna/hbond <pair_oxdna>`
@@ -262,6 +263,7 @@ OPT.
    * :doc:`spin/neel <pair_spin_neel>`
    * :doc:`srp <pair_srp>`
    * :doc:`sw (giko) <pair_sw>`
+   * :doc:`sw/mod (o) <pair_sw>`
    * :doc:`table (gko) <pair_table>`
    * :doc:`table/rx (k) <pair_table_rx>`
    * :doc:`tdpd <pair_mesodpd>`
diff --git a/doc/src/Developer_platform.rst b/doc/src/Developer_platform.rst
index c9ecd30cec..cdc4bb6770 100644
--- a/doc/src/Developer_platform.rst
+++ b/doc/src/Developer_platform.rst
@@ -118,6 +118,9 @@ Environment variable functions
 .. doxygenfunction:: putenv
    :project: progguide
 
+.. doxygenfunction:: unsetenv
+   :project: progguide
+
 .. doxygenfunction:: list_pathenv
    :project: progguide
 
diff --git a/doc/src/Developer_utils.rst b/doc/src/Developer_utils.rst
index db47a9e3c3..a9969b7543 100644
--- a/doc/src/Developer_utils.rst
+++ b/doc/src/Developer_utils.rst
@@ -56,11 +56,11 @@ String to number conversions with validity check
 
 These functions should be used to convert strings to numbers. They are
 are strongly preferred over C library calls like ``atoi()`` or
-``atof()`` since they check if the **entire** provided string is a valid
+``atof()`` since they check if the **entire** string is a valid
 (floating-point or integer) number, and will error out instead of
 silently returning the result of a partial conversion or zero in cases
-where the string is not a valid number.  This behavior allows to more
-easily detect typos or issues when processing input files.
+where the string is not a valid number.  This behavior improves
+detecting typos or issues when processing input files.
 
 Similarly the :cpp:func:`logical() <LAMMPS_NS::utils::logical>` function
 will convert a string into a boolean and will only accept certain words.
@@ -76,19 +76,34 @@ strings for compliance without conversion.
 
 ----------
 
-.. doxygenfunction:: numeric
+.. doxygenfunction:: numeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp)
    :project: progguide
 
-.. doxygenfunction:: inumeric
+.. doxygenfunction:: numeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
    :project: progguide
 
-.. doxygenfunction:: bnumeric
+.. doxygenfunction:: inumeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp)
    :project: progguide
 
-.. doxygenfunction:: tnumeric
+.. doxygenfunction:: inumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
    :project: progguide
 
-.. doxygenfunction:: logical
+.. doxygenfunction:: bnumeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp)
+   :project: progguide
+
+.. doxygenfunction:: bnumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+   :project: progguide
+
+.. doxygenfunction:: tnumeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp)
+   :project: progguide
+
+.. doxygenfunction:: tnumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+   :project: progguide
+
+.. doxygenfunction:: logical(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp)
+   :project: progguide
+
+.. doxygenfunction:: logical(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
    :project: progguide
 
 
diff --git a/doc/src/Developer_write.rst b/doc/src/Developer_write.rst
index c3df6ad6bb..bdc6559060 100644
--- a/doc/src/Developer_write.rst
+++ b/doc/src/Developer_write.rst
@@ -55,7 +55,7 @@ of each timestep. First of all, implement a constructor:
      if (narg < 4)
        error->all(FLERR,"Illegal fix print/vel command");
 
-     nevery = force->inumeric(FLERR,arg[3]);
+     nevery = utils::inumeric(FLERR,arg[3],false,lmp);
      if (nevery <= 0)
        error->all(FLERR,"Illegal fix print/vel command");
    }
diff --git a/doc/src/Errors_messages.rst b/doc/src/Errors_messages.rst
index 3a593b5a3f..c06f4c86e3 100644
--- a/doc/src/Errors_messages.rst
+++ b/doc/src/Errors_messages.rst
@@ -7772,9 +7772,6 @@ keyword to allow for additional bonds to be formed
    The system size must fit in a 32-bit integer to use this dump
    style.
 
-*Too many atoms to dump sort*
-   Cannot sort when running with more than 2\^31 atoms.
-
 *Too many elements extracted from MEAM library.*
    Increase 'maxelt' in meam.h and recompile.
 
diff --git a/doc/src/Howto_drude2.rst b/doc/src/Howto_drude2.rst
index 589e9d7b9a..00289a989a 100644
--- a/doc/src/Howto_drude2.rst
+++ b/doc/src/Howto_drude2.rst
@@ -491,11 +491,6 @@ NPT ensemble using Nose-Hoover thermostat:
 **(Schroeder)**  Schroeder and Steinhauser, J Chem Phys, 133,
 154511 (2010).
 
-.. _Jiang2:
-
-**(Jiang)** Jiang, Hardy, Phillips, MacKerell, Schulten, and Roux,
- J Phys Chem Lett, 2, 87-92 (2011).
-
 .. _Thole2:
 
 **(Thole)** Chem Phys, 59, 341 (1981).
diff --git a/doc/src/Howto_github.rst b/doc/src/Howto_github.rst
index 278b9e4bfd..315bacac69 100644
--- a/doc/src/Howto_github.rst
+++ b/doc/src/Howto_github.rst
@@ -141,7 +141,8 @@ unrelated feature, you should switch branches!
    Committing changes to the *develop*, *release*, or *stable* branches
    is strongly discouraged.  While it may be convenient initially, it
    will create more work in the long run.  Various texts and tutorials
-   on using git effectively discuss the motivation for this.
+   on using git effectively discuss the motivation for using feature
+   branches instead.
 
 **After changes are made**
 
diff --git a/doc/src/Install_git.rst b/doc/src/Install_git.rst
index 4e7db77873..a5dc19fe79 100644
--- a/doc/src/Install_git.rst
+++ b/doc/src/Install_git.rst
@@ -28,8 +28,9 @@ provides `limited support for subversion clients <svn_>`_.
 
 You can follow the LAMMPS development on 3 different git branches:
 
-* **stable**   :  this branch is updated with every stable release;
-  updates are always "fast forward" merges from *develop*
+* **stable**   :  this branch is updated from the *release* branch with
+  every stable release version and also has selected bug fixes and updates
+  back-ported from the *develop* branch
 * **release**  :  this branch is updated with every patch release;
   updates are always "fast forward" merges from *develop*
 * **develop**  :  this branch follows the ongoing development and
@@ -47,20 +48,22 @@ your machine and "release" is one of the 3 branches listed above.
 (Note that you actually download all 3 branches; you can switch
 between them at any time using "git checkout <branch name>".)
 
-.. note::
+.. admonition:: Saving time and disk space when using ``git clone``
 
    The complete git history of the LAMMPS project is quite large because
    it contains the entire commit history of the project since fall 2006,
-   which includes the time when LAMMPS was managed with subversion. This
-   also includes commits that have added and removed some large files
-   (mostly by accident).  If you do not need access to the entire commit
-   history, you can speed up the "cloning" process and reduce local disk
-   space requirements by using the *--depth* git command line flag thus
-   create a "shallow clone" of the repository that contains only a
-   subset of the git history. Using a depth of 1000 is usually sufficient
-   to include the head commits of the *develop* and the *release* branches.
-   To include the head commit of the *stable* branch you may need a depth
-   of up to 10000.
+   which includes the time when LAMMPS was managed with subversion.
+   This includes a few commits that have added and removed some large
+   files (mostly by accident).  If you do not need access to the entire
+   commit history (most people don't), you can speed up the "cloning"
+   process and reduce local disk space requirements by using the
+   *--depth* git command line flag.  That will create a "shallow clone"
+   of the repository containing only a subset of the git history.  Using
+   a depth of 1000 is usually sufficient to include the head commits of
+   the *develop* and the *release* branches.  To include the head commit
+   of the *stable* branch you may need a depth of up to 10000.  If you
+   later need more of the git history, you can always convert the
+   shallow clone into a "full clone".
 
 Once the command completes, your directory will contain the same files
 as if you unpacked a current LAMMPS tarball, with the exception, that
@@ -156,9 +159,9 @@ changed.  How to do this depends on the build system you are using.
 .. admonition:: Git protocols
    :class: note
 
-   The servers at github.com support the "git://" and "https://" access
-   protocols for anonymous, read-only access.  If you have a suitably
-   configured GitHub account, you may also use SSH protocol with the
+   The servers at github.com support the "https://" access protocol for
+   anonymous, read-only access.  If you have a suitably configured GitHub
+   account, you may also use SSH protocol with the
    URL "git@github.com:lammps/lammps.git".
 
 The LAMMPS GitHub project is currently managed by Axel Kohlmeyer
diff --git a/doc/src/Intro_citing.rst b/doc/src/Intro_citing.rst
index 0e10b7559a..08f82fac33 100644
--- a/doc/src/Intro_citing.rst
+++ b/doc/src/Intro_citing.rst
@@ -16,7 +16,7 @@ source code design, the program structure, the spatial decomposition
 approach, the neighbor finding, basic communications algorithms, and how
 users and developers have contributed to LAMMPS is:
 
-  `LAMMPS - A flexible simulation tool for particle-based materials modeling at the atomic, meso, and continuum scales, Comp. Phys. Comm. (accepted 09/2021), DOI:10.1016/j.cpc.2021.108171 <https://doi.org/10.1016/j.cpc.2021.108171>`_
+  `LAMMPS - A flexible simulation tool for particle-based materials modeling at the atomic, meso, and continuum scales, Comp. Phys. Comm. 271, 108171 (2022) <https://doi.org/10.1016/j.cpc.2021.108171>`_
 
 So a project using LAMMPS or a derivative application that uses LAMMPS
 as a simulation engine should cite this paper.  The paper is expected to
diff --git a/doc/src/Manual_version.rst b/doc/src/Manual_version.rst
index b705ce8c4a..78ed61cd7c 100644
--- a/doc/src/Manual_version.rst
+++ b/doc/src/Manual_version.rst
@@ -10,23 +10,31 @@ Whenever we fix a bug or update or add a feature, it will be merged into
 the *develop* branch of the git repository.  When a sufficient number of
 changes have accumulated *and* the software passes a set of automated
 tests, we release it in the next *patch* release, which are made every
-few weeks.  Info on patch releases are on `this website page
+few weeks.  The *release* branch of the git repository is updated with
+every such release.  Info on patch releases are on `this website page
 <https://www.lammps.org/bug.html>`_.
 
-Once or twice a year, only bug fixes and small, non-intrusive changes are
-included for a period of time, and the code is subjected to more detailed
+Once or twice a year, we apply only bug fixes and small, non-intrusive
+changes to the *develop* branch and the code is subjected to more detailed
 and thorough testing than the default automated testing.  The latest
-patch release after such a period is then labeled as a *stable* version.
+patch release after such a period is then also labeled as a *stable* version
+and the *stable* branch is updated with it.  Between stable releases
+we occasionally release some updates to the stable release containing
+only bug fixes and updates back-ported from *develop* but no new features
+and update the *stable* branch accordingly.
 
-Each version of LAMMPS contains all the features and bug-fixes up to
-and including its version date.
+Each version of LAMMPS contains all the documented features up to and
+including its version date.
 
 The version date is printed to the screen and logfile every time you
 run LAMMPS. It is also in the file src/version.h and in the LAMMPS
 directory name created when you unpack a tarball.  And it is on the
 first page of the :doc:`manual <Manual>`.
 
-* If you browse the HTML pages on the LAMMPS WWW site, they always
-  describe the most current patch release of LAMMPS.
+* If you browse the HTML pages on the LAMMPS WWW site, they will by
+  default describe the most current patch release version of LAMMPS.
+  In the navigation bar on the bottom left, there is the option to
+  view instead the documentation for the most recent *stable* version
+  or the latest version from the current development branch.
 * If you browse the HTML pages included in your tarball, they
   describe the version you have, which may be older.
diff --git a/doc/src/Modify_pair.rst b/doc/src/Modify_pair.rst
index 7263b8fd48..6913204504 100644
--- a/doc/src/Modify_pair.rst
+++ b/doc/src/Modify_pair.rst
@@ -12,24 +12,24 @@ includes some optional methods to enable its use with rRESPA.
 
 Here is a brief description of the class methods in pair.h:
 
-+---------------------------------+-------------------------------------------------------------------+
-| compute                         | workhorse routine that computes pairwise interactions             |
-+---------------------------------+-------------------------------------------------------------------+
-| settings                        | reads the input script line with arguments you define             |
-+---------------------------------+-------------------------------------------------------------------+
-| coeff                           | set coefficients for one i,j type pair                            |
-+---------------------------------+-------------------------------------------------------------------+
-| init_one                        | perform initialization for one i,j type pair                      |
-+---------------------------------+-------------------------------------------------------------------+
-| init_style                      | initialization specific to this pair style                        |
-+---------------------------------+-------------------------------------------------------------------+
-| write & read_restart            | write/read i,j pair coeffs to restart files                       |
-+---------------------------------+-------------------------------------------------------------------+
-| write & read_restart_settings   | write/read global settings to restart files                       |
-+---------------------------------+-------------------------------------------------------------------+
-| single                          | force and energy of a single pairwise interaction between 2 atoms |
-+---------------------------------+-------------------------------------------------------------------+
-| compute_inner/middle/outer      | versions of compute used by rRESPA                                |
-+---------------------------------+-------------------------------------------------------------------+
++---------------------------------+---------------------------------------------------------------------+
+| compute                         | workhorse routine that computes pairwise interactions               |
++---------------------------------+---------------------------------------------------------------------+
+| settings                        | reads the input script line with arguments you define               |
++---------------------------------+---------------------------------------------------------------------+
+| coeff                           | set coefficients for one i,j type pair                              |
++---------------------------------+---------------------------------------------------------------------+
+| init_one                        | perform initialization for one i,j type pair                        |
++---------------------------------+---------------------------------------------------------------------+
+| init_style                      | initialization specific to this pair style                          |
++---------------------------------+---------------------------------------------------------------------+
+| write & read_restart            | write/read i,j pair coeffs to restart files                         |
++---------------------------------+---------------------------------------------------------------------+
+| write & read_restart_settings   | write/read global settings to restart files                         |
++---------------------------------+---------------------------------------------------------------------+
+| single                          | force/r and energy of a single pairwise interaction between 2 atoms |
++---------------------------------+---------------------------------------------------------------------+
+| compute_inner/middle/outer      | versions of compute used by rRESPA                                  |
++---------------------------------+---------------------------------------------------------------------+
 
 The inner/middle/outer routines are optional.
diff --git a/doc/src/Packages_details.rst b/doc/src/Packages_details.rst
index e5e548a341..eb5cae0443 100644
--- a/doc/src/Packages_details.rst
+++ b/doc/src/Packages_details.rst
@@ -1907,6 +1907,12 @@ MPIIO library.  It adds :doc:`dump styles <dump>` with a "mpiio" in
 their style name.  Restart files with an ".mpiio" suffix are also
 written and read in parallel.
 
+.. warning::
+
+   The MPIIO package is currently unmaintained and has become
+   unreliable. Use with caution.
+
+
 **Install:**
 
 The MPIIO package requires that LAMMPS is build in :ref:`MPI parallel mode <serial>`.
diff --git a/doc/src/angle_class2.rst b/doc/src/angle_class2.rst
index f257d96dc3..4e8e515564 100644
--- a/doc/src/angle_class2.rst
+++ b/doc/src/angle_class2.rst
@@ -64,34 +64,44 @@ These are the 4 coefficients for the :math:`E_a` formula:
 radians internally; hence the various :math:`K` are effectively energy
 per radian\^2 or radian\^3 or radian\^4.
 
-For the :math:`E_{bb}` formula, each line in a :doc:`angle_coeff <angle_coeff>`
-command in the input script lists 4 coefficients, the first of which
-is "bb" to indicate they are BondBond coefficients.  In a data file,
-these coefficients should be listed under a "BondBond Coeffs" heading
-and you must leave out the "bb", i.e. only list 3 coefficients after
-the angle type.
+For the :math:`E_{bb}` formula, each line in a :doc:`angle_coeff
+<angle_coeff>` command in the input script lists 4 coefficients, the
+first of which is "bb" to indicate they are BondBond coefficients.  In
+a data file, these coefficients should be listed under a "BondBond
+Coeffs" heading and you must leave out the "bb", i.e. only list 3
+coefficients after the angle type.
 
 * bb
 * :math:`M` (energy/distance\^2)
 * :math:`r_1` (distance)
 * :math:`r_2` (distance)
 
-For the :math:`E_{ba}` formula, each line in a :doc:`angle_coeff <angle_coeff>`
-command in the input script lists 5 coefficients, the first of which
-is "ba" to indicate they are BondAngle coefficients.  In a data file,
-these coefficients should be listed under a "BondAngle Coeffs" heading
-and you must leave out the "ba", i.e. only list 4 coefficients after
-the angle type.
+For the :math:`E_{ba}` formula, each line in a :doc:`angle_coeff
+<angle_coeff>` command in the input script lists 5 coefficients, the
+first of which is "ba" to indicate they are BondAngle coefficients.
+In a data file, these coefficients should be listed under a "BondAngle
+Coeffs" heading and you must leave out the "ba", i.e. only list 4
+coefficients after the angle type.
 
 * ba
-* :math:`N_1` (energy/distance\^2)
-* :math:`N_2` (energy/distance\^2)
+* :math:`N_1` (energy/distance)
+* :math:`N_2` (energy/distance)
 * :math:`r_1` (distance)
 * :math:`r_2` (distance)
 
 The :math:`\theta_0` value in the :math:`E_{ba}` formula is not specified,
 since it is the same value from the :math:`E_a` formula.
 
+.. note::
+
+   It is important that the order of the I,J,K atoms in each angle
+   listed in the Angles section of the data file read by the
+   :doc:`read_data <read_data>` command be consistent with the order
+   of the :math:`r_1` and :math:`r_2` BondBond and BondAngle
+   coefficients.  This is because the terms in the formulas for
+   :math:`E_{bb}` and :math:`E_{ba}` will use the I,J atoms to compute
+   :math:`r_{ij}` and the J,K atoms to compute :math:`r_{jk}`.
+
 ----------
 
 .. include:: accel_styles.rst
diff --git a/doc/src/bond_fene.rst b/doc/src/bond_fene.rst
index 108f538628..be7775489a 100644
--- a/doc/src/bond_fene.rst
+++ b/doc/src/bond_fene.rst
@@ -1,4 +1,5 @@
 .. index:: bond_style fene
+.. index:: bond_style fene/nm
 .. index:: bond_style fene/intel
 .. index:: bond_style fene/kk
 .. index:: bond_style fene/omp
@@ -8,12 +9,16 @@ bond_style fene command
 
 Accelerator Variants: *fene/intel*, *fene/kk*, *fene/omp*
 
+bond_style fene/nm command
+==========================
+
 Syntax
 """"""
 
 .. code-block:: LAMMPS
 
    bond_style fene
+   bond_style fene/nm
 
 Examples
 """"""""
@@ -23,6 +28,9 @@ Examples
    bond_style fene
    bond_coeff 1 30.0 1.5 1.0 1.0
 
+   bond_style fene/nm
+   bond_coeff 1 2.25344 1.5 1.0 1.12246 2 6
+
 Description
 """""""""""
 
@@ -38,16 +46,36 @@ term is attractive, the second Lennard-Jones term is repulsive.  The
 first term extends to :math:`R_0`, the maximum extent of the bond.  The second
 term is cutoff at :math:`2^\frac{1}{6} \sigma`, the minimum of the LJ potential.
 
-The following coefficients must be defined for each bond type via the
-:doc:`bond_coeff <bond_coeff>` command as in the example above, or in
-the data file or restart files read by the :doc:`read_data <read_data>`
-or :doc:`read_restart <read_restart>` commands:
+The *fene/nm* bond style substitutes the standard LJ potential with the generalized LJ potential
+in the same form as in pair style :doc:`nm/cut <pair_nm>`. The bond energy is then given by
+
+.. math::
+
+  E = -0.5 K r_0^2  \ln \left[ 1 - \left(\frac{r}{R_0}\right)^2\right] + \frac{E_0}{(n-m)} \left[ m \left(\frac{r_0}{r}\right)^n - n \left(\frac{r_0}{r}\right)^m \right]
+
+Similar to the *fene* style, the generalized Lennard-Jones is cut off at
+the potential minimum, :math:`r_0`, to be repulsive only.  The following
+coefficients must be defined for each bond type via the :doc:`bond_coeff
+<bond_coeff>` command as in the example above, or in the data file or
+restart files read by the :doc:`read_data <read_data>` or
+:doc:`read_restart <read_restart>` commands:
 
 * :math:`K` (energy/distance\^2)
 * :math:`R_0` (distance)
 * :math:`\epsilon` (energy)
 * :math:`\sigma` (distance)
 
+For the *fene/nm* style, the following coefficients are used.  Please
+note, that the standard LJ potential and thus the regular FENE potential
+is recovered for (n=12 m=6) and :math:`r_0 = 2^\frac{1}{6} \sigma`.
+
+* :math:`K` (energy/distance\^2)
+* :math:`R_0` (distance)
+* :math:`E_0` (energy)
+* :math:`r_0` (distance)
+* :math:`n` (unitless)
+* :math:`m` (unitless)
+
 ----------
 
 .. include:: accel_styles.rst
@@ -57,9 +85,10 @@ or :doc:`read_restart <read_restart>` commands:
 Restrictions
 """"""""""""
 
-This bond style can only be used if LAMMPS was built with the MOLECULE
-package.  See the :doc:`Build package <Build_package>` page for more
-info.
+The *fene* bond style can only be used if LAMMPS was built with the MOLECULE
+package; the *fene/nm* bond style can only be used if LAMMPS was built
+with the EXTRA-MOLECULE package. See the :doc:`Build package <Build_package>`
+page for more info.
 
 You typically should specify :doc:`special_bonds fene <special_bonds>`
 or :doc:`special_bonds lj/coul 0 1 1 <special_bonds>` to use this bond
@@ -68,7 +97,8 @@ style.  LAMMPS will issue a warning it that's not the case.
 Related commands
 """"""""""""""""
 
-:doc:`bond_coeff <bond_coeff>`, :doc:`delete_bonds <delete_bonds>`
+:doc:`bond_coeff <bond_coeff>`, :doc:`delete_bonds <delete_bonds>`,
+:doc:`pair style lj/cut <pair_lj>`, :doc:`pair style nm/cut <pair_nm>`.
 
 Default
 """""""
diff --git a/doc/src/bond_style.rst b/doc/src/bond_style.rst
index 177dc8cc05..4dee48a78d 100644
--- a/doc/src/bond_style.rst
+++ b/doc/src/bond_style.rst
@@ -87,6 +87,7 @@ accelerated styles exist.
 * :doc:`class2 <bond_class2>` - COMPASS (class 2) bond
 * :doc:`fene <bond_fene>` - FENE (finite-extensible non-linear elastic) bond
 * :doc:`fene/expand <bond_fene_expand>` - FENE bonds with variable size particles
+* :doc:`fene/nm <bond_fene>` - FENE bonds with a generalized Lennard-Jones potential
 * :doc:`gaussian <bond_gaussian>` - multicentered Gaussian-based bond potential
 * :doc:`gromos <bond_gromos>` - GROMOS force field bond
 * :doc:`harmonic <bond_harmonic>` - harmonic bond
diff --git a/doc/src/compute.rst b/doc/src/compute.rst
index 0b8249cc7d..9a501127c2 100644
--- a/doc/src/compute.rst
+++ b/doc/src/compute.rst
@@ -174,6 +174,7 @@ The individual style names on the :doc:`Commands compute <Commands_compute>` pag
 * :doc:`angle <compute_angle>` - energy of each angle sub-style
 * :doc:`angle/local <compute_angle_local>` - theta and energy of each angle
 * :doc:`angmom/chunk <compute_angmom_chunk>` - angular momentum for each chunk
+* :doc:`ave/sphere/atom <compute_ave_sphere_atom>` - compute local density and temperature around each atom
 * :doc:`basal/atom <compute_basal_atom>` - calculates the hexagonal close-packed "c" lattice vector of each atom
 * :doc:`body/local <compute_body_local>` - attributes of body sub-particles
 * :doc:`bond <compute_bond>` - energy of each bond sub-style
diff --git a/doc/src/compute_ave_sphere_atom.rst b/doc/src/compute_ave_sphere_atom.rst
new file mode 100644
index 0000000000..db04682865
--- /dev/null
+++ b/doc/src/compute_ave_sphere_atom.rst
@@ -0,0 +1,101 @@
+.. index:: compute ave/sphere/atom
+.. index:: compute ave/sphere/atom/kk
+
+compute ave/sphere/atom command
+================================
+
+Accelerator Variants: *ave/sphere/atom/kk*
+
+Syntax
+""""""
+
+.. parsed-literal::
+
+   compute ID group-ID ave/sphere/atom keyword values ...
+
+* ID, group-ID are documented in :doc:`compute <compute>` command
+* ave/sphere/atom = style name of this compute command
+* one or more keyword/value pairs may be appended
+
+  .. parsed-literal::
+
+     keyword = *cutoff*
+       *cutoff* value = distance cutoff
+
+Examples
+""""""""
+
+.. code-block:: LAMMPS
+
+   compute 1 all ave/sphere/atom
+
+   compute 1 all ave/sphere/atom cutoff 5.0
+   comm_modify cutoff 5.0
+
+Description
+"""""""""""
+
+Define a computation that calculates the local density and temperature
+for each atom and neighbors inside a spherical cutoff.
+
+The optional keyword *cutoff* defines the distance cutoff
+used when searching for neighbors. The default value is the cutoff
+specified by the pair style. If no pair style is defined, then a cutoff
+must be defined using this keyword. If the specified cutoff is larger than
+that of the pair_style plus neighbor skin (or no pair style is defined),
+the *comm_modify cutoff* option must also be set to match that of the
+*cutoff* keyword.
+
+The neighbor list needed to compute this quantity is constructed each
+time the calculation is performed (i.e. each time a snapshot of atoms
+is dumped).  Thus it can be inefficient to compute/dump this quantity
+too frequently.
+
+.. note::
+
+   If you have a bonded system, then the settings of
+   :doc:`special_bonds <special_bonds>` command can remove pairwise
+   interactions between atoms in the same bond, angle, or dihedral.  This
+   is the default setting for the :doc:`special_bonds <special_bonds>`
+   command, and means those pairwise interactions do not appear in the
+   neighbor list.  Because this fix uses the neighbor list, it also means
+   those pairs will not be included in the order parameter.  This
+   difficulty can be circumvented by writing a dump file, and using the
+   :doc:`rerun <rerun>` command to compute the order parameter for
+   snapshots in the dump file.  The rerun script can use a
+   :doc:`special_bonds <special_bonds>` command that includes all pairs in
+   the neighbor list.
+
+----------
+
+
+.. include:: accel_styles.rst
+
+
+----------
+
+Output info
+"""""""""""
+
+This compute calculates a per-atom array with two columns: density and temperature.
+
+These values can be accessed by any command that uses per-atom values
+from a compute as input.  See the :doc:`Howto output <Howto_output>` doc
+page for an overview of LAMMPS output options.
+
+Restrictions
+""""""""""""
+
+This compute is part of the EXTRA-COMPUTE package.  It is only enabled if
+LAMMPS was built with that package.  See the :doc:`Build package <Build_package>` page for more info.
+
+Related commands
+""""""""""""""""
+
+:doc:`comm_modify <comm_modify>`
+
+Default
+"""""""
+
+The option defaults are *cutoff* = pair style cutoff
+
diff --git a/doc/src/compute_bond_local.rst b/doc/src/compute_bond_local.rst
index 8bdde70dd9..24b0943484 100644
--- a/doc/src/compute_bond_local.rst
+++ b/doc/src/compute_bond_local.rst
@@ -13,7 +13,7 @@ Syntax
 * ID, group-ID are documented in :doc:`compute <compute>` command
 * bond/local = style name of this compute command
 * one or more values may be appended
-* value = *dist* or *engpot* or *force* or *fx* or *fy* or *fz* or *engvib* or *engrot* or *engtrans* or *omega* or *velvib* or *v_name*
+* value = *dist* or *dx* or *dy* or *dz* or *engpot* or *force* or *fx* or *fy* or *fz* or *engvib* or *engrot* or *engtrans* or *omega* or *velvib* or *v_name*
 
 .. parsed-literal::
 
@@ -21,6 +21,7 @@ Syntax
      *engpot* = bond potential energy
      *force* = bond force
 
+     *dx*,\ *dy*,\ *dz* = components of pairwise distance
      *fx*,\ *fy*,\ *fz* = components of bond force
      *engvib* = bond kinetic energy of vibration
      *engrot* = bond kinetic energy of rotation
@@ -63,6 +64,9 @@ whether the 2 atoms represent a simple diatomic molecule, or are part
 of some larger molecule.
 
 The value *dist* is the current length of the bond.
+The values *dx*, *dy*, and *dz* are the xyz components of the
+*distance* between the pair of atoms. This value is always the
+distance from the atom of lower to the one with the higher id.
 
 The value *engpot* is the potential energy for the bond,
 based on the current separation of the pair of atoms in the bond.
diff --git a/doc/src/compute_heat_flux.rst b/doc/src/compute_heat_flux.rst
index 94d6f09700..56975adc70 100644
--- a/doc/src/compute_heat_flux.rst
+++ b/doc/src/compute_heat_flux.rst
@@ -89,13 +89,20 @@ included in the calculation.
 .. warning::
 
    The compute *heat/flux* has been reported to produce unphysical
-   values for angle, dihedral and improper contributions
+   values for angle, dihedral, improper and constraint force contributions
    when used with :doc:`compute stress/atom <compute_stress_atom>`,
-   as discussed in :ref:`(Surblys) <Surblys2>` and :ref:`(Boone) <Boone>`.
-   You are strongly advised to
+   as discussed in :ref:`(Surblys2019) <Surblys3>`, :ref:`(Boone) <Boone>`
+   and :ref:`(Surblys2021) <Surblys4>`. You are strongly advised to
    use :doc:`compute centroid/stress/atom <compute_stress_atom>`,
    which has been implemented specifically for such cases.
 
+.. warning::
+
+   Due to an implementation detail, the :math:`y` and :math:`z`
+   components of heat flux from :doc:`fix rigid <fix_rigid>`
+   contribution when computed via :doc:`compute stress/atom <compute_stress_atom>`
+   are highly unphysical and should not be used.
+
 The Green-Kubo formulas relate the ensemble average of the
 auto-correlation of the heat flux :math:`\mathbf{J}`
 to the thermal conductivity :math:`\kappa`:
@@ -232,10 +239,14 @@ none
 
 ----------
 
-.. _Surblys2:
+.. _Surblys3:
 
-**(Surblys)** Surblys, Matsubara, Kikugawa, Ohara, Phys Rev E, 99, 051301(R) (2019).
+**(Surblys2019)** Surblys, Matsubara, Kikugawa, Ohara, Phys Rev E, 99, 051301(R) (2019).
 
 .. _Boone:
 
 **(Boone)** Boone, Babaei, Wilmer, J Chem Theory Comput, 15, 5579--5587 (2019).
+
+.. _Surblys4:
+
+**(Surblys2021)** Surblys, Matsubara, Kikugawa, Ohara, J Appl Phys 130, 215104 (2021).
diff --git a/doc/src/compute_pair_local.rst b/doc/src/compute_pair_local.rst
index f464c7cec6..38953d203c 100644
--- a/doc/src/compute_pair_local.rst
+++ b/doc/src/compute_pair_local.rst
@@ -13,11 +13,12 @@ Syntax
 * ID, group-ID are documented in :doc:`compute <compute>` command
 * pair/local = style name of this compute command
 * one or more values may be appended
-* value = *dist* or *eng* or *force* or *fx* or *fy* or *fz* or *pN*
+* value = *dist* or *dx* or *dy* or *dz* or *eng* or *force* or *fx* or *fy* or *fz* or *pN*
 
   .. parsed-literal::
 
        *dist* = pairwise distance
+       *dx*,\ *dy*,\ *dz* = components of pairwise distance
        *eng* = pairwise energy
        *force* = pairwise force
        *fx*,\ *fy*,\ *fz* = components of pairwise force
@@ -56,6 +57,9 @@ force cutoff distance for that interaction, as defined by the
 commands.
 
 The value *dist* is the distance between the pair of atoms.
+The values *dx*, *dy*, and *dz* are the xyz components of the
+*distance* between the pair of atoms. This value is always the
+distance from the atom of lower to the one with the higher id.
 
 The value *eng* is the interaction energy for the pair of atoms.
 
@@ -89,10 +93,10 @@ from the second of the two sub-styles.  If the referenced *pN*
 is not computed for the specific pairwise interaction (based on
 atom types), then the output will be 0.0.
 
-The value *dist* will be in distance :doc:`units <units>`.  The value
-*eng* will be in energy :doc:`units <units>`.  The values *force*, *fx*,
-*fy*, and *fz* will be in force :doc:`units <units>`.  The values *pN*
-will be in whatever units the pair style defines.
+The value *dist*, *dx*, *dy* and *dz* will be in distance :doc:`units <units>`.
+The value *eng* will be in energy :doc:`units <units>`.
+The values *force*, *fx*, *fy*, and *fz* will be in force :doc:`units <units>`.
+The values *pN* will be in whatever units the pair style defines.
 
 The optional *cutoff* keyword determines how the force cutoff distance
 for an interaction is determined.  For the default setting of *type*,
diff --git a/doc/src/compute_stress_atom.rst b/doc/src/compute_stress_atom.rst
index cdb464a9d0..2c8be0c05a 100644
--- a/doc/src/compute_stress_atom.rst
+++ b/doc/src/compute_stress_atom.rst
@@ -87,6 +87,10 @@ Tersoff 3-body interaction) is assigned in equal portions to each atom
 in the set.  E.g. 1/4 of the dihedral virial to each of the 4 atoms,
 or 1/3 of the fix virial due to SHAKE constraints applied to atoms in
 a water molecule via the :doc:`fix shake <fix_shake>` command.
+As an exception, the virial contribution from
+constraint forces in :doc:`fix rigid <fix_rigid>` on each atom
+is computed from the constraint force acting on the corresponding atom
+and its position, i.e. the total virial is not equally distributed.
 
 In case of compute *centroid/stress/atom*, the virial contribution is:
 
@@ -103,13 +107,25 @@ atom :math:`I` due to the interaction and the relative position
 :math:`\mathbf{r}_{I0}` of the atom :math:`I` to the geometric center
 of the interacting atoms, i.e. centroid, is used.  As the geometric
 center is different for each interaction, the :math:`\mathbf{r}_{I0}`
-also differs.  The sixth and seventh terms, Kspace and :doc:`fix
-<fix>` contribution respectively, are computed identical to compute
-*stress/atom*.  Although the total system virial is the same as
+also differs. The sixth term, Kspace contribution,
+is computed identically to compute *stress/atom*.
+The seventh term is handed differently depending on
+if the constraint forces are due to :doc:`fix shake <fix_shake>`
+or :doc:`fix rigid <fix_rigid>`.
+In case of SHAKE constraints, each distance constraint is
+handed as a pairwise interaction.
+E.g. in case of a water molecule, two OH and one HH distance
+constraints are treated as three pairwise interactions.
+In case of :doc:`fix rigid <fix_rigid>`,
+all constraint forces in the molecule are treated
+as a single many-body interaction with a single centroid position.
+In case of water molecule, the formula expression would become
+identical to that of the three-body angle interaction.
+Although the total system virial is the same as
 compute *stress/atom*, compute *centroid/stress/atom* is know to
-result in more consistent heat flux values for angle, dihedrals and
-improper contributions when computed via :doc:`compute heat/flux
-<compute_heat_flux>`.
+result in more consistent heat flux values for angle, dihedrals,
+improper and constraint force contributions
+when computed via :doc:`compute heat/flux <compute_heat_flux>`.
 
 If no extra keywords are listed, the kinetic contribution all of the
 virial contribution terms are included in the per-atom stress tensor.
@@ -134,7 +150,8 @@ contribution for the cluster interaction is divided evenly among those
 atoms.
 
 Details of how compute *centroid/stress/atom* obtains the virial for
-individual atoms is given in :ref:`(Surblys) <Surblys1>`, where the
+individual atoms are given in :ref:`(Surblys2019) <Surblys1>` and
+:ref:`(Surblys2021) <Surblys2>`, where the
 idea is that the virial of the atom :math:`I` is the result of only
 the force :math:`\mathbf{F}_I` on the atom due to the interaction and
 its positional vector :math:`\mathbf{r}_{I0}`, relative to the
@@ -235,10 +252,10 @@ between the pair of particles.  All bond styles are supported.  All
 angle, dihedral, improper styles are supported with the exception of
 INTEL and KOKKOS variants of specific styles.  It also does not
 support models with long-range Coulombic or dispersion forces,
-i.e. the kspace_style command in LAMMPS.  It also does not support the
-following fixes which add rigid-body constraints: :doc:`fix shake
-<fix_shake>`, :doc:`fix rattle <fix_shake>`, :doc:`fix rigid
-<fix_rigid>`, :doc:`fix rigid/small <fix_rigid>`.
+i.e. the kspace_style command in LAMMPS.  It also does not implement the
+following fixes which add rigid-body constraints:
+:doc:`fix rigid/* <fix_rigid>` and the OpenMP accelerated version of :doc:`fix rigid/small <fix_rigid>`,
+while all other :doc:`fix rigid/*/small <fix_rigid>` are implemented.
 
 LAMMPS will generate an error if one of these options is included in
 your model.  Extension of centroid stress calculations to these force
@@ -270,4 +287,8 @@ none
 
 .. _Surblys1:
 
-**(Surblys)** Surblys, Matsubara, Kikugawa, Ohara, Phys Rev E, 99, 051301(R) (2019).
+**(Surblys2019)** Surblys, Matsubara, Kikugawa, Ohara, Phys Rev E, 99, 051301(R) (2019).
+
+.. _Surblys2:
+
+**(Surblys2021)** Surblys, Matsubara, Kikugawa, Ohara, J Appl Phys 130, 215104 (2021).
diff --git a/doc/src/delete_atoms.rst b/doc/src/delete_atoms.rst
index d47743071b..f78f295011 100644
--- a/doc/src/delete_atoms.rst
+++ b/doc/src/delete_atoms.rst
@@ -20,8 +20,10 @@ Syntax
          cutoff = delete one atom from pairs of atoms within the cutoff (distance units)
          group1-ID = one atom in pair must be in this group
          group2-ID = other atom in pair must be in this group
-       *porosity* args = region-ID fraction seed
+       *porosity* args = group-ID region-ID fraction seed
+         group-ID = group within which to perform deletions
          region-ID = region within which to perform deletions
+                     or NULL to only impose the group criterion
          fraction = delete this fraction of atoms
          seed = random number seed (positive integer)
 
@@ -43,7 +45,8 @@ Examples
    delete_atoms region sphere compress no
    delete_atoms overlap 0.3 all all
    delete_atoms overlap 0.5 solvent colloid
-   delete_atoms porosity cube 0.1 482793 bond yes
+   delete_atoms porosity all cube 0.1 482793 bond yes
+   delete_atoms porosity polymer cube 0.1 482793 bond yes
 
 Description
 """""""""""
@@ -76,12 +79,17 @@ have occurred that no atom pairs within the cutoff will remain
 minimum number of atoms will be deleted, or that the same atoms will
 be deleted when running on different numbers of processors.
 
-For style *porosity* a specified *fraction* of atoms are deleted
-within the specified region.  For example, if fraction is 0.1, then
-10% of the atoms will be deleted.  The atoms to delete are chosen
-randomly.  There is no guarantee that the exact fraction of atoms will
-be deleted, or that the same atoms will be deleted when running on
-different numbers of processors.
+For style *porosity* a specified *fraction* of atoms are deleted which
+are both in the specified group and within the specified region.  The
+region-ID can be specified as NULL to only impose the group criterion.
+Likewise, specifying the group-ID as *all* will only impose the region
+criterion.
+
+For example, if fraction is 0.1, then 10% of the eligible atoms will
+be deleted.  The atoms to delete are chosen randomly.  There is no
+guarantee that the exact fraction of atoms will be deleted, or that
+the same atoms will be deleted when running on different numbers of
+processors.
 
 If the *compress* keyword is set to *yes*, then after atoms are
 deleted, then atom IDs are re-assigned so that they run from 1 to the
@@ -89,8 +97,8 @@ number of atoms in the system.  Note that this is not done for
 molecular systems (see the :doc:`atom_style <atom_style>` command),
 regardless of the *compress* setting, since it would foul up the bond
 connectivity that has already been assigned.  However, the
-:doc:`reset_atom_ids <reset_atom_ids>` command can be used after this command to
-accomplish the same thing.
+:doc:`reset_atom_ids <reset_atom_ids>` command can be used after this
+command to accomplish the same thing.
 
 Note that the re-assignment of IDs is not really a compression, where
 gaps in atom IDs are removed by decrementing atom IDs that are larger.
@@ -100,15 +108,15 @@ the :doc:`create_atoms <create_atoms>` command explains.
 
 A molecular system with fixed bonds, angles, dihedrals, or improper
 interactions, is one where the topology of the interactions is
-typically defined in the data file read by the
-:doc:`read_data <read_data>` command, and where the interactions
-themselves are defined with the :doc:`bond_style <bond_style>`,
-:doc:`angle_style <angle_style>`, etc commands.  If you delete atoms
-from such a system, you must be careful not to end up with bonded
-interactions that are stored by remaining atoms but which include
-deleted atoms.  This will cause LAMMPS to generate a "missing atoms"
-error when the bonded interaction is computed.  The *bond* and *mol*
-keywords offer two ways to do that.
+typically defined in the data file read by the :doc:`read_data
+<read_data>` command, and where the interactions themselves are
+defined with the :doc:`bond_style <bond_style>`, :doc:`angle_style
+<angle_style>`, etc commands.  If you delete atoms from such a system,
+you must be careful not to end up with bonded interactions that are
+stored by remaining atoms but which include deleted atoms.  This will
+cause LAMMPS to generate a "missing atoms" error when the bonded
+interaction is computed.  The *bond* and *mol* keywords offer two ways
+to do that.
 
 It the *bond* keyword is set to *yes* then any bond or angle or
 dihedral or improper interaction that includes a deleted atom is also
diff --git a/doc/src/dump.rst b/doc/src/dump.rst
index c2509e6654..c94813a41e 100644
--- a/doc/src/dump.rst
+++ b/doc/src/dump.rst
@@ -137,7 +137,7 @@ Examples
    dump myDump all atom/gz 100 dump.atom.gz
    dump myDump all atom/zstd 100 dump.atom.zst
    dump 2 subgroup atom 50 dump.run.bin
-   dump 2 subgroup atom 50 dump.run.mpiio.bin
+   dump 2 subgroup atom/mpiio 50 dump.run.mpiio.bin
    dump 4a all custom 100 dump.myforce.* id type x y vx fx
    dump 4b flow custom 100 dump.%.myforce id type c_myF[3] v_ke
    dump 4b flow custom 100 dump.%.myforce id type c_myF[*] v_ke
@@ -169,11 +169,12 @@ or multiple smaller files).
 
 .. note::
 
-   Because periodic boundary conditions are enforced only on
-   timesteps when neighbor lists are rebuilt, the coordinates of an atom
-   written to a dump file may be slightly outside the simulation box.
-   Re-neighbor timesteps will not typically coincide with the timesteps
-   dump snapshots are written.  See the :doc:`dump_modify pbc <dump_modify>` command if you with to force coordinates to be
+   Because periodic boundary conditions are enforced only on timesteps
+   when neighbor lists are rebuilt, the coordinates of an atom written
+   to a dump file may be slightly outside the simulation box.
+   Re-neighbor timesteps will not typically coincide with the
+   timesteps dump snapshots are written.  See the :doc:`dump_modify
+   pbc <dump_modify>` command if you with to force coordinates to be
    strictly inside the simulation box.
 
 .. note::
@@ -189,20 +190,21 @@ or multiple smaller files).
    multiple processors, each of which owns a subset of the atoms.
 
 For the *atom*, *custom*, *cfg*, and *local* styles, sorting is off by
-default.  For the *dcd*, *xtc*, *xyz*, and *molfile* styles, sorting by
-atom ID is on by default. See the :doc:`dump_modify <dump_modify>` doc
-page for details.
+default.  For the *dcd*, *xtc*, *xyz*, and *molfile* styles, sorting
+by atom ID is on by default. See the :doc:`dump_modify <dump_modify>`
+doc page for details.
 
-The *atom/gz*, *cfg/gz*, *custom/gz*, *local/gz*, and *xyz/gz* styles are identical
-in command syntax to the corresponding styles without "gz", however,
-they generate compressed files using the zlib library. Thus the filename
-suffix ".gz" is mandatory. This is an alternative approach to writing
-compressed files via a pipe, as done by the regular dump styles, which
-may be required on clusters where the interface to the high-speed network
-disallows using the fork() library call (which is needed for a pipe).
-For the remainder of this doc page, you should thus consider the *atom*
-and *atom/gz* styles (etc) to be inter-changeable, with the exception
-of the required filename suffix.
+The *atom/gz*, *cfg/gz*, *custom/gz*, *local/gz*, and *xyz/gz* styles
+are identical in command syntax to the corresponding styles without
+"gz", however, they generate compressed files using the zlib
+library. Thus the filename suffix ".gz" is mandatory. This is an
+alternative approach to writing compressed files via a pipe, as done
+by the regular dump styles, which may be required on clusters where
+the interface to the high-speed network disallows using the fork()
+library call (which is needed for a pipe).  For the remainder of this
+doc page, you should thus consider the *atom* and *atom/gz* styles
+(etc) to be inter-changeable, with the exception of the required
+filename suffix.
 
 Similarly, the *atom/zstd*, *cfg/zstd*, *custom/zstd*, *local/zstd*,
 and *xyz/zstd* styles are identical to the gz styles, but use the Zstd
@@ -219,6 +221,11 @@ you should thus consider the *atom* and *atom/mpiio* styles (etc) to
 be inter-changeable.  The one exception is how the filename is
 specified for the MPI-IO styles, as explained below.
 
+.. warning::
+
+   The MPIIO package is currently unmaintained and has become
+   unreliable. Use with caution.
+
 The precision of values output to text-based dump files can be
 controlled by the :doc:`dump_modify format <dump_modify>` command and
 its options.
@@ -275,10 +282,11 @@ This bounding box is convenient for many visualization programs.  The
 meaning of the 6 character flags for "xx yy zz" is the same as above.
 
 Note that the first two numbers on each line are now xlo_bound instead
-of xlo, etc, since they represent a bounding box.  See the :doc:`Howto triclinic <Howto_triclinic>` page for a geometric description
-of triclinic boxes, as defined by LAMMPS, simple formulas for how the
-6 bounding box extents (xlo_bound,xhi_bound,etc) are calculated from
-the triclinic parameters, and how to transform those parameters to and
+of xlo, etc, since they represent a bounding box.  See the :doc:`Howto
+triclinic <Howto_triclinic>` page for a geometric description of
+triclinic boxes, as defined by LAMMPS, simple formulas for how the 6
+bounding box extents (xlo_bound,xhi_bound,etc) are calculated from the
+triclinic parameters, and how to transform those parameters to and
 from other commonly used triclinic representations.
 
 The "ITEM: ATOMS" line in each snapshot lists column descriptors for
@@ -310,23 +318,24 @@ written to the dump file.  This local data is typically calculated by
 each processor based on the atoms it owns, but there may be zero or
 more entities per atom, e.g. a list of bond distances.  An explanation
 of the possible dump local attributes is given below.  Note that by
-using input from the :doc:`compute property/local <compute_property_local>` command with dump local,
-it is possible to generate information on bonds, angles, etc that can
-be cut and pasted directly into a data file read by the
-:doc:`read_data <read_data>` command.
+using input from the :doc:`compute property/local
+<compute_property_local>` command with dump local, it is possible to
+generate information on bonds, angles, etc that can be cut and pasted
+directly into a data file read by the :doc:`read_data <read_data>`
+command.
 
 Style *cfg* has the same command syntax as style *custom* and writes
-extended CFG format files, as used by the
-`AtomEye <http://li.mit.edu/Archive/Graphics/A/>`_ visualization
-package.  Since the extended CFG format uses a single snapshot of the
-system per file, a wildcard "\*" must be included in the filename, as
-discussed below.  The list of atom attributes for style *cfg* must
-begin with either "mass type xs ys zs" or "mass type xsu ysu zsu"
-since these quantities are needed to write the CFG files in the
-appropriate format (though the "mass" and "type" fields do not appear
-explicitly in the file).  Any remaining attributes will be stored as
-"auxiliary properties" in the CFG files.  Note that you will typically
-want to use the :doc:`dump_modify element <dump_modify>` command with
+extended CFG format files, as used by the `AtomEye
+<http://li.mit.edu/Archive/Graphics/A/>`_ visualization package.
+Since the extended CFG format uses a single snapshot of the system per
+file, a wildcard "\*" must be included in the filename, as discussed
+below.  The list of atom attributes for style *cfg* must begin with
+either "mass type xs ys zs" or "mass type xsu ysu zsu" since these
+quantities are needed to write the CFG files in the appropriate format
+(though the "mass" and "type" fields do not appear explicitly in the
+file).  Any remaining attributes will be stored as "auxiliary
+properties" in the CFG files.  Note that you will typically want to
+use the :doc:`dump_modify element <dump_modify>` command with
 CFG-formatted files, to associate element names with atom types, so
 that AtomEye can render atoms appropriately. When unwrapped
 coordinates *xsu*, *ysu*, and *zsu* are requested, the nominal AtomEye
@@ -452,6 +461,11 @@ use the :doc:`read_dump <read_dump>` command or perform other
 post-processing, just as if the dump file was not written using
 MPI-IO.
 
+.. warning::
+
+   The MPIIO package is currently unmaintained and has become
+   unreliable. Use with caution.
+
 Note that MPI-IO dump files are one large file which all processors
 write to.  You thus cannot use the "%" wildcard character described
 above in the filename since that specifies generation of multiple
@@ -708,8 +722,9 @@ are part of the MPIIO package.  They are only enabled if LAMMPS was
 built with that package.  See the :doc:`Build package <Build_package>`
 doc page for more info.
 
-The *xtc* style is part of the MISC package.  It is only enabled if
-LAMMPS was built with that package.  See the :doc:`Build package <Build_package>` page for more info.
+The *xtc* and *dcd* styles are part of the EXTRA-DUMP package.  They
+are only enabled if LAMMPS was built with that package.  See the
+:doc:`Build package <Build_package>` page for more info.
 
 Related commands
 """"""""""""""""
diff --git a/doc/src/dump_image.rst b/doc/src/dump_image.rst
index be14a237e5..9b8c7febf4 100644
--- a/doc/src/dump_image.rst
+++ b/doc/src/dump_image.rst
@@ -6,6 +6,8 @@ dump image command
 dump movie command
 ==================
 
+(see below for :ref:`dump_modify options <dump_modify_image>` specific to dump image/movie)
+
 Syntax
 """"""
 
@@ -15,7 +17,7 @@ Syntax
 
 * ID = user-assigned name for the dump
 * group-ID = ID of the group of atoms to be imaged
-* style = *image* or *movie* = style of dump command (other styles *atom* or *cfg* or *dcd* or *xtc* or *xyz* or *local* or *custom* are discussed on the :doc:`dump <dump>` doc page)
+* style = *image* or *movie* = style of dump command (other styles such as *atom* or *cfg* or *dcd* or *xtc* or *xyz* or *local* or *custom* are discussed on the :doc:`dump <dump>` doc page)
 * N = dump every this many timesteps
 * file = name of file to write image to
 * color = atom attribute that determines color of each atom
@@ -79,6 +81,69 @@ Syntax
          seed = random # seed (positive integer)
          dfactor = strength of shading from 0.0 to 1.0
 
+
+.. _dump_modify_image:
+
+dump_modify options for dump image/movie
+========================================
+
+Syntax
+""""""
+
+.. parsed-literal::
+
+   dump_modify dump-ID keyword values ...
+
+* these keywords apply only to the *image* and *movie* styles and are documented on this page
+* keyword = *acolor* or *adiam* or *amap* or *backcolor* or *bcolor* or *bdiam* or *boxcolor* or *color* or *bitrate* or *framerate*
+* see the :doc:`dump modify <dump_modify>` doc page for more general keywords
+
+  .. parsed-literal::
+
+       *acolor* args = type color
+         type = atom type or range of types (see below)
+         color = name of color or color1/color2/...
+       *adiam* args = type diam
+         type = atom type or range of types (see below)
+         diam = diameter of atoms of that type (distance units)
+       *amap* args = lo hi style delta N entry1 entry2 ... entryN
+         lo = number or *min* = lower bound of range of color map
+         hi = number or *max* = upper bound of range of color map
+         style = 2 letters = "c" or "d" or "s" plus "a" or "f"
+           "c" for continuous
+           "d" for discrete
+           "s" for sequential
+           "a" for absolute
+           "f" for fractional
+         delta = binsize (only used for style "s", otherwise ignored)
+           binsize = range is divided into bins of this width
+         N = # of subsequent entries
+         entry = value color (for continuous style)
+           value = number or *min* or *max* = single value within range
+           color = name of color used for that value
+         entry = lo hi color (for discrete style)
+           lo/hi = number or *min* or *max* = lower/upper bound of subset of range
+           color = name of color used for that subset of values
+         entry = color (for sequential style)
+           color = name of color used for a bin of values
+       *backcolor* arg = color
+         color = name of color for background
+       *bcolor* args = type color
+         type = bond type or range of types (see below)
+         color = name of color or color1/color2/...
+       *bdiam* args = type diam
+         type = bond type or range of types (see below)
+         diam = diameter of bonds of that type (distance units)
+       *boxcolor* arg = color
+         color = name of color for simulation box lines and processor sub-domain lines
+       *color* args = name R G B
+         name = name of color
+         R,G,B = red/green/blue numeric values from 0.0 to 1.0
+       *bitrate* arg = rate
+         rate = target bitrate for movie in kbps
+       *framerate* arg = fps
+         fps = frames per second for movie
+
 Examples
 """"""""
 
@@ -91,6 +156,8 @@ Examples
    dump m1 all movie 1000 movie.avi type type size 640 480
    dump m2 all movie 100 movie.m4v type type zoom 1.8 adiam v_value size 1280 720
 
+   dump_modify 1 amap min max cf 0.0 3 min green 0.5 yellow max blue boxcolor red
+
 Description
 """""""""""
 
@@ -145,10 +212,10 @@ is used.
 Similarly, the format of the resulting movie is chosen with the
 *movie* dump style. This is handled by the underlying FFmpeg converter
 and thus details have to be looked up in the `FFmpeg documentation
-<http://ffmpeg.org/ffmpeg.html>`_.
-Typical examples are: .avi, .mpg, .m4v, .mp4, .mkv, .flv, .mov, .gif
-Additional settings of the movie compression like bitrate and
-framerate can be set using the :doc:`dump_modify <dump_modify>` command.
+<http://ffmpeg.org/ffmpeg.html>`_.  Typical examples are: .avi, .mpg,
+.m4v, .mp4, .mkv, .flv, .mov, .gif Additional settings of the movie
+compression like bitrate and framerate can be set using the
+dump_modify command as described below.
 
 To write out JPEG and PNG format files, you must build LAMMPS with
 support for the corresponding JPEG or PNG library. To convert images
@@ -210,19 +277,20 @@ to colors is as follows:
 * type 6 = cyan
 
 and repeats itself for types > 6.  This mapping can be changed by the
-:doc:`dump_modify acolor <dump_modify>` command.
+"dump_modify acolor" command, as described below.
 
 If *type* is specified for the *diameter* setting then the diameter of
 each atom is determined by its atom type.  By default all types have
-diameter 1.0.  This mapping can be changed by the :doc:`dump_modify adiam <dump_modify>` command.
+diameter 1.0.  This mapping can be changed by the "dump_modify adiam"
+command, as described below.
 
 If *element* is specified for the *color* and/or *diameter* setting,
 then the color and/or diameter of each atom is determined by which
 element it is, which in turn is specified by the element-to-type
-mapping specified by the "dump_modify element" command.  By default
-every atom type is C (carbon).  Every element has a color and diameter
-associated with it, which is the same as the colors and sizes used by
-the `AtomEye <atomeye_>`_ visualization package.
+mapping specified by the "dump_modify element" command, as described
+below.  By default every atom type is C (carbon).  Every element has a
+color and diameter associated with it, which is the same as the colors
+and sizes used by the `AtomEye <atomeye_>`_ visualization package.
 
 .. _atomeye: http://li.mit.edu/Archive/Graphics/A/
 
@@ -232,13 +300,13 @@ settings, they are interpreted in the following way.
 If "vx", for example, is used as the *color* setting, then the color
 of the atom will depend on the x-component of its velocity.  The
 association of a per-atom value with a specific color is determined by
-a "color map", which can be specified via the
-:doc:`dump_modify <dump_modify>` command.  The basic idea is that the
-atom-attribute will be within a range of values, and every value
-within the range is mapped to a specific color.  Depending on how the
-color map is defined, that mapping can take place via interpolation so
-that a value of -3.2 is halfway between "red" and "blue", or
-discretely so that the value of -3.2 is "orange".
+a "color map", which can be specified via the dump_modify command, as
+described below.  The basic idea is that the atom-attribute will be
+within a range of values, and every value within the range is mapped
+to a specific color.  Depending on how the color map is defined, that
+mapping can take place via interpolation so that a value of -3.2 is
+halfway between "red" and "blue", or discretely so that the value of
+-3.2 is "orange".
 
 If "vx", for example, is used as the *diameter* setting, then the atom
 will be rendered using the x-component of its velocity as the
@@ -251,9 +319,10 @@ diameter, which can be used as the *diameter* setting.
 
 The various keywords listed above control how the image is rendered.
 As listed below, all of the keywords have defaults, most of which you
-will likely not need to change.  The :doc:`dump modify <dump_modify>`
-also has options specific to the dump image style, particularly for
-assigning colors to atoms, bonds, and other image features.
+will likely not need to change.  As described below, the dump modify
+command also has options specific to the dump image style,
+particularly for assigning colors to atoms, bonds, and other image
+features.
 
 ----------
 
@@ -295,7 +364,7 @@ types to colors is as follows:
 * type 6 = cyan
 
 and repeats itself for bond types > 6.  This mapping can be changed by
-the :doc:`dump_modify bcolor <dump_modify>` command.
+the "dump_modify bcolor" command, as described below.
 
 The bond *width* value can be a numeric value or *atom* or *type* (or
 *none* as indicated above).
@@ -310,7 +379,8 @@ of the 2 atoms in the bond.
 
 If *type* is specified for the *width* value then the diameter of each
 bond is determined by its bond type.  By default all types have
-diameter 0.5.  This mapping can be changed by the :doc:`dump_modify bdiam <dump_modify>` command.
+diameter 0.5.  This mapping can be changed by the "dump_modify bdiam" command,
+as described below.
 
 ----------
 
@@ -330,7 +400,7 @@ mapping of types to colors is as follows:
 * type 6 = cyan
 
 and repeats itself for types > 6.  There is not yet an option to
-change this via the :doc:`dump_modify <dump_modify>` command.
+change this via the dump_modify command.
 
 The line *width* can only be a numeric value, which specifies that all
 lines will be drawn as cylinders with that diameter, e.g. 1.0, which
@@ -357,7 +427,7 @@ default the mapping of types to colors is as follows:
 * type 6 = cyan
 
 and repeats itself for types > 6.  There is not yet an option to
-change this via the :doc:`dump_modify <dump_modify>` command.
+change this via the dump_modify command.
 
 ----------
 
@@ -390,7 +460,7 @@ particle.  By default the mapping of types to colors is as follows:
 * type 6 = cyan
 
 and repeats itself for types > 6.  There is not yet an option to
-change this via the :doc:`dump_modify <dump_modify>` command.
+change this via the dump_modify command.
 
 ----------
 
@@ -414,7 +484,7 @@ the mapping of types to colors is as follows:
 * type 6 = cyan
 
 and repeats itself for types > 6.  There is not yet an option to
-change this via the :doc:`dump_modify <dump_modify>` command.
+change this via the dump_modify command.
 
 ----------
 
@@ -488,7 +558,8 @@ are rendered as thin cylinders in the image.  If *no* is set, then the
 box boundaries are not drawn and the *diam* setting is ignored.  If
 *yes* is set, the 12 edges of the box are drawn, with a diameter that
 is a fraction of the shortest box length in x,y,z (for 3d) or x,y (for
-2d).  The color of the box boundaries can be set with the :doc:`dump_modify boxcolor <dump_modify>` command.
+2d).  The color of the box boundaries can be set with the "dump_modify
+boxcolor" command.
 
 The *axes* keyword determines if and how the coordinate axes are
 rendered as thin cylinders in the image.  If *no* is set, then the
@@ -507,7 +578,8 @@ set (default), then the sub-domain boundaries are not drawn and the
 *diam* setting is ignored.  If *yes* is set, the 12 edges of each
 processor sub-domain are drawn, with a diameter that is a fraction of
 the shortest box length in x,y,z (for 3d) or x,y (for 2d).  The color
-of the sub-domain boundaries can be set with the :doc:`dump_modify boxcolor <dump_modify>` command.
+of the sub-domain boundaries can be set with the "dump_modify
+boxcolor" command.
 
 ----------
 
@@ -607,9 +679,272 @@ Play the movie:
 
 ----------
 
-See the :doc:`Modify <Modify>` page for information on how to add
-new compute and fix styles to LAMMPS to calculate per-atom quantities
-which could then be output into dump files.
+Dump_modify keywords for dump image and dump movie
+""""""""""""""""""""""""""""""""""""""""""""""""""
+
+The following dump_modify keywords apply only to the dump image and
+dump movie styles.  Any keyword that works with dump image also works
+with dump movie, since the movie is simply a collection of images.
+Some of the keywords only affect the dump movie style.  The
+descriptions give details.
+
+----------
+
+The *acolor* keyword can be used with the dump image command, when its
+atom color setting is *type*, to set the color that atoms of each type
+will be drawn in the image.
+
+The specified *type* should be an integer from 1 to Ntypes = the
+number of atom types.  A wildcard asterisk can be used in place of or
+in conjunction with the *type* argument to specify a range of atom
+types.  This takes the form "\*" or "\*n" or "n\*" or "m\*n".  If N =
+the number of atom types, then an asterisk with no numeric values
+means all types from 1 to N.  A leading asterisk means all types from
+1 to n (inclusive).  A trailing asterisk means all types from n to N
+(inclusive).  A middle asterisk means all types from m to n
+(inclusive).
+
+The specified *color* can be a single color which is any of the 140
+pre-defined colors (see below) or a color name defined by the
+"dump_modify color" command, as described below.  Or it can be two or
+more colors separated by a "/" character, e.g. red/green/blue.  In the
+former case, that color is assigned to all the specified atom types.
+In the latter case, the list of colors are assigned in a round-robin
+fashion to each of the specified atom types.
+
+----------
+
+The *adiam* keyword can be used with the dump image command, when its
+atom diameter setting is *type*, to set the size that atoms of each
+type will be drawn in the image.  The specified *type* should be an
+integer from 1 to Ntypes.  As with the *acolor* keyword, a wildcard
+asterisk can be used as part of the *type* argument to specify a range
+of atom types.  The specified *diam* is the size in whatever distance
+:doc:`units <units>` the input script is using, e.g. Angstroms.
+
+----------
+
+The *amap* keyword can be used with the dump image command, with its
+*atom* keyword, when its atom setting is an atom-attribute, to setup a
+color map.  The color map is used to assign a specific RGB
+(red/green/blue) color value to an individual atom when it is drawn,
+based on the atom's attribute, which is a numeric value, e.g. its
+x-component of velocity if the atom-attribute "vx" was specified.
+
+The basic idea of a color map is that the atom-attribute will be
+within a range of values, and that range is associated with a series
+of colors (e.g. red, blue, green).  An atom's specific value (vx =
+-3.2) can then mapped to the series of colors (e.g. halfway between
+red and blue), and a specific color is determined via an interpolation
+procedure.
+
+There are many possible options for the color map, enabled by the
+*amap* keyword.  Here are the details.
+
+The *lo* and *hi* settings determine the range of values allowed for
+the atom attribute.  If numeric values are used for *lo* and/or *hi*,
+then values that are lower/higher than that value are set to the
+value.  I.e. the range is static.  If *lo* is specified as *min* or
+*hi* as *max* then the range is dynamic, and the lower and/or
+upper bound will be calculated each time an image is drawn, based
+on the set of atoms being visualized.
+
+The *style* setting is two letters, such as "ca".  The first letter is
+either "c" for continuous, "d" for discrete, or "s" for sequential.
+The second letter is either "a" for absolute, or "f" for fractional.
+
+A continuous color map is one in which the color changes continuously
+from value to value within the range.  A discrete color map is one in
+which discrete colors are assigned to sub-ranges of values within the
+range.  A sequential color map is one in which discrete colors are
+assigned to a sequence of sub-ranges of values covering the entire
+range.
+
+An absolute color map is one in which the values to which colors are
+assigned are specified explicitly as values within the range.  A
+fractional color map is one in which the values to which colors are
+assigned are specified as a fractional portion of the range.  For
+example if the range is from -10.0 to 10.0, and the color red is to be
+assigned to atoms with a value of 5.0, then for an absolute color map
+the number 5.0 would be used.  But for a fractional map, the number
+0.75 would be used since 5.0 is 3/4 of the way from -10.0 to 10.0.
+
+The *delta* setting must be specified for all styles, but is only used
+for the sequential style; otherwise the value is ignored.  It
+specifies the bin size to use within the range for assigning
+consecutive colors to.  For example, if the range is from -10.0 to
+10.0 and a *delta* of 1.0 is used, then 20 colors will be assigned to
+the range.  The first will be from -10.0 <= color1 < -9.0, then second
+from -9.0 <= color2 < -8.0, etc.
+
+The *N* setting is how many entries follow.  The format of the entries
+depends on whether the color map style is continuous, discrete or
+sequential.  In all cases the *color* setting can be any of the 140
+pre-defined colors (see below) or a color name defined by the
+dump_modify color option.
+
+For continuous color maps, each entry has a *value* and a *color*\ .
+The *value* is either a number within the range of values or *min* or
+*max*\ .  The *value* of the first entry must be *min* and the *value*
+of the last entry must be *max*\ .  Any entries in between must have
+increasing values.  Note that numeric values can be specified either
+as absolute numbers or as fractions (0.0 to 1.0) of the range,
+depending on the "a" or "f" in the style setting for the color map.
+
+Here is how the entries are used to determine the color of an
+individual atom, given the value X of its atom attribute.  X will fall
+between 2 of the entry values.  The color of the atom is linearly
+interpolated (in each of the RGB values) between the 2 colors
+associated with those entries.  For example, if X = -5.0 and the 2
+surrounding entries are "red" at -10.0 and "blue" at 0.0, then the
+atom's color will be halfway between "red" and "blue", which happens
+to be "purple".
+
+For discrete color maps, each entry has a *lo* and *hi* value and a
+*color*\ .  The *lo* and *hi* settings are either numbers within the
+range of values or *lo* can be *min* or *hi* can be *max*\ .  The *lo*
+and *hi* settings of the last entry must be *min* and *max*\ .  Other
+entries can have any *lo* and *hi* values and the sub-ranges of
+different values can overlap.  Note that numeric *lo* and *hi* values
+can be specified either as absolute numbers or as fractions (0.0 to
+1.0) of the range, depending on the "a" or "f" in the style setting
+for the color map.
+
+Here is how the entries are used to determine the color of an
+individual atom, given the value X of its atom attribute.  The entries
+are scanned from first to last.  The first time that *lo* <= X <=
+*hi*, X is assigned the color associated with that entry.  You can
+think of the last entry as assigning a default color (since it will
+always be matched by X), and the earlier entries as colors that
+override the default.  Also note that no interpolation of a color RGB
+is done.  All atoms will be drawn with one of the colors in the list
+of entries.
+
+For sequential color maps, each entry has only a *color*\ .  Here is how
+the entries are used to determine the color of an individual atom,
+given the value X of its atom attribute.  The range is partitioned
+into N bins of width *binsize*\ .  Thus X will fall in a specific bin
+from 1 to N, say the Mth bin.  If it falls on a boundary between 2
+bins, it is considered to be in the higher of the 2 bins.  Each bin is
+assigned a color from the E entries.  If E < N, then the colors are
+repeated.  For example if 2 entries with colors red and green are
+specified, then the odd numbered bins will be red and the even bins
+green.  The color of the atom is the color of its bin.  Note that the
+sequential color map is really a shorthand way of defining a discrete
+color map without having to specify where all the bin boundaries are.
+
+Here is an example of using a sequential color map to color all the
+atoms in individual molecules with a different color.  See the
+examples/pour/in.pour.2d.molecule input script for an example of how
+this is used.
+
+.. code-block:: LAMMPS
+
+   variable        colors string &
+                   "red green blue yellow white &
+                   purple pink orange lime gray"
+   variable        mol atom mol%10
+   dump            1 all image 250 image.*.jpg v_mol type &
+                   zoom 1.6 adiam 1.5
+   dump_modify     1 pad 5 amap 0 10 sa 1 10 ${colors}
+
+In this case, 10 colors are defined, and molecule IDs are
+mapped to one of the colors, even if there are 1000s of molecules.
+
+----------
+
+The *backcolor* sets the background color of the images.  The color
+name can be any of the 140 pre-defined colors (see below) or a color
+name defined by the dump_modify color option.
+
+----------
+
+The *bcolor* keyword can be used with the dump image command, with its
+*bond* keyword, when its color setting is *type*, to set the color
+that bonds of each type will be drawn in the image.
+
+The specified *type* should be an integer from 1 to Nbondtypes = the
+number of bond types.  A wildcard asterisk can be used in place of or
+in conjunction with the *type* argument to specify a range of bond
+types.  This takes the form "\*" or "\*n" or "n\*" or "m\*n".  If N =
+the number of bond types, then an asterisk with no numeric values
+means all types from 1 to N.  A leading asterisk means all types from
+1 to n (inclusive).  A trailing asterisk means all types from n to N
+(inclusive).  A middle asterisk means all types from m to n
+(inclusive).
+
+The specified *color* can be a single color which is any of the 140
+pre-defined colors (see below) or a color name defined by the
+dump_modify color option.  Or it can be two or more colors separated
+by a "/" character, e.g. red/green/blue.  In the former case, that
+color is assigned to all the specified bond types.  In the latter
+case, the list of colors are assigned in a round-robin fashion to each
+of the specified bond types.
+
+----------
+
+The *bdiam* keyword can be used with the dump image command, with its
+*bond* keyword, when its diam setting is *type*, to set the diameter
+that bonds of each type will be drawn in the image.  The specified
+*type* should be an integer from 1 to Nbondtypes.  As with the
+*bcolor* keyword, a wildcard asterisk can be used as part of the
+*type* argument to specify a range of bond types.  The specified
+*diam* is the size in whatever distance :doc:`units <units>` you are
+using, e.g. Angstroms.
+
+----------
+
+The *bitrate* keyword can be used with the :doc:`dump movie
+<dump_image>` command to define the size of the resulting movie file
+and its quality via setting how many kbits per second are to be used
+for the movie file. Higher bitrates require less compression and will
+result in higher quality movies.  The quality is also determined by
+the compression format and encoder.  The default setting is 2000
+kbit/s, which will result in average quality with older compression
+formats.
+
+.. note::
+
+   Not all movie file formats supported by dump movie allow the
+   bitrate to be set.  If not, the setting is silently ignored.
+
+----------
+
+The *boxcolor* keyword sets the color of the simulation box drawn
+around the atoms in each image as well as the color of processor
+sub-domain boundaries.  See the "dump image box" command for how to
+specify that a box be drawn via the *box* keyword, and the sub-domain
+boundaries via the *subbox* keyword.  The color name can be any of the
+140 pre-defined colors (see below) or a color name defined by the
+dump_modify color option.
+
+----------
+
+The *color* keyword allows definition of a new color name, in addition
+to the 140-predefined colors (see below), and associates 3
+red/green/blue RGB values with that color name.  The color name can
+then be used with any other dump_modify keyword that takes a color
+name as a value.  The RGB values should each be floating point values
+between 0.0 and 1.0 inclusive.
+
+When a color name is converted to RGB values, the user-defined color
+names are searched first, then the 140 pre-defined color names.  This
+means you can also use the *color* keyword to overwrite one of the
+pre-defined color names with new RBG values.
+
+----------
+
+The *framerate* keyword can be used with the :doc:`dump movie
+<dump_image>` command to define the duration of the resulting movie
+file.  Movie files written by the dump *movie* command have a default
+frame rate of 24 frames per second and the images generated will be
+converted at that rate.  Thus a sequence of 1000 dump images will
+result in a movie of about 42 seconds.  To make a movie run longer you
+can either generate images more frequently or lower the frame rate.
+To speed a movie up, you can do the inverse.  Using a frame rate
+higher than 24 is not recommended, as it will result in simply
+dropping the rendered images. It is more efficient to dump images less
+frequently.
 
 ----------
 
@@ -664,7 +999,7 @@ Related commands
 Default
 """""""
 
-The defaults for the keywords are as follows:
+The defaults for the dump image and dump movie keywords are as follows:
 
 * adiam = not specified (use diameter setting)
 * atom = yes
@@ -682,3 +1017,101 @@ The defaults for the keywords are as follows:
 * subbox no 0.0
 * shiny = 1.0
 * ssao = no
+
+----------
+
+The defaults for the dump_modify keywords specific to dump image and dump movie are as follows:
+
+* acolor = \* red/green/blue/yellow/aqua/cyan
+* adiam = \* 1.0
+* amap = min max cf 0.0 2 min blue max red
+* backcolor = black
+* bcolor = \* red/green/blue/yellow/aqua/cyan
+* bdiam = \* 0.5
+* bitrate = 2000
+* boxcolor = yellow
+* color = 140 color names are pre-defined as listed below
+* framerate = 24
+
+----------
+
+These are the standard 109 element names that LAMMPS pre-defines for
+use with the dump image and dump_modify commands.
+
+* 1-10 = "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne"
+* 11-20 = "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca"
+* 21-30 = "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn"
+* 31-40 = "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr"
+* 41-50 = "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn"
+* 51-60 = "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd"
+* 61-70 = "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb"
+* 71-80 = "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg"
+* 81-90 = "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th"
+* 91-100 = "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm"
+* 101-109 = "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt"
+
+----------
+
+These are the 140 colors that LAMMPS pre-defines for use with the dump
+image and dump_modify commands.  Additional colors can be defined with
+the dump_modify color command.  The 3 numbers listed for each name are
+the RGB (red/green/blue) values.  Divide each value by 255 to get the
+equivalent 0.0 to 1.0 value.
+
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| aliceblue = 240, 248, 255     | antiquewhite = 250, 235, 215         | aqua = 0, 255, 255              | aquamarine = 127, 255, 212     | azure = 240, 255, 255          |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| beige = 245, 245, 220         | bisque = 255, 228, 196               | black = 0, 0, 0                 | blanchedalmond = 255, 255, 205 | blue = 0, 0, 255               |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| blueviolet = 138, 43, 226     | brown = 165, 42, 42                  | burlywood = 222, 184, 135       | cadetblue = 95, 158, 160       | chartreuse = 127, 255, 0       |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| chocolate = 210, 105, 30      | coral = 255, 127, 80                 | cornflowerblue = 100, 149, 237  | cornsilk = 255, 248, 220       | crimson = 220, 20, 60          |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| cyan = 0, 255, 255            | darkblue = 0, 0, 139                 | darkcyan = 0, 139, 139          | darkgoldenrod = 184, 134, 11   | darkgray = 169, 169, 169       |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| darkgreen = 0, 100, 0         | darkkhaki = 189, 183, 107            | darkmagenta = 139, 0, 139       | darkolivegreen = 85, 107, 47   | darkorange = 255, 140, 0       |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| darkorchid = 153, 50, 204     | darkred = 139, 0, 0                  | darksalmon = 233, 150, 122      | darkseagreen = 143, 188, 143   | darkslateblue = 72, 61, 139    |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| darkslategray = 47, 79, 79    | darkturquoise = 0, 206, 209          | darkviolet = 148, 0, 211        | deeppink = 255, 20, 147        | deepskyblue = 0, 191, 255      |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| dimgray = 105, 105, 105       | dodgerblue = 30, 144, 255            | firebrick = 178, 34, 34         | floralwhite = 255, 250, 240    | forestgreen = 34, 139, 34      |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| fuchsia = 255, 0, 255         | gainsboro = 220, 220, 220            | ghostwhite = 248, 248, 255      | gold = 255, 215, 0             | goldenrod = 218, 165, 32       |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| gray = 128, 128, 128          | green = 0, 128, 0                    | greenyellow = 173, 255, 47      | honeydew = 240, 255, 240       | hotpink = 255, 105, 180        |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| indianred = 205, 92, 92       | indigo = 75, 0, 130                  | ivory = 255, 240, 240           | khaki = 240, 230, 140          | lavender = 230, 230, 250       |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| lavenderblush = 255, 240, 245 | lawngreen = 124, 252, 0              | lemonchiffon = 255, 250, 205    | lightblue = 173, 216, 230      | lightcoral = 240, 128, 128     |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| lightcyan = 224, 255, 255     | lightgoldenrodyellow = 250, 250, 210 | lightgreen = 144, 238, 144      | lightgrey = 211, 211, 211      | lightpink = 255, 182, 193      |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| lightsalmon = 255, 160, 122   | lightseagreen = 32, 178, 170         | lightskyblue = 135, 206, 250    | lightslategray = 119, 136, 153 | lightsteelblue = 176, 196, 222 |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| lightyellow = 255, 255, 224   | lime = 0, 255, 0                     | limegreen = 50, 205, 50         | linen = 250, 240, 230          | magenta = 255, 0, 255          |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| maroon = 128, 0, 0            | mediumaquamarine = 102, 205, 170     | mediumblue = 0, 0, 205          | mediumorchid = 186, 85, 211    | mediumpurple = 147, 112, 219   |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| mediumseagreen = 60, 179, 113 | mediumslateblue = 123, 104, 238      | mediumspringgreen = 0, 250, 154 | mediumturquoise = 72, 209, 204 | mediumvioletred = 199, 21, 133 |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| midnightblue = 25, 25, 112    | mintcream = 245, 255, 250            | mistyrose = 255, 228, 225       | moccasin = 255, 228, 181       | navajowhite = 255, 222, 173    |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| navy = 0, 0, 128              | oldlace = 253, 245, 230              | olive = 128, 128, 0             | olivedrab = 107, 142, 35       | orange = 255, 165, 0           |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| orangered = 255, 69, 0        | orchid = 218, 112, 214               | palegoldenrod = 238, 232, 170   | palegreen = 152, 251, 152      | paleturquoise = 175, 238, 238  |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| palevioletred = 219, 112, 147 | papayawhip = 255, 239, 213           | peachpuff = 255, 239, 213       | peru = 205, 133, 63            | pink = 255, 192, 203           |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| plum = 221, 160, 221          | powderblue = 176, 224, 230           | purple = 128, 0, 128            | red = 255, 0, 0                | rosybrown = 188, 143, 143      |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| royalblue = 65, 105, 225      | saddlebrown = 139, 69, 19            | salmon = 250, 128, 114          | sandybrown = 244, 164, 96      | seagreen = 46, 139, 87         |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| seashell = 255, 245, 238      | sienna = 160, 82, 45                 | silver = 192, 192, 192          | skyblue = 135, 206, 235        | slateblue = 106, 90, 205       |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| slategray = 112, 128, 144     | snow = 255, 250, 250                 | springgreen = 0, 255, 127       | steelblue = 70, 130, 180       | tan = 210, 180, 140            |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| teal = 0, 128, 128            | thistle = 216, 191, 216              | tomato = 253, 99, 71            | turquoise = 64, 224, 208       | violet = 238, 130, 238         |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
+| wheat = 245, 222, 179         | white = 255, 255, 255                | whitesmoke = 245, 245, 245      | yellow = 255, 255, 0           | yellowgreen = 154, 205, 50     |
++-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
diff --git a/doc/src/dump_modify.rst b/doc/src/dump_modify.rst
index 5fea976e70..be75153f6f 100644
--- a/doc/src/dump_modify.rst
+++ b/doc/src/dump_modify.rst
@@ -3,6 +3,9 @@
 dump_modify command
 ===================
 
+:doc:`dump_modify <dump_image>` command for image/movie options
+===============================================================
+
 Syntax
 """"""
 
@@ -12,8 +15,9 @@ Syntax
 
 * dump-ID = ID of dump to modify
 * one or more keyword/value pairs may be appended
+
 * these keywords apply to various dump styles
-* keyword = *append* or *at* or *buffer* or *delay* or *element* or *every* or *fileper* or *first* or *flush* or *format* or *image* or *label* or *maxfiles* or *nfile* or *pad* or *pbc* or *precision* or *region* or *refresh* or *scale* or *sfactor* or *sort* or *tfactor* or *thermo* or *thresh* or *time* or *units* or *unwrap*
+* keyword = *append* or *at* or *buffer* or *delay* or *element* or *every* or *every/time* or *fileper* or *first* or *flush* or *format* or *header* or *image* or *label* or *maxfiles* or *nfile* or *pad* or *pbc* or *precision* or *region* or *refresh* or *scale* or *sfactor* or *sort* or *tfactor* or *thermo* or *thresh* or *time* or *units* or *unwrap*
 
   .. parsed-literal::
 
@@ -28,6 +32,9 @@ Syntax
        *every* arg = N
          N = dump every this many timesteps
          N can be a variable (see below)
+       *every/time* arg = Delta
+         Delta = dump every this interval in simulation time (time units)
+         Delta can be a variable (see below)
        *fileper* arg = Np
          Np = write one file for every this many processors
        *first* arg = *yes* or *no*
@@ -35,6 +42,9 @@ Syntax
        *format* args = *line* string, *int* string, *float* string, M string, or *none*
          string = C-style format string
          M = integer from 1 to N, where N = # of per-atom quantities being output
+       *header* arg = *yes* or *no*
+         *yes* to write the header
+         *no* to not write the header
        *image* arg = *yes* or *no*
        *label* arg = string
          string = character string (e.g. BONDS) to use in header of dump local file
@@ -66,56 +76,11 @@ Syntax
        *unwrap* arg = *yes* or *no*
 
 * these keywords apply only to the *image* and *movie* :doc:`styles <dump_image>`
-* keyword = *acolor* or *adiam* or *amap* or *backcolor* or *bcolor* or *bdiam* or *boxcolor* or *color* or *bitrate* or *framerate* or *header*
+* keyword = *acolor* or *adiam* or *amap* or *backcolor* or *bcolor* or *bdiam* or *boxcolor* or *color* or *bitrate* or *framerate*
 
   .. parsed-literal::
 
-       *acolor* args = type color
-         type = atom type or range of types (see below)
-         color = name of color or color1/color2/...
-       *adiam* args = type diam
-         type = atom type or range of types (see below)
-         diam = diameter of atoms of that type (distance units)
-       *amap* args = lo hi style delta N entry1 entry2 ... entryN
-         lo = number or *min* = lower bound of range of color map
-         hi = number or *max* = upper bound of range of color map
-         style = 2 letters = "c" or "d" or "s" plus "a" or "f"
-           "c" for continuous
-           "d" for discrete
-           "s" for sequential
-           "a" for absolute
-           "f" for fractional
-         delta = binsize (only used for style "s", otherwise ignored)
-           binsize = range is divided into bins of this width
-         N = # of subsequent entries
-         entry = value color (for continuous style)
-           value = number or *min* or *max* = single value within range
-           color = name of color used for that value
-         entry = lo hi color (for discrete style)
-           lo/hi = number or *min* or *max* = lower/upper bound of subset of range
-           color = name of color used for that subset of values
-         entry = color (for sequential style)
-           color = name of color used for a bin of values
-       *backcolor* arg = color
-         color = name of color for background
-       *bcolor* args = type color
-         type = bond type or range of types (see below)
-         color = name of color or color1/color2/...
-       *bdiam* args = type diam
-         type = bond type or range of types (see below)
-         diam = diameter of bonds of that type (distance units)
-       *boxcolor* arg = color
-         color = name of color for simulation box lines and processor sub-domain lines
-       *color* args = name R G B
-         name = name of color
-         R,G,B = red/green/blue numeric values from 0.0 to 1.0
-       *bitrate* arg = rate
-         rate = target bitrate for movie in kbps
-       *framerate* arg = fps
-         fps = frames per second for movie
-       *header* arg = *yes* or *no*
-         *yes* to write the header
-         *no* to not write the header
+       see the :doc:`dump image <dump_image>` doc page for details
 
 * these keywords apply only to the */gz* and */zstd* dump styles
 * keyword = *compression_level*
@@ -126,7 +91,7 @@ Syntax
          level = integer specifying the compression level that should be used (see below for supported levels)
 
 * these keywords apply only to the */zstd* dump styles
-* keyword = *compression_level*
+* keyword = *checksum*
 
   .. parsed-literal::
 
@@ -144,7 +109,6 @@ Examples
    dump_modify xtcdump precision 10000 sfactor 0.1
    dump_modify 1 every 1000 nfile 20
    dump_modify 1 every v_myVar
-   dump_modify 1 amap min max cf 0.0 3 min green 0.5 yellow max blue boxcolor red
 
 Description
 """""""""""
@@ -163,8 +127,9 @@ which allow for use of MPI-IO.
 
 ----------
 
-These keywords apply to various dump styles, including the :doc:`dump image <dump_image>` and :doc:`dump movie <dump_image>` styles.  The
-description gives details.
+Unless otherwise noted, the following keywords apply to all the
+various dump styles, including the :doc:`dump image <dump_image>` and
+:doc:`dump movie <dump_image>` styles.
 
 ----------
 
@@ -235,11 +200,19 @@ will be accepted.
 
 ----------
 
-The *every* keyword changes the dump frequency originally specified by
-the :doc:`dump <dump>` command to a new value.  The every keyword can be
-specified in one of two ways.  It can be a numeric value in which case
-it must be > 0.  Or it can be an :doc:`equal-style variable <variable>`,
-which should be specified as v_name, where name is the variable name.
+The *every* keyword can be used with any dump style except the *dcd*
+and *xtc* styles.  It does two things.  It specifies that the interval
+between dump snapshots will be set in timesteps, which is the default
+if the *every* or *every/time* keywords are not used.  See the
+*every/time* keyword for how to specify the interval in simulation
+time, i.e. in time units of the :doc:`units <units>` command.  The
+*every* keyword also sets the interval value, which overrides the dump
+frequency originally specified by the :doc:`dump <dump>` command.
+
+The *every* keyword can be specified in one of two ways.  It can be a
+numeric value in which case it must be > 0.  Or it can be an
+:doc:`equal-style variable <variable>`, which should be specified as
+v_name, where name is the variable name.
 
 In this case, the variable is evaluated at the beginning of a run to
 determine the next timestep at which a dump snapshot will be written
@@ -248,11 +221,12 @@ determine the next timestep, etc.  Thus the variable should return
 timestep values.  See the stagger() and logfreq() and stride() math
 functions for :doc:`equal-style variables <variable>`, as examples of
 useful functions to use in this context.  Other similar math functions
-could easily be added as options for :doc:`equal-style variables <variable>`.  Also see the next() function, which allows
-use of a file-style variable which reads successive values from a
-file, each time the variable is evaluated.  Used with the *every*
-keyword, if the file contains a list of ascending timesteps, you can
-output snapshots whenever you wish.
+could easily be added as options for :doc:`equal-style variables
+<variable>`.  Also see the next() function, which allows use of a
+file-style variable which reads successive values from a file, each
+time the variable is evaluated.  Used with the *every* keyword, if the
+file contains a list of ascending timesteps, you can output snapshots
+whenever you wish.
 
 Note that when using the variable option with the *every* keyword, you
 need to use the *first* option if you want an initial snapshot written
@@ -293,14 +267,103 @@ in file tmp.times:
 
 ----------
 
+The *every/time* keyword can be used with any dump style except the
+*dcd* and *xtc* styles.  It does two things.  It specifies that the
+interval between dump snapshots will be set in simulation time,
+i.e. in time units of the :doc:`units <units>` command.  This can be
+useful when the timestep size varies during a simulation run, e.g. by
+use of the :doc:`fix dt/reset <fix_dt_reset>` command.  The default is
+to specify the interval in timesteps; see the *every* keyword.  The
+*every/time* command also sets the interval value.
+
+.. note::
+
+   If you wish dump styles *atom*, *custom*, *local*, or *xyz* to
+   include the simulation time as a field in the header portion of
+   each snapshot, you also need to use the dump_modify *time* keyword
+   with a setting of *yes*.  See its documentation below.
+
+Note that since snapshots are output on simulation steps, each
+snapshot will be written on the first timestep whose associated
+simulation time is >= the exact snapshot time value.
+
+As with the *every* option, the *Delta* value can be specified in one
+of two ways.  It can be a numeric value in which case it must be >
+0.0.  Or it can be an :doc:`equal-style variable <variable>`, which
+should be specified as v_name, where name is the variable name.
+
+In this case, the variable is evaluated at the beginning of a run to
+determine the next simulation time at which a dump snapshot will be
+written out.  On that timestep the variable will be evaluated again to
+determine the next simulation time, etc.  Thus the variable should
+return values in time units.  Note the current timestep or simulation
+time can be used in an :doc:`equal-style variables <variable>` since
+they are both thermodynamic keywords.  Also see the next() function,
+which allows use of a file-style variable which reads successive
+values from a file, each time the variable is evaluated.  Used with
+the *every/time* keyword, if the file contains a list of ascending
+simulation times, you can output snapshots whenever you wish.
+
+Note that when using the variable option with the *every/time*
+keyword, you need to use the *first* option if you want an initial
+snapshot written to the dump file.  The *every/time* keyword cannot be
+used with the dump *dcd* style.
+
+For example, the following commands will write snapshots at successive
+simulation times which grow by a factor of 1.5 with each interval.
+The dt value used in the variable is to avoid a zero result when the
+initial simulation time is 0.0.
+
+.. code-block:: LAMMPS
+
+   variable        increase equal 1.5*(time+dt)
+   dump            1 all atom 100 tmp.dump
+   dump_modify     1 every/time v_increase first yes
+
+The following commands would write snapshots at the times listed in
+file tmp.times:
+
+.. code-block:: LAMMPS
+
+   variable        f file tmp.times
+   variable        s equal next(f)
+   dump            1 all atom 100 tmp.dump
+   dump_modify     1 every/time v_s
+
+.. note::
+
+   When using a file-style variable with the *every/time* keyword, the
+   file of timesteps must list a first time that is beyond the time
+   associated with the current timestep (e.g. it cannot be 0.0).  And
+   it must list one or more times beyond the length of the run you
+   perform.  This is because the dump command will generate an error
+   if the next time it reads from the file is not a value greater than
+   the current time.  Thus if you wanted output at times 0,15,100 of a
+   run of length 100 in simulation time, the file should contain the
+   values 15,100,101 and you should also use the dump_modify first
+   command.  Any final value > 100 could be used in place of 101.
+
+----------
+
 The *first* keyword determines whether a dump snapshot is written on
 the very first timestep after the dump command is invoked.  This will
-always occur if the current timestep is a multiple of N, the frequency
-specified in the :doc:`dump <dump>` command, including timestep 0.  But
-if this is not the case, a dump snapshot will only be written if the
-setting of this keyword is *yes*\ .  If it is *no*, which is the
+always occur if the current timestep is a multiple of $N$, the
+frequency specified in the :doc:`dump <dump>` command or
+:doc:`dump_modify every <dump_modify>` command, including timestep 0.
+It will also always occur if the current simulation time is a multiple
+of *Delta*, the time interval specified in the doc:`dump_modify
+every/time <dump_modify>` command.
+
+But if this is not the case, a dump snapshot will only be written if
+the setting of this keyword is *yes*\ .  If it is *no*, which is the
 default, then it will not be written.
 
+Note that if the argument to the :doc:`dump_modify every
+<dump_modify>` doc:`dump_modify every/time <dump_modify>` commands is
+a variable and not a numeric value, then specifying *first yes* is the
+only way to write a dump snapshot on the first timestep after the dump
+command is invoked.
+
 ----------
 
 The *flush* keyword determines whether a flush operation is invoked
@@ -380,6 +443,13 @@ The *fileper* keyword is documented below with the *nfile* keyword.
 
 ----------
 
+The *header* keyword toggles whether the dump file will include a
+header.  Excluding a header will reduce the size of the dump file for
+fixes such as :doc:`fix pair/tracker <fix_pair_tracker>` which do not
+require the information typically written to the header.
+
+----------
+
 The *image* keyword applies only to the dump *atom* style.  If the
 image value is *yes*, 3 flags are appended to each atom's coords which
 are the absolute box image of the atom in each dimension.  For
@@ -592,7 +662,9 @@ The dump *local* style cannot be sorted by atom ID, since there are
 typically multiple lines of output per atom.  Some dump styles, such
 as *dcd* and *xtc*, require sorting by atom ID to format the output
 file correctly.  If multiple processors are writing the dump file, via
-the "%" wildcard in the dump filename, then sorting cannot be
+the "%" wildcard in the dump filename and the *nfile* or *fileper*
+keywords are set to non-default values (i.e. the number of dump file
+pieces is not equal to the number of procs), then sorting cannot be
 performed.
 
 .. note::
@@ -670,16 +742,20 @@ threshold criterion is met.  Otherwise it is not met.
 
 ----------
 
-The *time* keyword only applies to the dump *atom*, *custom*, and
-*local* styles (and their COMPRESS package versions *atom/gz*,
-*custom/gz* and *local/gz*\ ). If set to *yes*, each frame will will
-contain two extra lines before the "ITEM: TIMESTEP" entry:
+The *time* keyword only applies to the dump *atom*, *custom*, *local*,
+and *xyz* styles (and their COMPRESS package versions *atom/gz*,
+*custom/gz* and *local/gz*\ ).  For the first 3 styles, if set to
+*yes*, each frame will will contain two extra lines before the "ITEM:
+TIMESTEP" entry:
 
 .. parsed-literal::
 
    ITEM: TIME
    \<elapsed time\>
 
+For the *xyz* style, the simulation time is included on the same line
+as the timestep value.
+
 This will output the current elapsed simulation time in current
 time units equivalent to the :doc:`thermo keyword <thermo_style>` *time*\ .
 This is to simplify post-processing of trajectories using a variable time
@@ -715,303 +791,35 @@ box size stored with the snapshot.
 
 ----------
 
-These keywords apply only to the :doc:`dump image <dump_image>` and
-:doc:`dump movie <dump_image>` styles.  Any keyword that affects an
-image, also affects a movie, since the movie is simply a collection of
-images.  Some of the keywords only affect the :doc:`dump movie <dump_image>` style.  The descriptions give details.
+The COMPRESS package offers both GZ and Zstd compression variants of
+styles atom, custom, local, cfg, and xyz. When using these styles the
+compression level can be controlled by the :code:`compression_level`
+keyword. File names with these styles have to end in either
+:code:`.gz` or :code:`.zst`.
 
-----------
-
-The *acolor* keyword can be used with the :doc:`dump image <dump_image>`
-command, when its atom color setting is *type*, to set the color that
-atoms of each type will be drawn in the image.
-
-The specified *type* should be an integer from 1 to Ntypes = the
-number of atom types.  A wildcard asterisk can be used in place of or
-in conjunction with the *type* argument to specify a range of atom
-types.  This takes the form "\*" or "\*n" or "n\*" or "m\*n".  If N = the
-number of atom types, then an asterisk with no numeric values means
-all types from 1 to N.  A leading asterisk means all types from 1 to n
-(inclusive).  A trailing asterisk means all types from n to N
-(inclusive).  A middle asterisk means all types from m to n
-(inclusive).
-
-The specified *color* can be a single color which is any of the 140
-pre-defined colors (see below) or a color name defined by the
-dump_modify color option.  Or it can be two or more colors separated
-by a "/" character, e.g. red/green/blue.  In the former case, that
-color is assigned to all the specified atom types.  In the latter
-case, the list of colors are assigned in a round-robin fashion to each
-of the specified atom types.
-
-----------
-
-The *adiam* keyword can be used with the :doc:`dump image <dump_image>`
-command, when its atom diameter setting is *type*, to set the size
-that atoms of each type will be drawn in the image.  The specified
-*type* should be an integer from 1 to Ntypes.  As with the *acolor*
-keyword, a wildcard asterisk can be used as part of the *type*
-argument to specify a range of atom types.  The specified *diam* is
-the size in whatever distance :doc:`units <units>` the input script is
-using, e.g. Angstroms.
-
-----------
-
-The *amap* keyword can be used with the :doc:`dump image <dump_image>`
-command, with its *atom* keyword, when its atom setting is an
-atom-attribute, to setup a color map.  The color map is used to assign
-a specific RGB (red/green/blue) color value to an individual atom when
-it is drawn, based on the atom's attribute, which is a numeric value,
-e.g. its x-component of velocity if the atom-attribute "vx" was
-specified.
-
-The basic idea of a color map is that the atom-attribute will be
-within a range of values, and that range is associated with a series
-of colors (e.g. red, blue, green).  An atom's specific value (vx =
--3.2) can then mapped to the series of colors (e.g. halfway between
-red and blue), and a specific color is determined via an interpolation
-procedure.
-
-There are many possible options for the color map, enabled by the
-*amap* keyword.  Here are the details.
-
-The *lo* and *hi* settings determine the range of values allowed for
-the atom attribute.  If numeric values are used for *lo* and/or *hi*,
-then values that are lower/higher than that value are set to the
-value.  I.e. the range is static.  If *lo* is specified as *min* or
-*hi* as *max* then the range is dynamic, and the lower and/or
-upper bound will be calculated each time an image is drawn, based
-on the set of atoms being visualized.
-
-The *style* setting is two letters, such as "ca".  The first letter is
-either "c" for continuous, "d" for discrete, or "s" for sequential.
-The second letter is either "a" for absolute, or "f" for fractional.
-
-A continuous color map is one in which the color changes continuously
-from value to value within the range.  A discrete color map is one in
-which discrete colors are assigned to sub-ranges of values within the
-range.  A sequential color map is one in which discrete colors are
-assigned to a sequence of sub-ranges of values covering the entire
-range.
-
-An absolute color map is one in which the values to which colors are
-assigned are specified explicitly as values within the range.  A
-fractional color map is one in which the values to which colors are
-assigned are specified as a fractional portion of the range.  For
-example if the range is from -10.0 to 10.0, and the color red is to be
-assigned to atoms with a value of 5.0, then for an absolute color map
-the number 5.0 would be used.  But for a fractional map, the number
-0.75 would be used since 5.0 is 3/4 of the way from -10.0 to 10.0.
-
-The *delta* setting must be specified for all styles, but is only used
-for the sequential style; otherwise the value is ignored.  It
-specifies the bin size to use within the range for assigning
-consecutive colors to.  For example, if the range is from -10.0 to
-10.0 and a *delta* of 1.0 is used, then 20 colors will be assigned to
-the range.  The first will be from -10.0 <= color1 < -9.0, then second
-from -9.0 <= color2 < -8.0, etc.
-
-The *N* setting is how many entries follow.  The format of the entries
-depends on whether the color map style is continuous, discrete or
-sequential.  In all cases the *color* setting can be any of the 140
-pre-defined colors (see below) or a color name defined by the
-dump_modify color option.
-
-For continuous color maps, each entry has a *value* and a *color*\ .
-The *value* is either a number within the range of values or *min* or
-*max*\ .  The *value* of the first entry must be *min* and the *value*
-of the last entry must be *max*\ .  Any entries in between must have
-increasing values.  Note that numeric values can be specified either
-as absolute numbers or as fractions (0.0 to 1.0) of the range,
-depending on the "a" or "f" in the style setting for the color map.
-
-Here is how the entries are used to determine the color of an
-individual atom, given the value X of its atom attribute.  X will fall
-between 2 of the entry values.  The color of the atom is linearly
-interpolated (in each of the RGB values) between the 2 colors
-associated with those entries.  For example, if X = -5.0 and the 2
-surrounding entries are "red" at -10.0 and "blue" at 0.0, then the
-atom's color will be halfway between "red" and "blue", which happens
-to be "purple".
-
-For discrete color maps, each entry has a *lo* and *hi* value and a
-*color*\ .  The *lo* and *hi* settings are either numbers within the
-range of values or *lo* can be *min* or *hi* can be *max*\ .  The *lo*
-and *hi* settings of the last entry must be *min* and *max*\ .  Other
-entries can have any *lo* and *hi* values and the sub-ranges of
-different values can overlap.  Note that numeric *lo* and *hi* values
-can be specified either as absolute numbers or as fractions (0.0 to
-1.0) of the range, depending on the "a" or "f" in the style setting
-for the color map.
-
-Here is how the entries are used to determine the color of an
-individual atom, given the value X of its atom attribute.  The entries
-are scanned from first to last.  The first time that *lo* <= X <=
-*hi*, X is assigned the color associated with that entry.  You can
-think of the last entry as assigning a default color (since it will
-always be matched by X), and the earlier entries as colors that
-override the default.  Also note that no interpolation of a color RGB
-is done.  All atoms will be drawn with one of the colors in the list
-of entries.
-
-For sequential color maps, each entry has only a *color*\ .  Here is how
-the entries are used to determine the color of an individual atom,
-given the value X of its atom attribute.  The range is partitioned
-into N bins of width *binsize*\ .  Thus X will fall in a specific bin
-from 1 to N, say the Mth bin.  If it falls on a boundary between 2
-bins, it is considered to be in the higher of the 2 bins.  Each bin is
-assigned a color from the E entries.  If E < N, then the colors are
-repeated.  For example if 2 entries with colors red and green are
-specified, then the odd numbered bins will be red and the even bins
-green.  The color of the atom is the color of its bin.  Note that the
-sequential color map is really a shorthand way of defining a discrete
-color map without having to specify where all the bin boundaries are.
-
-Here is an example of using a sequential color map to color all the
-atoms in individual molecules with a different color.  See the
-examples/pour/in.pour.2d.molecule input script for an example of how
-this is used.
-
-.. code-block:: LAMMPS
-
-   variable        colors string &
-                   "red green blue yellow white &
-                   purple pink orange lime gray"
-   variable        mol atom mol%10
-   dump            1 all image 250 image.*.jpg v_mol type &
-                   zoom 1.6 adiam 1.5
-   dump_modify     1 pad 5 amap 0 10 sa 1 10 ${colors}
-
-In this case, 10 colors are defined, and molecule IDs are
-mapped to one of the colors, even if there are 1000s of molecules.
-
-----------
-
-The *backcolor* sets the background color of the images.  The color
-name can be any of the 140 pre-defined colors (see below) or a color
-name defined by the dump_modify color option.
-
-----------
-
-The *bcolor* keyword can be used with the :doc:`dump image <dump_image>`
-command, with its *bond* keyword, when its color setting is *type*, to
-set the color that bonds of each type will be drawn in the image.
-
-The specified *type* should be an integer from 1 to Nbondtypes = the
-number of bond types.  A wildcard asterisk can be used in place of or
-in conjunction with the *type* argument to specify a range of bond
-types.  This takes the form "\*" or "\*n" or "n\*" or "m\*n".  If N = the
-number of bond types, then an asterisk with no numeric values means
-all types from 1 to N.  A leading asterisk means all types from 1 to n
-(inclusive).  A trailing asterisk means all types from n to N
-(inclusive).  A middle asterisk means all types from m to n
-(inclusive).
-
-The specified *color* can be a single color which is any of the 140
-pre-defined colors (see below) or a color name defined by the
-dump_modify color option.  Or it can be two or more colors separated
-by a "/" character, e.g. red/green/blue.  In the former case, that
-color is assigned to all the specified bond types.  In the latter
-case, the list of colors are assigned in a round-robin fashion to each
-of the specified bond types.
-
-----------
-
-The *bdiam* keyword can be used with the :doc:`dump image <dump_image>`
-command, with its *bond* keyword, when its diam setting is *type*, to
-set the diameter that bonds of each type will be drawn in the image.
-The specified *type* should be an integer from 1 to Nbondtypes.  As
-with the *bcolor* keyword, a wildcard asterisk can be used as part of
-the *type* argument to specify a range of bond types.  The specified
-*diam* is the size in whatever distance :doc:`units <units>` you are
-using, e.g. Angstroms.
-
-----------
-
-The *bitrate* keyword can be used with the :doc:`dump movie <dump_image>` command to define the size of the resulting
-movie file and its quality via setting how many kbits per second are
-to be used for the movie file. Higher bitrates require less
-compression and will result in higher quality movies.  The quality is
-also determined by the compression format and encoder.  The default
-setting is 2000 kbit/s, which will result in average quality with
-older compression formats.
-
-.. note::
-
-   Not all movie file formats supported by dump movie allow the
-   bitrate to be set.  If not, the setting is silently ignored.
-
-----------
-
-The *boxcolor* keyword sets the color of the simulation box drawn
-around the atoms in each image as well as the color of processor
-sub-domain boundaries.  See the "dump image box" command for how to
-specify that a box be drawn via the *box* keyword, and the sub-domain
-boundaries via the *subbox* keyword.  The color name can be any of the
-140 pre-defined colors (see below) or a color name defined by the
-dump_modify color option.
-
-----------
-
-The *color* keyword allows definition of a new color name, in addition
-to the 140-predefined colors (see below), and associates 3
-red/green/blue RGB values with that color name.  The color name can
-then be used with any other dump_modify keyword that takes a color
-name as a value.  The RGB values should each be floating point values
-between 0.0 and 1.0 inclusive.
-
-When a color name is converted to RGB values, the user-defined color
-names are searched first, then the 140 pre-defined color names.  This
-means you can also use the *color* keyword to overwrite one of the
-pre-defined color names with new RBG values.
-
-----------
-
-The *framerate* keyword can be used with the :doc:`dump movie <dump_image>` command to define the duration of the resulting
-movie file.  Movie files written by the dump *movie* command have a
-default frame rate of 24 frames per second and the images generated
-will be converted at that rate.  Thus a sequence of 1000 dump images
-will result in a movie of about 42 seconds.  To make a movie run
-longer you can either generate images more frequently or lower the
-frame rate.  To speed a movie up, you can do the inverse.  Using a
-frame rate higher than 24 is not recommended, as it will result in
-simply dropping the rendered images. It is more efficient to dump
-images less frequently.
-
-----------
-
-The *header* keyword toggles whether the dump file will include a header.
-Excluding a header will reduce the size of the dump file for fixes such as
-:doc:`fix pair/tracker <fix_pair_tracker>` which do not require the information
-typically written to the header.
-
-----------
-
-The COMPRESS package offers both GZ and Zstd compression variants of styles
-atom, custom, local, cfg, and xyz. When using these styles the compression
-level can be controlled by the :code:`compression_level` parameter. File names
-with these styles have to end in either :code:`.gz` or :code:`.zst`.
-
-GZ supports compression levels from -1 (default), 0 (no compression), and 1 to
-9. 9 being the best compression. The COMPRESS :code:`/gz` styles use 9 as
-default compression level.
+GZ supports compression levels from -1 (default), 0 (no compression),
+and 1 to
+9. 9 being the best compression. The COMPRESS :code:`/gz` styles use 9
+as default compression level.
 
 Zstd offers a wider range of compression levels, including negative
-levels that sacrifice compression for performance. 0 is the
-default, positive levels are 1 to 22, with 22 being the most expensive
+levels that sacrifice compression for performance. 0 is the default,
+positive levels are 1 to 22, with 22 being the most expensive
 compression. Zstd promises higher compression/decompression speeds for
 similar compression ratios. For more details see
 `http://facebook.github.io/zstd/`.
 
-In addition, Zstd compressed files can have a checksum of the entire
-contents. The Zstd enabled dump styles enable this feature by default and it
-can be disabled with the :code:`checksum` parameter.
+In addition, Zstd compressed files can include a checksum of the
+entire contents. The Zstd enabled dump styles enable this feature by
+default and it can be disabled with the :code:`checksum` keyword.
 
 ----------
 
 Restrictions
 """"""""""""
- none
+
+Not all *dump_modify* options can be applied to all dump styles.
+Details are in the discussions of the individual options.
 
 Related commands
 """"""""""""""""
@@ -1046,100 +854,7 @@ The option defaults are
 * units = no
 * unwrap = no
 
-* acolor = \* red/green/blue/yellow/aqua/cyan
-* adiam = \* 1.0
-* amap = min max cf 0.0 2 min blue max red
-* backcolor = black
-* bcolor = \* red/green/blue/yellow/aqua/cyan
-* bdiam = \* 0.5
-* bitrate = 2000
-* boxcolor = yellow
-* color = 140 color names are pre-defined as listed below
-* framerate = 24
-
 * compression_level = 9 (gz variants)
 * compression_level = 0 (zstd variants)
 * checksum = yes (zstd variants)
 
-----------
-
-These are the standard 109 element names that LAMMPS pre-defines for
-use with the :doc:`dump image <dump_image>` and dump_modify commands.
-
-* 1-10 = "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne"
-* 11-20 = "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca"
-* 21-30 = "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn"
-* 31-40 = "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr"
-* 41-50 = "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn"
-* 51-60 = "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd"
-* 61-70 = "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb"
-* 71-80 = "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg"
-* 81-90 = "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th"
-* 91-100 = "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm"
-* 101-109 = "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt"
-
-----------
-
-These are the 140 colors that LAMMPS pre-defines for use with the
-:doc:`dump image <dump_image>` and dump_modify commands.  Additional
-colors can be defined with the dump_modify color command.  The 3
-numbers listed for each name are the RGB (red/green/blue) values.
-Divide each value by 255 to get the equivalent 0.0 to 1.0 value.
-
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| aliceblue = 240, 248, 255     | antiquewhite = 250, 235, 215         | aqua = 0, 255, 255              | aquamarine = 127, 255, 212     | azure = 240, 255, 255          |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| beige = 245, 245, 220         | bisque = 255, 228, 196               | black = 0, 0, 0                 | blanchedalmond = 255, 255, 205 | blue = 0, 0, 255               |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| blueviolet = 138, 43, 226     | brown = 165, 42, 42                  | burlywood = 222, 184, 135       | cadetblue = 95, 158, 160       | chartreuse = 127, 255, 0       |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| chocolate = 210, 105, 30      | coral = 255, 127, 80                 | cornflowerblue = 100, 149, 237  | cornsilk = 255, 248, 220       | crimson = 220, 20, 60          |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| cyan = 0, 255, 255            | darkblue = 0, 0, 139                 | darkcyan = 0, 139, 139          | darkgoldenrod = 184, 134, 11   | darkgray = 169, 169, 169       |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| darkgreen = 0, 100, 0         | darkkhaki = 189, 183, 107            | darkmagenta = 139, 0, 139       | darkolivegreen = 85, 107, 47   | darkorange = 255, 140, 0       |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| darkorchid = 153, 50, 204     | darkred = 139, 0, 0                  | darksalmon = 233, 150, 122      | darkseagreen = 143, 188, 143   | darkslateblue = 72, 61, 139    |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| darkslategray = 47, 79, 79    | darkturquoise = 0, 206, 209          | darkviolet = 148, 0, 211        | deeppink = 255, 20, 147        | deepskyblue = 0, 191, 255      |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| dimgray = 105, 105, 105       | dodgerblue = 30, 144, 255            | firebrick = 178, 34, 34         | floralwhite = 255, 250, 240    | forestgreen = 34, 139, 34      |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| fuchsia = 255, 0, 255         | gainsboro = 220, 220, 220            | ghostwhite = 248, 248, 255      | gold = 255, 215, 0             | goldenrod = 218, 165, 32       |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| gray = 128, 128, 128          | green = 0, 128, 0                    | greenyellow = 173, 255, 47      | honeydew = 240, 255, 240       | hotpink = 255, 105, 180        |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| indianred = 205, 92, 92       | indigo = 75, 0, 130                  | ivory = 255, 240, 240           | khaki = 240, 230, 140          | lavender = 230, 230, 250       |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| lavenderblush = 255, 240, 245 | lawngreen = 124, 252, 0              | lemonchiffon = 255, 250, 205    | lightblue = 173, 216, 230      | lightcoral = 240, 128, 128     |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| lightcyan = 224, 255, 255     | lightgoldenrodyellow = 250, 250, 210 | lightgreen = 144, 238, 144      | lightgrey = 211, 211, 211      | lightpink = 255, 182, 193      |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| lightsalmon = 255, 160, 122   | lightseagreen = 32, 178, 170         | lightskyblue = 135, 206, 250    | lightslategray = 119, 136, 153 | lightsteelblue = 176, 196, 222 |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| lightyellow = 255, 255, 224   | lime = 0, 255, 0                     | limegreen = 50, 205, 50         | linen = 250, 240, 230          | magenta = 255, 0, 255          |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| maroon = 128, 0, 0            | mediumaquamarine = 102, 205, 170     | mediumblue = 0, 0, 205          | mediumorchid = 186, 85, 211    | mediumpurple = 147, 112, 219   |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| mediumseagreen = 60, 179, 113 | mediumslateblue = 123, 104, 238      | mediumspringgreen = 0, 250, 154 | mediumturquoise = 72, 209, 204 | mediumvioletred = 199, 21, 133 |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| midnightblue = 25, 25, 112    | mintcream = 245, 255, 250            | mistyrose = 255, 228, 225       | moccasin = 255, 228, 181       | navajowhite = 255, 222, 173    |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| navy = 0, 0, 128              | oldlace = 253, 245, 230              | olive = 128, 128, 0             | olivedrab = 107, 142, 35       | orange = 255, 165, 0           |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| orangered = 255, 69, 0        | orchid = 218, 112, 214               | palegoldenrod = 238, 232, 170   | palegreen = 152, 251, 152      | paleturquoise = 175, 238, 238  |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| palevioletred = 219, 112, 147 | papayawhip = 255, 239, 213           | peachpuff = 255, 239, 213       | peru = 205, 133, 63            | pink = 255, 192, 203           |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| plum = 221, 160, 221          | powderblue = 176, 224, 230           | purple = 128, 0, 128            | red = 255, 0, 0                | rosybrown = 188, 143, 143      |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| royalblue = 65, 105, 225      | saddlebrown = 139, 69, 19            | salmon = 250, 128, 114          | sandybrown = 244, 164, 96      | seagreen = 46, 139, 87         |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| seashell = 255, 245, 238      | sienna = 160, 82, 45                 | silver = 192, 192, 192          | skyblue = 135, 206, 235        | slateblue = 106, 90, 205       |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| slategray = 112, 128, 144     | snow = 255, 250, 250                 | springgreen = 0, 255, 127       | steelblue = 70, 130, 180       | tan = 210, 180, 140            |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| teal = 0, 128, 128            | thistle = 216, 191, 216              | tomato = 253, 99, 71            | turquoise = 64, 224, 208       | violet = 238, 130, 238         |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
-| wheat = 245, 222, 179         | white = 255, 255, 255                | whitesmoke = 245, 245, 245      | yellow = 255, 255, 0           | yellowgreen = 154, 205, 50     |
-+-------------------------------+--------------------------------------+---------------------------------+--------------------------------+--------------------------------+
diff --git a/doc/src/fix_addtorque.rst b/doc/src/fix_addtorque.rst
index 4e1ca12228..73af4ae571 100644
--- a/doc/src/fix_addtorque.rst
+++ b/doc/src/fix_addtorque.rst
@@ -99,7 +99,7 @@ invoked by the :doc:`minimize <minimize>` command.
 Restrictions
 """"""""""""
 
-This fix is part of the MISC package.  It is only enabled if
+This fix is part of the EXTRA-FIX package.  It is only enabled if
 LAMMPS was built with that package.  See the :doc:`Build package
 <Build_package>` page for more info.
 
diff --git a/doc/src/fix_dt_reset.rst b/doc/src/fix_dt_reset.rst
index c3aa431e18..368a3dcd70 100644
--- a/doc/src/fix_dt_reset.rst
+++ b/doc/src/fix_dt_reset.rst
@@ -78,13 +78,20 @@ outer loop (largest) timestep, which is the same timestep that the
 
 Note that the cumulative simulation time (in time units), which
 accounts for changes in the timestep size as a simulation proceeds,
-can be accessed by the :doc:`thermo_style time <thermo_style>` keyword.
+can be accessed by the :doc:`thermo_style time <thermo_style>`
+keyword.
+
+Also note that the :doc:`dump_modify every/time <dump_modify>` option
+allows dump files to be written at intervals specified by simulation
+time, rather than by timesteps.  Simulation time is in time units;
+see the :doc:`units <units>` doc page for details.
 
 Restart, fix_modify, output, run start/stop, minimize info
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 
-No information about this fix is written to :doc:`binary restart files <restart>`.  None of the :doc:`fix_modify <fix_modify>` options
-are relevant to this fix.
+No information about this fix is written to :doc:`binary restart files
+<restart>`.  None of the :doc:`fix_modify <fix_modify>` options are
+relevant to this fix.
 
 This fix computes a global scalar which can be accessed by various
 :doc:`output commands <Howto_output>`.  The scalar stores the last
@@ -93,7 +100,8 @@ timestep on which the timestep was reset to a new value.
 The scalar value calculated by this fix is "intensive".
 
 No parameter of this fix can be used with the *start/stop* keywords of
-the :doc:`run <run>` command.  This fix is not invoked during :doc:`energy minimization <minimize>`.
+the :doc:`run <run>` command.  This fix is not invoked during
+:doc:`energy minimization <minimize>`.
 
 Restrictions
 """"""""""""
@@ -102,7 +110,7 @@ Restrictions
 Related commands
 """"""""""""""""
 
-:doc:`timestep <timestep>`
+:doc:`timestep <timestep>`, :doc:`dump_modify every/time <dump_modify>`
 
 Default
 """""""
diff --git a/doc/src/fix_langevin_drude.rst b/doc/src/fix_langevin_drude.rst
index 89ea28cf08..5e62e4f416 100644
--- a/doc/src/fix_langevin_drude.rst
+++ b/doc/src/fix_langevin_drude.rst
@@ -40,7 +40,7 @@ Example input scripts available: examples/PACKAGES/drude
 Description
 """""""""""
 
-Apply two Langevin thermostats as described in :ref:`(Jiang) <Jiang1>` for
+Apply two Langevin thermostats as described in :ref:`(Jiang1) <Jiang1>` for
 thermalizing the reduced degrees of freedom of Drude oscillators.
 This link describes how to use the :doc:`thermalized Drude oscillator model <Howto_drude>` in LAMMPS and polarizable models in LAMMPS
 are discussed on the :doc:`Howto polarizable <Howto_polarizable>` doc
@@ -300,5 +300,5 @@ The option defaults are zero = no.
 
 .. _Jiang1:
 
-**(Jiang)** Jiang, Hardy, Phillips, MacKerell, Schulten, and Roux, J
+**(Jiang1)** Jiang, Hardy, Phillips, MacKerell, Schulten, and Roux, J
 Phys Chem Lett, 2, 87-92 (2011).
diff --git a/doc/src/fix_oneway.rst b/doc/src/fix_oneway.rst
index f54cc42ed0..4c5afb29cf 100644
--- a/doc/src/fix_oneway.rst
+++ b/doc/src/fix_oneway.rst
@@ -51,7 +51,7 @@ the :doc:`run <run>` command.  This fix is not invoked during :doc:`energy minim
 Restrictions
 """"""""""""
 
-This fix is part of the MISC package.  It is only enabled if LAMMPS
+This fix is part of the EXTRA-FIX package.  It is only enabled if LAMMPS
 was built with that package.  See the :doc:`Build package <Build_package>` page for more info.
 
 Related commands
diff --git a/doc/src/fix_smd.rst b/doc/src/fix_smd.rst
index 93554a4510..4c682e66c0 100644
--- a/doc/src/fix_smd.rst
+++ b/doc/src/fix_smd.rst
@@ -144,7 +144,7 @@ the :doc:`run <run>` command.  This fix is not invoked during
 Restrictions
 """"""""""""
 
-This fix is part of the MISC package.  It is only enabled if
+This fix is part of the EXTRA-FIX package.  It is only enabled if
 LAMMPS was built with that package.  See the :doc:`Build package <Build_package>` page for more info.
 
 Related commands
diff --git a/doc/src/kspace_style.rst b/doc/src/kspace_style.rst
index 1dec62bb43..b6287650b9 100644
--- a/doc/src/kspace_style.rst
+++ b/doc/src/kspace_style.rst
@@ -310,7 +310,7 @@ Forschungszentrum Juelich.
 
 The library is available for download at "http://scafacos.de" or can
 be cloned from the git-repository
-"git://github.com/scafacos/scafacos.git".
+"https://github.com/scafacos/scafacos.git".
 
 In order to use this KSpace style, you must download and build the
 ScaFaCoS library, then build LAMMPS with the SCAFACOS package
diff --git a/doc/src/pair_granular.rst b/doc/src/pair_granular.rst
index b7f9da9f8b..6f84b0d9c7 100644
--- a/doc/src/pair_granular.rst
+++ b/doc/src/pair_granular.rst
@@ -205,7 +205,7 @@ For *damping mass_velocity*, the normal damping is given by:
    \eta_n = \eta_{n0} m_{eff}
 
 Here, :math:`\eta_{n0}` is the damping coefficient specified for the normal
-contact model, in units of *mass*\ /\ *time* and
+contact model, in units of 1/\ *time* and
 :math:`m_{eff} = m_i m_j/(m_i + m_j)` is the effective mass.
 Use *damping mass_velocity* to reproduce the damping behavior of
 *pair gran/hooke/\**.
diff --git a/doc/src/pair_hybrid.rst b/doc/src/pair_hybrid.rst
index 541cdc1911..1460927add 100644
--- a/doc/src/pair_hybrid.rst
+++ b/doc/src/pair_hybrid.rst
@@ -74,14 +74,17 @@ atoms interact with each other via an *eam* potential, the surface atoms
 interact with each other via a *lj/cut* potential, and the metal/surface
 interaction is also computed via a *lj/cut* potential.  The
 *hybrid/overlay* style could be used as in the second example above,
-where multiple potentials are superposed in an additive fashion to
+where multiple potentials are superimposed in an additive fashion to
 compute the interaction between atoms.  In this example, using *lj/cut*
 and *coul/long* together gives the same result as if the
 *lj/cut/coul/long* potential were used by itself.  In this case, it
 would be more efficient to use the single combined potential, but in
 general any combination of pair potentials can be used together in to
 produce an interaction that is not encoded in any single pair_style
-file, e.g. adding Coulombic forces between granular particles.
+file, e.g. adding Coulombic forces between granular particles.  Another
+limitation of using the *hybrid/overlay* variant, that it does not generate
+*lj/cut* parameters for mixed atom types from a mixing rule due to
+restrictions discussed below.
 
 If the *hybrid/scaled* style is used instead of *hybrid/overlay*,
 contributions from sub-styles are weighted by their scale factors, which
@@ -150,10 +153,14 @@ with Tersoff, and the cross-interactions with Lennard-Jones:
    pair_coeff * * tersoff 2 C.tersoff NULL C
    pair_coeff 1 2 lj/cut 1.0 1.5
 
-If pair coefficients are specified in the data file read via the
-:doc:`read_data <read_data>` command, then the same rule applies.
-E.g. "eam/alloy" or "lj/cut" must be added after the atom type, for
-each line in the "Pair Coeffs" section, e.g.
+
+It is not recommended to read pair coefficients for a hybrid style from a "Pair Coeffs"
+or "PairIJ Coeffs" section of a data file via the :doc:`read_data <read_data>` command,
+since those sections expect a fixed number of lines, either one line per atom type or
+one line pair pair of atom types, respectively.  When reading from a data file, the
+lines of the "Pair Coeffs" and "PairIJ Coeffs" are changed in the same way as the *pair_coeff*
+command, i.e. the name of the pair style to which the parameters apply must follow the
+atom type (or atom types), e.g.
 
 .. parsed-literal::
 
@@ -162,6 +169,11 @@ each line in the "Pair Coeffs" section, e.g.
    1 lj/cut/coul/cut 1.0 1.0
    ...
 
+   PairIJ Coeffs
+
+   1 1 lj/cut/coul/cut 1.0 1.0
+   ...
+
 Note that the pair_coeff command for some potentials such as
 :doc:`pair_style eam/alloy <pair_eam>` includes a mapping specification
 of elements to all atom types, which in the hybrid case, can include
@@ -208,12 +220,22 @@ examples above, or in the data file read by the :doc:`read_data
 <read_data>`, or by mixing as described below.  Also all sub-styles
 must be used at least once in a :doc:`pair_coeff <pair_coeff>` command.
 
-.. note::
+.. warning::
 
-   LAMMPS never performs mixing of parameters from different sub-styles,
-   **even** if they use the same type of coefficients, e.g. contain
-   a Lennard-Jones potential variant.  Those parameters must be provided
-   explicitly.
+   With hybrid pair styles the use of mixing to generate pair
+   coefficients is significantly limited compared to the individual pair
+   styles.  LAMMPS **never** performs mixing of parameters from
+   different sub-styles, **even** if they use the same type of
+   coefficients, e.g. contain a Lennard-Jones potential variant.  Those
+   parameters must be provided explicitly.  Also for *hybrid/overlay*
+   and *hybrid/scaled* mixing is **only** performed for pairs of atom
+   types for which only a single pair style is assigned.
+
+   Thus it is strongly recommended to provide all mixed terms
+   explicitly.  For non-hybrid styles those could be generated and
+   written out using the :doc:`write_coeff command <write_coeff>` and
+   then edited as needed to comply with the requirements for hybrid
+   styles as explained above.
 
 If you want there to be no interactions between a particular pair of
 atom types, you have 3 choices.  You can assign the pair of atom types
diff --git a/doc/src/pair_lebedeva_z.rst b/doc/src/pair_lebedeva_z.rst
index 5afd0da92c..80fe1c52cb 100644
--- a/doc/src/pair_lebedeva_z.rst
+++ b/doc/src/pair_lebedeva_z.rst
@@ -26,15 +26,29 @@ Examples
 Description
 """""""""""
 
-The *lebedeva/z* style computes the Lebedeva interaction
-potential as described in :ref:`(Lebedeva et al.) <Leb01>`. An important simplification is made,
-which is to take all normals along the z-axis.
+The *lebedeva/z* pair style computes the Lebedeva interaction potential
+as described in :ref:`(Lebedeva1) <Leb01>` and :ref:`(Lebedeva2)
+<Leb02>`.  An important simplification is made, which is to take all
+normals along the z-axis.
+
+The Lebedeva potential is intended for the description of the interlayer
+interaction between graphene layers.  To perform a realistic simulation,
+this potential must be used in combination with an intralayer potential
+such as :doc:`AIREBO <pair_airebo>` or :doc:`Tersoff <pair_tersoff>`
+facilitated by using pair style :doc:`hybrid/overlay <pair_hybrid>`.  To
+keep the intralayer properties unaffected, the interlayer interaction
+within the same layers should be avoided.  This can be achieved by
+assigning different atom types to atoms of different layers (e.g. 1 and
+2 in the examples above).
+
+Other interactions can be set to zero using pair_style *none*\ .
+
 
 .. math::
 
-   E       = & \frac{1}{2} \sum_i \sum_{i \neq j} V_{ij}\\
+   E       = & \frac{1}{2} \sum_i \sum_{j \neq i} V_{ij}\\
    V_{ij}  = & B e^{-\alpha(r_{ij} - z_0)} \\
-             & + C(1 + D_1\rho^2_{ij} + D_2\rho^4_{ij} e^{-\lambda_1\rho^2_{ij}} e^{-\lambda_2 (z^2_{ij} - z^2_0)} \\
+             & + C(1 + D_1\rho^2_{ij} + D_2\rho^4_{ij}) e^{-\lambda_1\rho^2_{ij}} e^{-\lambda_2 (z^2_{ij} - z^2_0)} \\
              & - A \left(\frac{z_0}{r_ij}\right)^6 + A \left( \frac{z_0}{r_c} \right)^6 \\
    \rho^2_{ij} = & x^2_{ij} + y^2_{ij} \qquad (\mathbf{n_i} \equiv \mathbf{\hat{z}})
 
@@ -43,12 +57,15 @@ Energies are shifted so that they go continuously to zero at the cutoff assuming
 that the exponential part of :math:`V_{ij}` (first term) decays sufficiently fast.
 This shift is achieved by the last term in the equation for :math:`V_{ij}` above.
 
-The parameter file (e.g. CC.Lebedeva), is intended for use with metal
-:doc:`units <units>`, with energies in meV. An additional parameter, *S*,
-is available to facilitate scaling of energies.
+The provided parameter file (CC.Lebedeva) contains two sets of parameters.
 
-This potential must be used in combination with hybrid/overlay.
-Other interactions can be set to zero using pair_style *none*\ .
+- The first set (element name "C") is suitable for normal conditions and
+  is taken from :ref:`(Popov1) <Popov>`
+- The second set (element name "C1") is suitable for high-pressure
+  conditions and is taken from :ref:`(Koziol1) <Koziol>`
+
+Both sets contain an additional parameter, *S*, that can be used to
+facilitate scaling of energies and is set to 1.0 by default.
 
 Restrictions
 """"""""""""
@@ -77,4 +94,16 @@ none
 
 .. _Leb01:
 
-**(Lebedeva et al.)** I. V. Lebedeva, A. A. Knizhnik, A. M. Popov, Y. E. Lozovik, B. V. Potapkin, Phys. Rev. B, 84, 245437 (2011)
+**(Lebedeva1)** I. V. Lebedeva, A. A. Knizhnik, A. M. Popov, Y. E. Lozovik, B. V. Potapkin, Phys. Rev. B, 84, 245437 (2011)
+
+.. _Leb02:
+
+**(Lebedeva2)** I. V. Lebedeva, A. A. Knizhnik, A. M. Popov, Y. E. Lozovik, B. V. Potapkin, Physica E: 44, 949-954 (2012)
+
+.. _Popov:
+
+**(Popov1)** A.M. Popov, I. V. Lebedeva, A. A. Knizhnik, Y. E. Lozovik and B. V. Potapkin, Chem. Phys. Lett. 536, 82-86 (2012).
+
+.. _Koziol:
+
+**(Koziol1)** Z. Koziol, G. Gawlik and J. Jagielski, Chinese Phys. B 28, 096101 (2019).
diff --git a/doc/src/pair_local_density.rst b/doc/src/pair_local_density.rst
index f7e26389c3..2925ef2811 100644
--- a/doc/src/pair_local_density.rst
+++ b/doc/src/pair_local_density.rst
@@ -26,23 +26,25 @@ Examples
 Description
 """""""""""
 
-The local density (LD) potential is a mean-field manybody potential, and, in some
-sense,a generalization of embedded atom models (EAM). The name "local density
-potential" arises from the fact that it assigns an energy to an atom depending
-on the number of neighboring atoms of given type around it within a predefined
-spherical volume (i.e., within a cutoff). The bottom-up coarse-graining (CG)
-literature suggests that such potentials can be widely useful  in capturing
-effective multibody forces in a computationally efficient manner so as to
-improve the quality of CG models of implicit solvation:ref:`(Sanyal1) <Sanyal1>` and
-phase-segregation in liquid mixtures:ref:`(Sanyal2) <Sanyal2>`, and provide guidelines
-to determine the extent of manybody correlations present in a CG
-model.:ref:`(Rosenberger) <Rosenberger>` The LD potential in LAMMPS is primarily
-intended to be used as a corrective potential over traditional pair potentials
-in bottom-up CG models, i.e., as a hybrid pair style with
-other explicit pair interaction terms (e.g., table spline, Lennard Jones, etc.).
-Because the LD potential is not a pair potential per se,  it is implemented
-simply as a single auxiliary file with all specifications that will be read
-upon initialization.
+The local density (LD) potential is a mean-field manybody potential,
+and, in some way, a generalization of embedded atom models (EAM).  The
+name "local density potential" arises from the fact that it assigns an
+energy to an atom depending on the number of neighboring atoms of a
+given type around it within a predefined spherical volume (i.e., within
+the cutoff).  The bottom-up coarse-graining (CG) literature suggests
+that such potentials can be widely useful in capturing effective
+multibody forces in a computationally efficient manner and thus improve
+the quality of CG models of implicit solvation :ref:`(Sanyal1)
+<Sanyal1>` and phase-segregation in liquid mixtures :ref:`(Sanyal2)
+<Sanyal2>`, and provide guidelines to determine the extent of manybody
+correlations present in a CG model :ref:`(Rosenberger) <Rosenberger>`.
+The LD potential in LAMMPS is primarily intended to be used as a
+corrective potential over traditional pair potentials in bottom-up CG
+models via :doc:`hybrid/overlay pair style <pair_hybrid>` with other
+explicit pair interaction terms (e.g., tabulated, Lennard-Jones, Morse
+etc.).  Because the LD potential is not a pair potential per se, it is
+implemented simply as a single auxiliary file with all specifications
+that will be read upon initialization.
 
 .. note::
 
diff --git a/doc/src/pair_meam.rst b/doc/src/pair_meam.rst
index d091496325..022de60f98 100644
--- a/doc/src/pair_meam.rst
+++ b/doc/src/pair_meam.rst
@@ -28,16 +28,16 @@ Description
    as of November 2010; see description below of the mixture_ref_t
    parameter
 
-Style *meam* computes pairwise interactions for a variety of materials
-using modified embedded-atom method (MEAM) potentials
+Pair style *meam* computes non-bonded interactions for a variety of materials
+using the modified embedded-atom method (MEAM)
 :ref:`(Baskes) <Baskes>`.  Conceptually, it is an extension to the original
-:doc:`EAM potentials <pair_eam>` which adds angular forces.  It is
+:doc:`EAM method <pair_eam>` which adds angular forces.  It is
 thus suitable for modeling metals and alloys with fcc, bcc, hcp and
-diamond cubic structures, as well as covalently bonded materials like
-silicon and carbon. Style *meam* is a translation of the (now obsolete)
-*meam* code from Fortran to C++. It is functionally equivalent to *meam*
-but more efficient, and thus *meam* has been removed from LAMMPS after
-the 12 December 2018 release.
+diamond cubic structures, as well as materials with covalent interactions
+like silicon and carbon. This *meam* pair style is a translation of the
+original Fortran version to C++. It is functionally equivalent but more
+efficient and has additional features. The Fortran version of the *meam*
+pair style has been removed from LAMMPS after the 12 December 2018 release.
 
 In the MEAM formulation, the total energy E of a system of atoms is
 given by:
diff --git a/doc/src/pair_modify.rst b/doc/src/pair_modify.rst
index 1a62a4c1a0..4941693fbd 100644
--- a/doc/src/pair_modify.rst
+++ b/doc/src/pair_modify.rst
@@ -71,21 +71,23 @@ The *mix* keyword affects pair coefficients for interactions between
 atoms of type I and J, when I != J and the coefficients are not
 explicitly set in the input script.  Note that coefficients for I = J
 must be set explicitly, either in the input script via the
-:doc:`pair_coeff <pair_coeff>` command or in the "Pair Coeffs" section of the
-:doc:`data file <read_data>`.  For some pair styles it is not
+:doc:`pair_coeff <pair_coeff>` command or in the "Pair Coeffs" or "PairIJ Coeffs"
+sections of the :doc:`data file <read_data>`.  For some pair styles it is not
 necessary to specify coefficients when I != J, since a "mixing" rule
 will create them from the I,I and J,J settings.  The pair_modify
 *mix* value determines what formulas are used to compute the mixed
 coefficients.  In each case, the cutoff distance is mixed the same way
 as sigma.
 
-Note that not all pair styles support mixing and some mix options
-are not available for certain pair styles. Also, there are additional
-restrictions when using :doc:`pair style hybrid or hybrid/overlay <pair_hybrid>`.
-See the page for individual pair styles for those restrictions.  Note also that the
-:doc:`pair_coeff <pair_coeff>` command also can be used to directly set
-coefficients for a specific I != J pairing, in which case no mixing is
-performed.
+Note that not all pair styles support mixing and some mix options are
+not available for certain pair styles. Also, there are additional
+restrictions when using :doc:`pair style hybrid or hybrid/overlay
+<pair_hybrid>`.  See the page for individual pair styles for those
+restrictions.  Note also that the :doc:`pair_coeff <pair_coeff>` command
+also can be used to directly set coefficients for a specific I != J
+pairing, in which case no mixing is performed.  If possible, LAMMPS will
+print an informational message about how many of the mixed pair
+coefficients were generated and which mixing rule was applied.
 
 - mix *geometric*
 
diff --git a/doc/src/pair_nm.rst b/doc/src/pair_nm.rst
index 2c356bb4ca..2256c7f220 100644
--- a/doc/src/pair_nm.rst
+++ b/doc/src/pair_nm.rst
@@ -1,4 +1,5 @@
 .. index:: pair_style nm/cut
+.. index:: pair_style nm/cut/split
 .. index:: pair_style nm/cut/coul/cut
 .. index:: pair_style nm/cut/coul/long
 .. index:: pair_style nm/cut/omp
@@ -10,6 +11,9 @@ pair_style nm/cut command
 
 Accelerator Variants: *nm/cut/omp*
 
+pair_style nm/cut/split command
+===============================
+
 pair_style nm/cut/coul/cut command
 ==================================
 
@@ -27,13 +31,15 @@ Syntax
 
    pair_style style args
 
-* style = *nm/cut* or *nm/cut/coul/cut* or *nm/cut/coul/long*
+* style = *nm/cut* or *nm/cut/split* or *nm/cut/coul/cut* or *nm/cut/coul/long*
 * args = list of arguments for a particular style
 
   .. parsed-literal::
 
        *nm/cut* args = cutoff
          cutoff = global cutoff for Pair interactions (distance units)
+       *nm/cut/split* args = cutoff
+         cutoff = global cutoff for Pair interactions (distance units)
        *nm/cut/coul/cut* args = cutoff (cutoff2)
          cutoff = global cutoff for Pair (and Coulombic if only 1 arg) (distance units)
          cutoff2 = global cutoff for Coulombic (optional) (distance units)
@@ -50,6 +56,10 @@ Examples
    pair_coeff * * 0.01 5.4 8.0 7.0
    pair_coeff 1 1 0.01 4.4 7.0 6.0
 
+   pair_style nm/cut/split 1.12246
+   pair_coeff 1 1 1.0 1.1246 12 6
+   pair_coeff * * 1.0 1.1246 11 6
+
    pair_style nm/cut/coul/cut 12.0 15.0
    pair_coeff * * 0.01 5.4 8.0 7.0
    pair_coeff 1 1 0.01 4.4 7.0 6.0
@@ -71,7 +81,15 @@ interaction has the following form:
    E = \frac{E_0}{(n-m)} \left[ m \left(\frac{r_0}{r}\right)^n - n
    \left(\frac{r_0}{r}\right)^m \right] \qquad r < r_c
 
-where :math:`r_c` is the cutoff.
+where :math:`r_c` is the cutoff and :math:`r_0` is the minimum of the
+potential.  Please note that this differs from the convention used for
+other Lennard-Jones potentials in LAMMPS where :math:`\sigma` represents
+the location where the energy is zero.
+
+Style *nm/cut/split* applies the standard LJ (12-6) potential above
+:math:`r_0 = 2^\frac{1}{6}\sigma`.  Style *nm/cut/split* is employed in
+polymer equilibration protocols that combine core-softening approaches
+with topology-changing moves :ref:`Dietz <Dietz>`.
 
 Style *nm/cut/coul/cut* adds a Coulombic pairwise interaction given by
 
@@ -155,7 +173,6 @@ the :doc:`run_style respa <run_style>` command.  They do not support the
 
 Restrictions
 """"""""""""
-
 These pair styles are part of the EXTRA-PAIR package.  They are only enabled if
 LAMMPS was built with that package.  See the
 :doc:`Build package <Build_package>` page for more info.
@@ -163,7 +180,7 @@ LAMMPS was built with that package.  See the
 Related commands
 """"""""""""""""
 
-:doc:`pair_coeff <pair_coeff>`
+:doc:`pair_coeff <pair_coeff>`, :doc:`pair style lj/cut <pair_lj>`, :doc:`bond style fene/nm <bond_fene>`
 
 Default
 """""""
@@ -175,3 +192,7 @@ none
 .. _Clarke:
 
 **(Clarke)** Clarke and Smith, J Chem Phys, 84, 2290 (1986).
+
+.. _Dietz:
+
+**(Dietz)** Dietz and Hoy, J. Chem Phys, 156, 014103 (2022).
diff --git a/doc/src/pair_python.rst b/doc/src/pair_python.rst
index 3d087565be..35e07dbd11 100644
--- a/doc/src/pair_python.rst
+++ b/doc/src/pair_python.rst
@@ -126,11 +126,11 @@ and *compute_energy*, which both take 3 numerical arguments:
 * itype = the (numerical) type of the first atom
 * jtype = the (numerical) type of the second atom
 
-This functions need to compute the force and the energy, respectively,
-and use the result as return value. The functions need to use the
-*pmap* dictionary to convert the LAMMPS atom type number to the symbolic
-value of the internal potential parameter data structure. Following
-the *LJCutMelt* example, here are the two functions:
+This functions need to compute the (scaled) force and the energy,
+respectively, and use the result as return value. The functions need
+to use the *pmap* dictionary to convert the LAMMPS atom type number
+to the symbolic value of the internal potential parameter data structure.
+Following the *LJCutMelt* example, here are the two functions:
 
 .. code-block:: python
 
@@ -154,10 +154,10 @@ the *LJCutMelt* example, here are the two functions:
 
    for consistency with the C++ pair styles in LAMMPS, the
    *compute_force* function follows the conventions of the Pair::single()
-   methods and does not return the full force, but the force scaled by
-   the distance between the two atoms, so this value only needs to be
-   multiplied by delta x, delta y, and delta z to conveniently obtain the
-   three components of the force vector between these two atoms.
+   methods and does not return the pairwise force directly, but the force
+   divided by the distance between the two atoms, so this value only needs
+   to be  multiplied by delta x, delta y, and delta z to conveniently obtain
+   the three components of the force vector between these two atoms.
 
 ----------
 
diff --git a/doc/src/pair_style.rst b/doc/src/pair_style.rst
index 1cf033ddba..4bb3c90a8d 100644
--- a/doc/src/pair_style.rst
+++ b/doc/src/pair_style.rst
@@ -274,6 +274,7 @@ accelerated styles exist.
 * :doc:`nm/cut <pair_nm>` - N-M potential
 * :doc:`nm/cut/coul/cut <pair_nm>` - N-M potential with cutoff Coulomb
 * :doc:`nm/cut/coul/long <pair_nm>` - N-M potential with long-range Coulomb
+* :doc:`nm/cut/split <pair_nm>` - Split 12-6 Lennard-Jones and N-M potential
 * :doc:`oxdna/coaxstk <pair_oxdna>` -
 * :doc:`oxdna/excv <pair_oxdna>` -
 * :doc:`oxdna/hbond <pair_oxdna>` -
@@ -327,6 +328,7 @@ accelerated styles exist.
 * :doc:`spin/neel <pair_spin_neel>` -
 * :doc:`srp <pair_srp>` -
 * :doc:`sw <pair_sw>` - Stillinger-Weber 3-body potential
+* :doc:`sw/mod <pair_sw>` - modified Stillinger-Weber 3-body potential
 * :doc:`table <pair_table>` - tabulated pair potential
 * :doc:`table/rx <pair_table_rx>` -
 * :doc:`tdpd <pair_mesodpd>` - tDPD particle interactions
diff --git a/doc/src/pair_sw.rst b/doc/src/pair_sw.rst
index 1b2a4a4b1d..d71999b2d4 100644
--- a/doc/src/pair_sw.rst
+++ b/doc/src/pair_sw.rst
@@ -3,18 +3,34 @@
 .. index:: pair_style sw/intel
 .. index:: pair_style sw/kk
 .. index:: pair_style sw/omp
+.. index:: pair_style sw/mod
+.. index:: pair_style sw/mod/omp
 
 pair_style sw command
 =====================
 
 Accelerator Variants: *sw/gpu*, *sw/intel*, *sw/kk*, *sw/omp*
 
+pair_style sw/mod command
+=========================
+
+Accelerator Variants: *sw/mod/omp*
+
 Syntax
 """"""
 
 .. code-block:: LAMMPS
 
-   pair_style sw
+   pair_style style keyword values
+
+* style = *sw* or *sw/mod*
+* keyword = *maxdelcs*
+
+  .. parsed-literal::
+
+       *maxdelcs* value = delta1 delta2 (optional)
+         delta1 = The minimum thershold for cosine of three-body angle
+         delta2 = The maximum threshold for cosine of three-body angle
 
 Examples
 """"""""
@@ -25,6 +41,9 @@ Examples
    pair_coeff * * si.sw Si
    pair_coeff * * GaN.sw Ga N Ga
 
+   pair_style sw/mod maxdelcs 0.25 0.35
+   pair_coeff * * tmd.sw.mod Mo S S
+
 Description
 """""""""""
 
@@ -48,8 +67,52 @@ where :math:`\phi_2` is a two-body term and :math:`\phi_3` is a
 three-body term.  The summations in the formula are over all neighbors J
 and K of atom I within a cutoff distance :math:`a `\sigma`.
 
-Only a single pair_coeff command is used with the *sw* style which
-specifies a Stillinger-Weber potential file with parameters for all
+The *sw/mod* style is designed for simulations of materials when
+distinguishing three-body angles are necessary, such as borophene
+and transition metal dichalcogenide, which cannot be described
+by the original code for the Stillinger-Weber potential.
+For instance, there are several types of angles around each Mo atom in `MoS_2`,
+and some unnecessary angle types should be excluded in the three-body interaction.
+Such exclusion may be realized by selecting proper angle types directly.
+The exclusion of unnecessary angles is achieved here by the cut-off function (`f_C(\delta)`),
+which induces only minimum modifications for LAMMPS.
+
+Validation, benchmark tests, and applications of the *sw/mod* style
+can be found in :ref:`(Jiang2) <Jiang2>` and :ref:`(Jiang3) <Jiang3>`.
+
+The *sw/mod* style computes the energy E of a system of atoms, whose potential
+function is mostly the same as the Stillinger-Weber potential. The only modification
+is in the three-body term, where the value of :math:`\delta = \cos \theta_{ijk} - \cos \theta_{0ijk}`
+used in the original energy and force expression is scaled by a switching factor :math:`f_C(\delta)`:
+
+.. math::
+
+  f_C(\delta) & = \left\{ \begin{array} {r@{\quad:\quad}l}
+    1 & \left| \delta \right| < \delta_1 \\
+    \frac{1}{2} + \frac{1}{2} \cos \left( \pi \frac{\left| \delta \right| - \delta_1}{\delta_2 - \delta_1} \right) &
+      \delta_1 < \left| \delta \right| < \delta_2 \\
+    0 & \left| \delta \right| > \delta_2
+    \end{array} \right. \\
+
+This cut-off function decreases smoothly from 1 to 0 over the range :math:`[\delta_1, \delta_2]`.
+This smoothly turns off the energy and force contributions for :math:`\left| \delta \right| > \delta_2`.
+It is suggested that :math:`\delta 1` and :math:`\delta_2` to be the value around
+:math:`0.5 \left| \cos \theta_1 - \cos \theta_2 \right|`, with
+:math:`\theta_1` and :math:`\theta_2` as the different types of angles around an atom.
+For borophene and transition metal dichalcogenide, :math:`\delta_1 = 0.25` and :math:`\delta_2 = 0.35`.
+This value enables the cut-off function to exclude unnecessary angles in the three-body SW terms.
+
+.. note::
+
+   The cut-off function is just to be used as a technique to exclude some unnecessary angles,
+   and it has no physical meaning. It should be noted that the force and potential are inconsistent
+   with each other in the decaying range of the cut-off function, as the angle dependence for the
+   cut-off function is not implemented in the force (first derivation of potential).
+   However, the angle variation is much smaller than the given threshold value for actual simulations,
+   so the inconsistency between potential and force can be neglected in actual simulations.
+
+Only a single pair_coeff command is used with the *sw* and *sw/mod* styles
+which specifies a Stillinger-Weber potential file with parameters for all
 needed elements.  These are mapped to LAMMPS atom types by specifying
 N additional arguments after the filename in the pair_coeff command,
 where N is the number of LAMMPS atom types:
@@ -213,10 +276,19 @@ Related commands
 Default
 """""""
 
-none
+The default values for the *maxdelcs* setting of the *sw/mod* pair
+style are *delta1* = 0.25 and *delta2* = 0.35`.
 
 ----------
 
 .. _Stillinger2:
 
 **(Stillinger)** Stillinger and Weber, Phys Rev B, 31, 5262 (1985).
+
+.. _Jiang2:
+
+**(Jiang2)** J.-W. Jiang, Nanotechnology 26, 315706 (2015).
+
+.. _Jiang3:
+
+**(Jiang3)** J.-W. Jiang, Acta Mech. Solida. Sin 32, 17 (2019).
diff --git a/doc/src/pair_tersoff.rst b/doc/src/pair_tersoff.rst
index ab88806ca6..38a0262f5d 100644
--- a/doc/src/pair_tersoff.rst
+++ b/doc/src/pair_tersoff.rst
@@ -23,7 +23,7 @@ Syntax
 
    pair_style style keywords values
 
-* style = *tersoff* or *tersoff/table* or *tersoff/gpu* or *tersoff/omp* or *tersoff/table/omp*
+* style = *tersoff* or *tersoff/table*
 * keyword = *shift*
 
   .. parsed-literal::
diff --git a/doc/src/pair_thole.rst b/doc/src/pair_thole.rst
index 5a1e72f569..a4e8bbb96e 100644
--- a/doc/src/pair_thole.rst
+++ b/doc/src/pair_thole.rst
@@ -17,7 +17,7 @@ Syntax
 
    pair_style style args
 
-* style = *thole* or *lj/cut/thole/long* or *lj/cut/thole/long/omp*
+* style = *thole* or *lj/cut/thole/long*
 * args = list of arguments for a particular style
 
 .. parsed-literal::
@@ -25,7 +25,7 @@ Syntax
      *thole* args = damp cutoff
        damp = global damping parameter
        cutoff = global cutoff (distance units)
-     *lj/cut/thole/long* or *lj/cut/thole/long/omp* args = damp cutoff (cutoff2)
+     *lj/cut/thole/long* args = damp cutoff (cutoff2)
        damp = global damping parameter
        cutoff = global cutoff for LJ (and Thole if only 1 arg) (distance units)
        cutoff2 = global cutoff for Thole (optional) (distance units)
diff --git a/doc/src/pair_vashishta.rst b/doc/src/pair_vashishta.rst
index d38ac02a96..8310eb7607 100644
--- a/doc/src/pair_vashishta.rst
+++ b/doc/src/pair_vashishta.rst
@@ -22,13 +22,13 @@ Syntax
 
    pair_style style args
 
-* style = *vashishta* or *vashishta/table* or *vashishta/omp* or *vashishta/table/omp*
+* style = *vashishta* or *vashishta/table*
 * args = list of arguments for a particular style
 
 .. parsed-literal::
 
-     *vashishta* or *vashishta/omp* args = none
-     *vashishta/table* or *vashishta/table/omp* args = Ntable cutinner
+     *vashishta* args = none
+     *vashishta/table* args = Ntable cutinner
        Ntable = # of tabulation points
        cutinner = tablulate from cutinner to cutoff
 
diff --git a/doc/src/read_dump.rst b/doc/src/read_dump.rst
index c873156a38..3a771b9c2d 100644
--- a/doc/src/read_dump.rst
+++ b/doc/src/read_dump.rst
@@ -98,8 +98,7 @@ command, after the dump snapshot is read.
 ----------
 
 If the dump filename specified as *file* ends with ".gz", the dump
-file is read in gzipped format.  You cannot (yet) read a dump file
-that was written in binary format with a ".bin" suffix.
+file is read in gzipped format.
 
 You can read dump files that were written (in parallel) to multiple
 files via the "%" wild-card character in the dump file name.  If any
@@ -115,8 +114,8 @@ to tell LAMMPS how many parallel files exist, via its specified
 The format of the dump file is selected through the *format* keyword.
 If specified, it must be the last keyword used, since all remaining
 arguments are passed on to the dump reader.  The *native* format is
-for native LAMMPS dump files, written with a :doc:`dump atom <dump>` or
-:doc:`dump custom <dump>` command.  The *xyz* format is for generic XYZ
+for native LAMMPS dump files, written with a :doc:`dump atom <dump>`
+or :doc:`dump custom <dump>` command.  The *xyz* format is for generic XYZ
 formatted dump files.  These formats take no additional values.
 
 The *molfile* format supports reading data through using the `VMD <vmd_>`_
@@ -370,8 +369,6 @@ needed to generate absolute, unscaled coordinates.
 Restrictions
 """"""""""""
 
-The *native* dump file reader does not support binary .bin dump files.
-
 To read gzipped dump files, you must compile LAMMPS with the
 -DLAMMPS_GZIP option.  See the :doc:`Build settings <Build_settings>`
 doc page for details.
diff --git a/doc/utils/requirements.txt b/doc/utils/requirements.txt
index 9b8e106875..9797d4d119 100644
--- a/doc/utils/requirements.txt
+++ b/doc/utils/requirements.txt
@@ -1,6 +1,6 @@
 Sphinx
 sphinxcontrib-spelling
-git+git://github.com/akohlmey/sphinx-fortran@parallel-read
+git+https://github.com/akohlmey/sphinx-fortran@parallel-read
 sphinx_tabs
 breathe
 Pygments
diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt
index d295767519..fe1e40e8ba 100644
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@@ -308,6 +308,7 @@ boolean
 boostostat
 boostostatting
 Boresch
+borophene
 Botero
 Botu
 Bouguet
@@ -688,8 +689,10 @@ diagonalizers
 diagonalizing
 Diallo
 diblock
+dichalcogenide
 Dickel
 diel
+Dietz
 differentiable
 diffusively
 diffusivity
@@ -1125,6 +1128,7 @@ gaussian
 gaussians
 Gaussians
 Gavhane
+Gawlik
 gayberne
 gcc
 gcmc
@@ -1308,6 +1312,7 @@ hotpink
 Houlle
 howto
 Howto
+Hoy
 Hoyt
 Hs
 hstyle
@@ -1483,6 +1488,7 @@ Izz
 Jacobsen
 Jadhao
 Jadhav
+Jagielski
 jagreat
 Jahn
 Jalalvand
@@ -1609,6 +1615,7 @@ Koslowski
 Kosovan
 Koster
 Kosztin
+Koziol
 Kp
 kradius
 Kraker
@@ -1995,6 +2002,7 @@ minimizer
 minimizers
 minneigh
 minorder
+MinSizeRel
 minSteps
 mintcream
 Mintmire
@@ -2386,6 +2394,7 @@ ohenrich
 ok
 Okabe
 Okamoto
+O'Hearn
 O'Keefe
 OKeefe
 oldlace
@@ -2784,6 +2793,7 @@ relink
 relres
 relTol
 relu
+RelWithDebInfo
 remappings
 remd
 Ren
@@ -3073,6 +3083,7 @@ snav
 Snodin
 Sodani
 Soderlind
+Solida
 solvated
 solvation
 someuser
diff --git a/examples/ELASTIC_T/potential.mod b/examples/ELASTIC_T/potential.mod
index b9ed80d865..d4b7cc7158 100644
--- a/examples/ELASTIC_T/potential.mod
+++ b/examples/ELASTIC_T/potential.mod
@@ -1,6 +1,8 @@
 # NOTE: This script can be modified for different pair styles 
 # See in.elastic for more info.
 
+# we must undefine any fix ave/* fix before using reset_timestep
+if "$(is_defined(fix,avp)" then "unfix avp"
 reset_timestep 0
 
 # Choose potential
diff --git a/examples/PACKAGES/local_density/benzene_water/benzene_water.localdensity.table b/examples/PACKAGES/local_density/benzene_water/benzene_water.localdensity.table
index b0d63dbbbf..96630f0ccc 100644
--- a/examples/PACKAGES/local_density/benzene_water/benzene_water.localdensity.table
+++ b/examples/PACKAGES/local_density/benzene_water/benzene_water.localdensity.table
@@ -1,4 +1,4 @@
-# local density potentials: (B,B), (W,W), (B,W), (W,B)
+# local density potentials: (B,B), (W,W), (B,W), (W,B)  UNITS: real
 
 4 500
 
diff --git a/examples/PACKAGES/local_density/benzene_water/benzene_water.pair.table b/examples/PACKAGES/local_density/benzene_water/benzene_water.pair.table
index 348bccfa0e..d76ac0dfd8 100644
--- a/examples/PACKAGES/local_density/benzene_water/benzene_water.pair.table
+++ b/examples/PACKAGES/local_density/benzene_water/benzene_water.pair.table
@@ -1,4 +1,4 @@
-
+# UNITS: real
 PairBB
 N 500 R  2.00000e-02  1.32500e+01
 
diff --git a/examples/PACKAGES/local_density/benzene_water/benzene_water.in b/examples/PACKAGES/local_density/benzene_water/in.benzene_water
similarity index 87%
rename from examples/PACKAGES/local_density/benzene_water/benzene_water.in
rename to examples/PACKAGES/local_density/benzene_water/in.benzene_water
index 01fb3f27e5..69d39be357 100644
--- a/examples/PACKAGES/local_density/benzene_water/benzene_water.in
+++ b/examples/PACKAGES/local_density/benzene_water/in.benzene_water
@@ -11,7 +11,7 @@
 
 # Initialize simulation box
 dimension       3
-boundary        p p p 
+boundary        p p p
 units           real
 atom_style      molecular
 
@@ -32,7 +32,7 @@ pair_coeff          *     *    local/density  benzene_water.localdensity.table
 fix recentering all recenter 0.0 0.0 0.0 units box
 
 # Thermostat & time integration
-timestep        2.0 
+timestep        2.0
 thermo          100
 thermo_style    custom temp ke pe etotal ebond eangle edihed evdwl
 
@@ -49,14 +49,14 @@ run             5000
 
 # Turn off recentering during production phase
 unfix recentering
+reset_timestep  0
 
 # Setup trajectory output
-dump            myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element
-dump_modify     myDump element B W
-dump_modify     myDump sort id
+#dump            myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element
+#dump_modify     myDump element B W
+#dump_modify     myDump sort id
 
 # Production (for realistic results, run for 10000000 steps)
-reset_timestep  0
-run             1000             
+run             1000
 
 
diff --git a/examples/PACKAGES/local_density/benzene_water/log.04Sep19.g++.1 b/examples/PACKAGES/local_density/benzene_water/log.04Sep19.g++.1
deleted file mode 100644
index 928906edbd..0000000000
--- a/examples/PACKAGES/local_density/benzene_water/log.04Sep19.g++.1
+++ /dev/null
@@ -1,267 +0,0 @@
-LAMMPS (7 Aug 2019)
-# LAMMPS input file for 26.5% benzene mole fraction solution
-# with 380 benzene and 1000 water molecules,
-# using all possible local density potentials
-# between benzene and water
-#
-# Author: Tanmoy Sanyal, Shell Group, UC Santa Barbara
-#
-# Refer: Sanyal and Shell, JPC-B, 2018, 122 (21), 5678-5693
-
-
-
-# Initialize simulation box
-dimension       3
-boundary        p p p
-units           real
-atom_style      molecular
-
-# Set potential styles
-pair_style      hybrid/overlay table spline 500 local/density
-
-# Read molecule data and set initial velocities
-read_data       benzene_water.data
-  orthogonal box = (-12.865 -12.865 -64.829) to (12.865 12.865 64.829)
-  1 by 1 by 8 MPI processor grid
-  reading atoms ...
-  1380 atoms
-  0 = max # of 1-2 neighbors
-  0 = max # of 1-3 neighbors
-  0 = max # of 1-4 neighbors
-  1 = max # of special neighbors
-  special bonds CPU = 0.000566959 secs
-  read_data CPU = 0.00661397 secs
-velocity        all create  3.0000e+02 16611 rot yes dist gaussian
-
-# Assign potentials
-pair_coeff          1     1    table          benzene_water.pair.table      PairBB
-WARNING: 33 of 500 force values in table are inconsistent with -dE/dr.
-  Should only be flagged at inflection points (../pair_table.cpp:483)
-WARNING: 150 of 500 distance values in table with relative error
-  over 1e-06 to re-computed values (../pair_table.cpp:492)
-pair_coeff          1     2    table          benzene_water.pair.table      PairWW
-WARNING: 61 of 500 force values in table are inconsistent with -dE/dr.
-  Should only be flagged at inflection points (../pair_table.cpp:483)
-WARNING: 90 of 500 distance values in table with relative error
-  over 1e-06 to re-computed values (../pair_table.cpp:492)
-pair_coeff          2     2    table          benzene_water.pair.table      PairBW
-WARNING: 108 of 500 force values in table are inconsistent with -dE/dr.
-  Should only be flagged at inflection points (../pair_table.cpp:483)
-WARNING: 135 of 500 distance values in table with relative error
-  over 1e-06 to re-computed values (../pair_table.cpp:492)
-pair_coeff          *     *    local/density  benzene_water.localdensity.table
-
-# Recentering during minimization and equilibration
-fix recentering all recenter 0.0 0.0 0.0 units box
-
-# Thermostat & time integration
-timestep        2.0
-thermo          100
-thermo_style    custom temp ke pe etotal ebond eangle edihed evdwl
-
-# Minimization
-minimize        1.e-4 0.0 10000 10000
-WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (../min.cpp:168)
-Neighbor list info ...
-  update every 1 steps, delay 0 steps, check yes
-  max neighbors/atom: 2000, page size: 100000
-  master list distance cutoff = 15.25
-  ghost atom cutoff = 15.25
-  binsize = 7.625, bins = 4 4 18
-  2 neighbor lists, perpetual/occasional/extra = 2 0 0
-  (1) pair table, perpetual
-      attributes: half, newton on
-      pair build: half/bin/newton
-      stencil: half/bin/3d/newton
-      bin: standard
-  (2) pair local/density, perpetual, copy from (1)
-      attributes: half, newton on
-      pair build: copy
-      stencil: none
-      bin: none
-Per MPI rank memory allocation (min/avg/max) = 8.061 | 8.32 | 8.674 Mbytes
-Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl 
-         300    1233.1611    4162.3053    5395.4665            0            0            0    4162.3053 
-         300    1233.1611     2275.526    3508.6871            0            0            0     2275.526 
-Loop time of 0.352822 on 8 procs for 40 steps with 1380 atoms
-
-71.3% CPU use with 8 MPI tasks x no OpenMP threads
-
-Minimization stats:
-  Stopping criterion = linesearch alpha is zero
-  Energy initial, next-to-last, final = 
-         4162.30533361      2208.86525108      2275.52597861
-  Force two-norm initial, final = 259.364 69.3915
-  Force max component initial, final = 22.2077 8.31436
-  Final line search alpha, max atom move = 2.90022e-12 2.41135e-11
-  Iterations, force evaluations = 40 110
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
----------------------------------------------------------------
-Pair    | 0.053192   | 0.23903    | 0.32779    |  17.2 | 67.75
-Bond    | 9.0599e-06 | 1.6302e-05 | 2.5272e-05 |   0.0 |  0.00
-Neigh   | 0.00044513 | 0.0023614  | 0.0063851  |   5.1 |  0.67
-Comm    | 0.015469   | 0.090432   | 0.20295    |  20.0 | 25.63
-Output  | 0          | 0          | 0          |   0.0 |  0.00
-Modify  | 0          | 0          | 0          |   0.0 |  0.00
-Other   |            | 0.02098    |            |       |  5.95
-
-Nlocal:    172.5 ave 348 max 72 min
-Histogram: 5 0 0 0 0 0 0 0 1 2
-Nghost:    2193.62 ave 4352 max 932 min
-Histogram: 3 0 0 2 0 0 2 0 0 1
-Neighs:    9700.5 ave 20535 max 3685 min
-Histogram: 5 0 0 0 0 0 0 1 0 2
-
-Total # of neighbors = 77604
-Ave neighs/atom = 56.2348
-Ave special neighs/atom = 0
-Neighbor list builds = 2
-Dangerous builds = 0
-
-# Set up integration parameters
-fix             timeintegration all nve
-fix             thermostat all langevin  3.0000e+02  3.0000e+02  1.0000e+02 81890
-
-# Equilibration (for realistic results, run for 5000000 steps)
-reset_timestep  0
-run             5000
-WARNING: Fix recenter should come after all other integration fixes (../fix_recenter.cpp:131)
-Per MPI rank memory allocation (min/avg/max) = 6.936 | 7.195 | 7.552 Mbytes
-Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl 
-         300    1233.1611    2866.9109    4100.0721            0            0            0    2866.9109 
-   273.33541    1123.5553    3983.2007     5106.756            0            0            0    3983.2007 
-   293.68078    1207.1857    3319.6601    4526.8458            0            0            0    3319.6601 
-   314.21462    1291.5908    3389.2178    4680.8086            0            0            0    3389.2178 
-   323.77563    1330.8917    3332.9828    4663.8745            0            0            0    3332.9828 
-    302.5902    1243.8082    3461.7692    4705.5774            0            0            0    3461.7692 
-   295.39324    1214.2249    3411.5727    4625.7976            0            0            0    3411.5727 
-   320.52341    1317.5234    3453.1931    4770.7164            0            0            0    3453.1931 
-   312.00777    1282.5195    3403.3443    4685.8638            0            0            0    3403.3443 
-   307.96774    1265.9128    3429.7809    4695.6937            0            0            0    3429.7809 
-   294.75922    1211.6187    3388.8404    4600.4591            0            0            0    3388.8404 
-   311.24567    1279.3869    3514.9603    4794.3472            0            0            0    3514.9603 
-    306.6152    1260.3531    3447.2011    4707.5542            0            0            0    3447.2011 
-   305.23306    1254.6718    3375.5092     4630.181            0            0            0    3375.5092 
-   321.62889    1322.0675    3460.2581    4782.3256            0            0            0    3460.2581 
-   316.37725    1300.4804    3437.0312    4737.5116            0            0            0    3437.0312 
-   322.90522    1327.3139    3389.1262      4716.44            0            0            0    3389.1262 
-   307.57893    1264.3146    3359.8491    4624.1637            0            0            0    3359.8491 
-   302.22607    1242.3115    3406.1711    4648.4826            0            0            0    3406.1711 
-   302.73997    1244.4239    3220.2582    4464.6821            0            0            0    3220.2582 
-   303.66194    1248.2137    3318.4629    4566.6765            0            0            0    3318.4629 
-   308.73862    1269.0815    3369.5894     4638.671            0            0            0    3369.5894 
-   315.60294    1297.2976    3411.2405    4708.5381            0            0            0    3411.2405 
-    310.0113    1274.3129    3360.1054    4634.4183            0            0            0    3360.1054 
-   302.36229    1242.8714    3326.9845    4569.8559            0            0            0    3326.9845 
-   317.78659    1306.2735    3355.4976    4661.7711            0            0            0    3355.4976 
-   302.50479    1243.4571    3317.6846    4561.1417            0            0            0    3317.6846 
-   304.29249    1250.8056    3423.5068    4674.3124            0            0            0    3423.5068 
-   305.99948    1257.8222    3432.9395    4690.7617            0            0            0    3432.9395 
-   309.93363    1273.9937     3393.657    4667.6506            0            0            0     3393.657 
-   316.14884    1299.5415    3463.0636    4762.6051            0            0            0    3463.0636 
-   300.38817    1234.7567    3309.2495    4544.0062            0            0            0    3309.2495 
-   311.05735    1278.6128    3304.4418    4583.0546            0            0            0    3304.4418 
-   311.11872     1278.865    3291.1891    4570.0542            0            0            0    3291.1891 
-   315.74338    1297.8749    3341.3063    4639.1812            0            0            0    3341.3063 
-    297.5658    1223.1552    3316.3862    4539.5414            0            0            0    3316.3862 
-   311.79033    1281.6257    3357.4556    4639.0813            0            0            0    3357.4556 
-   310.93666    1278.1167    3414.7694    4692.8861            0            0            0    3414.7694 
-   307.37298     1263.468    3337.3889    4600.8569            0            0            0    3337.3889 
-   298.84185    1228.4005    3329.6173    4558.0178            0            0            0    3329.6173 
-   310.54684    1276.5143    3351.0852    4627.5995            0            0            0    3351.0852 
-    300.0871    1233.5191    3302.2315    4535.7506            0            0            0    3302.2315 
-   304.69078    1252.4427    3324.2508    4576.6935            0            0            0    3324.2508 
-   313.50714    1288.6827    3330.4088    4619.0915            0            0            0    3330.4088 
-   329.80018    1355.6559      3301.86    4657.5159            0            0            0      3301.86 
-   304.57609    1251.9713    3365.2938    4617.2652            0            0            0    3365.2938 
-   308.73584    1269.0701    3344.4155    4613.4856            0            0            0    3344.4155 
-   306.90951    1261.5629    3304.4698    4566.0327            0            0            0    3304.4698 
-   308.85761    1269.5707    3392.1511    4661.7218            0            0            0    3392.1511 
-   302.78788    1244.6208    3317.0849    4561.7057            0            0            0    3317.0849 
-   321.68092    1322.2813    3321.5755    4643.8568            0            0            0    3321.5755 
-Loop time of 16.3061 on 8 procs for 5000 steps with 1380 atoms
-
-Performance: 52.986 ns/day, 0.453 hours/ns, 306.634 timesteps/s
-69.6% CPU use with 8 MPI tasks x no OpenMP threads
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
----------------------------------------------------------------
-Pair    | 2.1872     | 10.542     | 14.607     | 116.7 | 64.65
-Bond    | 0.00044084 | 0.00069669 | 0.00095081 |   0.0 |  0.00
-Neigh   | 0.026948   | 0.15225    | 0.44344    |  42.0 |  0.93
-Comm    | 0.63452    | 4.2953     | 9.49       | 133.9 | 26.34
-Output  | 0.0016391  | 0.012378   | 0.050919   |  13.9 |  0.08
-Modify  | 0.45894    | 1.2107     | 4.4629     | 116.4 |  7.42
-Other   |            | 0.09292    |            |       |  0.57
-
-Nlocal:    172.5 ave 380 max 70 min
-Histogram: 5 0 0 0 0 0 0 1 1 1
-Nghost:    2213 ave 4440 max 903 min
-Histogram: 3 0 0 2 0 0 2 0 0 1
-Neighs:    10042.5 ave 24051 max 3500 min
-Histogram: 5 0 0 0 0 0 0 1 1 1
-
-Total # of neighbors = 80340
-Ave neighs/atom = 58.2174
-Ave special neighs/atom = 0
-Neighbor list builds = 123
-Dangerous builds = 1
-
-# Turn off recentering during production phase
-unfix recentering
-
-# Setup trajectory output
-dump            myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element
-dump_modify     myDump element B W
-dump_modify     myDump sort id
-
-# Production (for realistic results, run for 10000000 steps)
-reset_timestep  0
-run             1000
-Per MPI rank memory allocation (min/avg/max) = 8.232 | 8.492 | 8.851 Mbytes
-Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl 
-   321.68092    1322.2813    3784.0834    5106.3647            0            0            0    3784.0834 
-   310.59763    1276.7231    3318.3283    4595.0513            0            0            0    3318.3283 
-   303.39445    1247.1141    3324.1191    4571.2332            0            0            0    3324.1191 
-   311.37275    1279.9092    3305.0901    4584.9993            0            0            0    3305.0901 
-   311.29071     1279.572     3248.216     4527.788            0            0            0     3248.216 
-   314.53456     1292.906    3283.4563    4576.3623            0            0            0    3283.4563 
-   316.52595    1301.0916    3258.9171    4560.0087            0            0            0    3258.9171 
-   318.92447    1310.9509    3235.6256    4546.5765            0            0            0    3235.6256 
-   311.79212    1281.6331     3308.099    4589.7321            0            0            0     3308.099 
-   305.52477    1255.8709    3267.6907    4523.5616            0            0            0    3267.6907 
-   301.07457    1237.5782    3206.3997    4443.9779            0            0            0    3206.3997 
-Loop time of 4.44139 on 8 procs for 1000 steps with 1380 atoms
-
-Performance: 38.907 ns/day, 0.617 hours/ns, 225.155 timesteps/s
-60.8% CPU use with 8 MPI tasks x no OpenMP threads
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
----------------------------------------------------------------
-Pair    | 0.656      | 2.5078     | 3.5775     |  57.7 | 56.46
-Bond    | 0.00013375 | 0.0001854  | 0.0002377  |   0.0 |  0.00
-Neigh   | 0.0048757  | 0.029188   | 0.090432   |  18.9 |  0.66
-Comm    | 0.51836    | 1.4427     | 2.6285     |  56.9 | 32.48
-Output  | 0.083084   | 0.089199   | 0.10333    |   2.3 |  2.01
-Modify  | 0.0087376  | 0.019705   | 0.038437   |   8.4 |  0.44
-Other   |            | 0.3526     |            |       |  7.94
-
-Nlocal:    172.5 ave 388 max 69 min
-Histogram: 5 0 0 0 0 0 0 2 0 1
-Nghost:    2207.88 ave 4429 max 896 min
-Histogram: 3 0 0 2 0 0 2 0 0 1
-Neighs:    10094.1 ave 24847 max 3403 min
-Histogram: 5 0 0 0 0 0 1 1 0 1
-
-Total # of neighbors = 80753
-Ave neighs/atom = 58.5167
-Ave special neighs/atom = 0
-Neighbor list builds = 23
-Dangerous builds = 0
-
-
-Total wall time: 0:00:21
diff --git a/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.1 b/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.1
new file mode 100644
index 0000000000..034b60ea67
--- /dev/null
+++ b/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.1
@@ -0,0 +1,300 @@
+LAMMPS (27 Oct 2021)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
+  using 1 OpenMP thread(s) per MPI task
+# LAMMPS input file for 26.5% benzene mole fraction solution
+# with 380 benzene and 1000 water molecules,
+# using all possible local density potentials
+# between benzene and water
+#
+# Author: Tanmoy Sanyal, Shell Group, UC Santa Barbara
+#
+# Refer: Sanyal and Shell, JPC-B, 2018, 122 (21), 5678-5693
+
+
+
+# Initialize simulation box
+dimension       3
+boundary        p p p
+units           real
+atom_style      molecular
+
+# Set potential styles
+pair_style      hybrid/overlay table spline 500 local/density
+
+# Read molecule data and set initial velocities
+read_data       benzene_water.data
+Reading data file ...
+  orthogonal box = (-12.865000 -12.865000 -64.829000) to (12.865000 12.865000 64.829000)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  1380 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.006 seconds
+velocity        all create  3.0000e+02 16611 rot yes dist gaussian
+
+# Assign potentials
+pair_coeff          1     1    table          benzene_water.pair.table      PairBB
+WARNING: 33 of 500 force values in table PairBB are inconsistent with -dE/dr.
+WARNING:  Should only be flagged at inflection points (src/pair_table.cpp:465)
+WARNING: 150 of 500 distance values in table 1e-06 with relative error
+WARNING:  over PairBB to re-computed values (src/pair_table.cpp:473)
+pair_coeff          1     2    table          benzene_water.pair.table      PairWW
+WARNING: 61 of 500 force values in table PairWW are inconsistent with -dE/dr.
+WARNING:  Should only be flagged at inflection points (src/pair_table.cpp:465)
+WARNING: 90 of 500 distance values in table 1e-06 with relative error
+WARNING:  over PairWW to re-computed values (src/pair_table.cpp:473)
+pair_coeff          2     2    table          benzene_water.pair.table      PairBW
+WARNING: 108 of 500 force values in table PairBW are inconsistent with -dE/dr.
+WARNING:  Should only be flagged at inflection points (src/pair_table.cpp:465)
+WARNING: 135 of 500 distance values in table 1e-06 with relative error
+WARNING:  over PairBW to re-computed values (src/pair_table.cpp:473)
+pair_coeff          *     *    local/density  benzene_water.localdensity.table
+
+# Recentering during minimization and equilibration
+fix recentering all recenter 0.0 0.0 0.0 units box
+
+# Thermostat & time integration
+timestep        2.0
+thermo          100
+thermo_style    custom temp ke pe etotal ebond eangle edihed evdwl
+
+# Minimization
+minimize        1.e-4 0.0 10000 10000
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- pair_style  local/density  command:
+
+@Article{Sanyal16,
+ author =  {T.Sanyal and M.Scott Shell},
+ title =   {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation},
+ journal = {J.~Chem.~Phys.},
+ year =    2016,
+ DOI = doi.org/10.1063/1.4958629}
+
+@Article{Sanyal18,
+ author =  {T.Sanyal and M.Scott Shell},
+ title =   {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy},
+ journal = {J.~Phys.~Chem. B},
+ year =    2018,
+ DOI = doi.org/10.1021/acs.jpcb.7b12446}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187)
+  generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update every 1 steps, delay 0 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 15.25
+  ghost atom cutoff = 15.25
+  binsize = 7.625, bins = 4 4 18
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair table, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d
+      bin: standard
+  (2) pair local/density, perpetual, copy from (1)
+      attributes: half, newton on
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 8.754 | 8.754 | 8.754 Mbytes
+Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl 
+         300    1233.1611    2374.6749     3607.836            0            0            0    2374.6749 
+         300    1233.1611    985.54829    2218.7094            0            0            0    985.54829 
+         300    1233.1611    962.66036    2195.8215            0            0            0    962.66036 
+Loop time of 0.812343 on 1 procs for 134 steps with 1380 atoms
+
+99.8% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+Minimization stats:
+  Stopping criterion = energy tolerance
+  Energy initial, next-to-last, final = 
+      2374.67491482358   962.664796664787   962.660357218268
+  Force two-norm initial, final = 263.77519 15.741017
+  Force max component initial, final = 22.412654 7.9360139
+  Final line search alpha, max atom move = 0.014975513 0.11884588
+  Iterations, force evaluations = 134 240
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.78539    | 0.78539    | 0.78539    |   0.0 | 96.68
+Bond    | 2.0149e-05 | 2.0149e-05 | 2.0149e-05 |   0.0 |  0.00
+Neigh   | 0.016759   | 0.016759   | 0.016759   |   0.0 |  2.06
+Comm    | 0.0045     | 0.0045     | 0.0045     |   0.0 |  0.55
+Output  | 2.9402e-05 | 2.9402e-05 | 2.9402e-05 |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 0.005647   |            |       |  0.70
+
+Nlocal:        1380.00 ave        1380 max        1380 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:        5832.00 ave        5832 max        5832 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:        78165.0 ave       78165 max       78165 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 78165
+Ave neighs/atom = 56.641304
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 5
+Dangerous builds = 0
+
+# Set up integration parameters
+fix             timeintegration all nve
+fix             thermostat all langevin  3.0000e+02  3.0000e+02  1.0000e+02 81890
+
+# Equilibration (for realistic results, run for 5000000 steps)
+reset_timestep  0
+run             5000
+  generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133)
+Per MPI rank memory allocation (min/avg/max) = 7.629 | 7.629 | 7.629 Mbytes
+Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl 
+         300    1233.1611    962.66036    2195.8215            0            0            0    962.66036 
+    253.1913    1040.7522     1803.711    2844.4633            0            0            0     1803.711 
+   290.31049     1193.332    2059.0637    3252.3958            0            0            0    2059.0637 
+   299.30778    1230.3157     2140.226    3370.5417            0            0            0     2140.226 
+   309.81524     1273.507    2178.3782    3451.8853            0            0            0    2178.3782 
+   299.79526    1232.3195    2229.9248    3462.2444            0            0            0    2229.9248 
+   299.24909    1230.0745    2260.7129    3490.7874            0            0            0    2260.7129 
+    299.5898     1231.475    2244.2384    3475.7134            0            0            0    2244.2384 
+   297.81223    1224.1682      2320.27    3544.4382            0            0            0      2320.27 
+   301.53975    1239.4903    2277.0431    3516.5334            0            0            0    2277.0431 
+   292.00572    1200.3003    2292.3073    3492.6076            0            0            0    2292.3073 
+   309.19709    1270.9661    2303.6055    3574.5716            0            0            0    2303.6055 
+   297.54933    1223.0876     2304.127    3527.2146            0            0            0     2304.127 
+   303.48106    1247.4702    2303.5673    3551.0375            0            0            0    2303.5673 
+   296.46047    1218.6118    2256.1591    3474.7709            0            0            0    2256.1591 
+    299.4835     1231.038    2280.0452    3511.0832            0            0            0    2280.0452 
+   306.25958    1258.8914    2307.9795    3566.8709            0            0            0    2307.9795 
+   304.67335    1252.3711    2284.8252    3537.1963            0            0            0    2284.8252 
+   298.33637    1226.3227    2289.8499    3516.1726            0            0            0    2289.8499 
+    303.1338    1246.0427    2342.2148    3588.2575            0            0            0    2342.2148 
+   305.86051     1257.251    2341.0106    3598.2616            0            0            0    2341.0106 
+   297.75418    1223.9296    2303.5613    3527.4909            0            0            0    2303.5613 
+   296.79348    1219.9806    2327.5207    3547.5013            0            0            0    2327.5207 
+   307.25403    1262.9791    2288.4219     3551.401            0            0            0    2288.4219 
+   301.26976    1238.3805    2291.2465     3529.627            0            0            0    2291.2465 
+   297.17249    1221.5385    2283.3926    3504.9311            0            0            0    2283.3926 
+   313.99072    1290.6705    2293.9661    3584.6366            0            0            0    2293.9661 
+   301.70804    1240.1821    2331.1694    3571.3515            0            0            0    2331.1694 
+   300.62599    1235.7343    2325.4367     3561.171            0            0            0    2325.4367 
+   292.13495    1200.8316     2315.631    3516.4626            0            0            0     2315.631 
+    313.9981    1290.7008    2286.0536    3576.7545            0            0            0    2286.0536 
+   300.25311    1234.2015    2324.2379    3558.4394            0            0            0    2324.2379 
+    309.3746    1271.6958    2322.2298    3593.9256            0            0            0    2322.2298 
+   300.23041    1234.1082    2332.7521    3566.8603            0            0            0    2332.7521 
+   302.97054    1245.3716    2303.1689    3548.5405            0            0            0    2303.1689 
+   294.77155    1211.6694    2334.5087    3546.1781            0            0            0    2334.5087 
+   296.81476    1220.0681    2322.5932    3542.6613            0            0            0    2322.5932 
+   301.83238    1240.6932    2345.4841    3586.1773            0            0            0    2345.4841 
+    295.0399    1212.7724    2312.3889    3525.1614            0            0            0    2312.3889 
+   300.73565     1236.185    2338.8384    3575.0235            0            0            0    2338.8384 
+   303.02264    1245.5858    2310.0868    3555.6726            0            0            0    2310.0868 
+   302.86404    1244.9339    2332.2001     3577.134            0            0            0    2332.2001 
+   293.77916    1207.5901    2293.2799    3500.8701            0            0            0    2293.2799 
+   299.30072    1230.2867    2317.5065    3547.7933            0            0            0    2317.5065 
+   311.05029    1278.5837    2311.0476    3589.6313            0            0            0    2311.0476 
+   293.25646    1205.4416    2314.7398    3520.1814            0            0            0    2314.7398 
+   310.49018    1276.2814    2337.4909    3613.7723            0            0            0    2337.4909 
+   302.37336    1242.9169    2340.3197    3583.2366            0            0            0    2340.3197 
+   297.06862    1221.1116    2323.9136    3545.0252            0            0            0    2323.9136 
+   300.54817    1235.4144    2315.2405    3550.6549            0            0            0    2315.2405 
+   309.10643    1270.5934    2333.1848    3603.7783            0            0            0    2333.1848 
+Loop time of 15.2696 on 1 procs for 5000 steps with 1380 atoms
+
+Performance: 56.583 ns/day, 0.424 hours/ns, 327.447 timesteps/s
+99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 14.432     | 14.432     | 14.432     |   0.0 | 94.51
+Bond    | 0.00032375 | 0.00032375 | 0.00032375 |   0.0 |  0.00
+Neigh   | 0.41541    | 0.41541    | 0.41541    |   0.0 |  2.72
+Comm    | 0.0975     | 0.0975     | 0.0975     |   0.0 |  0.64
+Output  | 0.0013044  | 0.0013044  | 0.0013044  |   0.0 |  0.01
+Modify  | 0.30336    | 0.30336    | 0.30336    |   0.0 |  1.99
+Other   |            | 0.01973    |            |       |  0.13
+
+Nlocal:        1380.00 ave        1380 max        1380 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:        5843.00 ave        5843 max        5843 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:        76949.0 ave       76949 max       76949 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 76949
+Ave neighs/atom = 55.760145
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 121
+Dangerous builds = 1
+
+# Turn off recentering during production phase
+unfix recentering
+
+# Setup trajectory output
+dump            myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element
+dump_modify     myDump element B W
+dump_modify     myDump sort id
+
+# Production (for realistic results, run for 10000000 steps)
+reset_timestep  0
+run             1000
+  generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Per MPI rank memory allocation (min/avg/max) = 9.022 | 9.022 | 9.022 Mbytes
+Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl 
+   309.10643    1270.5934    2333.1848    3603.7783            0            0            0    2333.1848 
+   300.84572    1236.6375    2331.3493    3567.9868            0            0            0    2331.3493 
+   300.90599    1236.8852    2337.6775    3574.5627            0            0            0    2337.6775 
+   302.77895    1244.5841    2341.7778     3586.362            0            0            0    2341.7778 
+   291.66639    1198.9055    2320.3512    3519.2567            0            0            0    2320.3512 
+    298.7003    1227.8187    2292.8195    3520.6382            0            0            0    2292.8195 
+   301.11163    1237.7305     2310.017    3547.7475            0            0            0     2310.017 
+   305.22515    1254.6393    2315.1355    3569.7748            0            0            0    2315.1355 
+   295.15921    1213.2629     2310.184    3523.4468            0            0            0     2310.184 
+    299.2024    1229.8826    2332.2118    3562.0943            0            0            0    2332.2118 
+   302.80078    1244.6738    2320.3763    3565.0502            0            0            0    2320.3763 
+Loop time of 3.07208 on 1 procs for 1000 steps with 1380 atoms
+
+Performance: 56.249 ns/day, 0.427 hours/ns, 325.512 timesteps/s
+99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 2.8993     | 2.8993     | 2.8993     |   0.0 | 94.37
+Bond    | 6.5327e-05 | 6.5327e-05 | 6.5327e-05 |   0.0 |  0.00
+Neigh   | 0.083502   | 0.083502   | 0.083502   |   0.0 |  2.72
+Comm    | 0.019967   | 0.019967   | 0.019967   |   0.0 |  0.65
+Output  | 0.012268   | 0.012268   | 0.012268   |   0.0 |  0.40
+Modify  | 0.052801   | 0.052801   | 0.052801   |   0.0 |  1.72
+Other   |            | 0.004203   |            |       |  0.14
+
+Nlocal:        1380.00 ave        1380 max        1380 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:        5860.00 ave        5860 max        5860 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:        77055.0 ave       77055 max       77055 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 77055
+Ave neighs/atom = 55.836957
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 24
+Dangerous builds = 0
+
+
+Total wall time: 0:00:19
diff --git a/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.4 b/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.4
new file mode 100644
index 0000000000..f841181777
--- /dev/null
+++ b/examples/PACKAGES/local_density/benzene_water/log.27Oct21.benzene_water.g++.4
@@ -0,0 +1,299 @@
+LAMMPS (27 Oct 2021)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
+  using 1 OpenMP thread(s) per MPI task
+# LAMMPS input file for 26.5% benzene mole fraction solution
+# with 380 benzene and 1000 water molecules,
+# using all possible local density potentials
+# between benzene and water
+#
+# Author: Tanmoy Sanyal, Shell Group, UC Santa Barbara
+#
+# Refer: Sanyal and Shell, JPC-B, 2018, 122 (21), 5678-5693
+
+
+
+# Initialize simulation box
+dimension       3
+boundary        p p p
+units           real
+atom_style      molecular
+
+# Set potential styles
+pair_style      hybrid/overlay table spline 500 local/density
+
+# Read molecule data and set initial velocities
+read_data       benzene_water.data
+Reading data file ...
+  orthogonal box = (-12.865000 -12.865000 -64.829000) to (12.865000 12.865000 64.829000)
+  1 by 1 by 4 MPI processor grid
+  reading atoms ...
+  1380 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.007 seconds
+velocity        all create  3.0000e+02 16611 rot yes dist gaussian
+
+# Assign potentials
+pair_coeff          1     1    table          benzene_water.pair.table      PairBB
+WARNING: 33 of 500 force values in table PairBB are inconsistent with -dE/dr.
+WARNING:  Should only be flagged at inflection points (src/pair_table.cpp:465)
+WARNING: 150 of 500 distance values in table 1e-06 with relative error
+WARNING:  over PairBB to re-computed values (src/pair_table.cpp:473)
+pair_coeff          1     2    table          benzene_water.pair.table      PairWW
+WARNING: 61 of 500 force values in table PairWW are inconsistent with -dE/dr.
+WARNING:  Should only be flagged at inflection points (src/pair_table.cpp:465)
+WARNING: 90 of 500 distance values in table 1e-06 with relative error
+WARNING:  over PairWW to re-computed values (src/pair_table.cpp:473)
+pair_coeff          2     2    table          benzene_water.pair.table      PairBW
+WARNING: 108 of 500 force values in table PairBW are inconsistent with -dE/dr.
+WARNING:  Should only be flagged at inflection points (src/pair_table.cpp:465)
+WARNING: 135 of 500 distance values in table 1e-06 with relative error
+WARNING:  over PairBW to re-computed values (src/pair_table.cpp:473)
+pair_coeff          *     *    local/density  benzene_water.localdensity.table
+
+# Recentering during minimization and equilibration
+fix recentering all recenter 0.0 0.0 0.0 units box
+
+# Thermostat & time integration
+timestep        2.0
+thermo          100
+thermo_style    custom temp ke pe etotal ebond eangle edihed evdwl
+
+# Minimization
+minimize        1.e-4 0.0 10000 10000
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- pair_style  local/density  command:
+
+@Article{Sanyal16,
+ author =  {T.Sanyal and M.Scott Shell},
+ title =   {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation},
+ journal = {J.~Chem.~Phys.},
+ year =    2016,
+ DOI = doi.org/10.1063/1.4958629}
+
+@Article{Sanyal18,
+ author =  {T.Sanyal and M.Scott Shell},
+ title =   {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy},
+ journal = {J.~Phys.~Chem. B},
+ year =    2018,
+ DOI = doi.org/10.1021/acs.jpcb.7b12446}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187)
+  generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update every 1 steps, delay 0 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 15.25
+  ghost atom cutoff = 15.25
+  binsize = 7.625, bins = 4 4 18
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair table, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d
+      bin: standard
+  (2) pair local/density, perpetual, copy from (1)
+      attributes: half, newton on
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 8.441 | 8.589 | 8.688 Mbytes
+Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl 
+         300    1233.1611    2374.6749     3607.836            0            0            0    2374.6749 
+         300    1233.1611    1024.8113    2257.9724            0            0            0    1024.8113 
+Loop time of 0.240559 on 4 procs for 74 steps with 1380 atoms
+
+98.5% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+Minimization stats:
+  Stopping criterion = energy tolerance
+  Energy initial, next-to-last, final = 
+      2374.67491482358   1024.89407898645   1024.81130011575
+  Force two-norm initial, final = 263.77519 20.459697
+  Force max component initial, final = 22.412654 8.6082349
+  Final line search alpha, max atom move = 0.027790997 0.23923143
+  Iterations, force evaluations = 74 118
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.15928    | 0.1873     | 0.22814    |   6.5 | 77.86
+Bond    | 3.857e-06  | 4.4012e-06 | 5.496e-06  |   0.0 |  0.00
+Neigh   | 0.00064142 | 0.0028761  | 0.0058864  |   4.2 |  1.20
+Comm    | 0.0040776  | 0.039595   | 0.074187   |  12.6 | 16.46
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 0.01078    |            |       |  4.48
+
+Nlocal:        345.000 ave         664 max         147 min
+Histogram: 2 0 0 0 0 1 0 0 0 1
+Nghost:        2850.50 ave        4438 max        1208 min
+Histogram: 1 0 0 1 0 0 1 0 0 1
+Neighs:        19377.5 ave       37718 max        7456 min
+Histogram: 2 0 0 0 0 1 0 0 0 1
+
+Total # of neighbors = 77510
+Ave neighs/atom = 56.166667
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 3
+Dangerous builds = 0
+
+# Set up integration parameters
+fix             timeintegration all nve
+fix             thermostat all langevin  3.0000e+02  3.0000e+02  1.0000e+02 81890
+
+# Equilibration (for realistic results, run for 5000000 steps)
+reset_timestep  0
+run             5000
+  generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133)
+Per MPI rank memory allocation (min/avg/max) = 7.316 | 7.465 | 7.563 Mbytes
+Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl 
+         300    1233.1611    1024.8113    2257.9724            0            0            0    1024.8113 
+   263.61917    1083.6164     1866.745    2950.3614            0            0            0     1866.745 
+    296.0253     1216.823    2122.8463    3339.6692            0            0            0    2122.8463 
+   301.93846    1241.1292    2172.9802    3414.1095            0            0            0    2172.9802 
+    293.9491    1208.2887    2205.4892    3413.7779            0            0            0    2205.4892 
+   286.33795    1177.0027    2204.8908    3381.8935            0            0            0    2204.8908 
+   295.48217    1214.5904    2230.8849    3445.4753            0            0            0    2230.8849 
+   293.88908    1208.0419    2218.7563    3426.7982            0            0            0    2218.7563 
+   295.13798    1213.1756    2277.4515    3490.6271            0            0            0    2277.4515 
+   290.39538     1193.681    2273.4385    3467.1195            0            0            0    2273.4385 
+   297.56782    1223.1635    2268.7182    3491.8817            0            0            0    2268.7182 
+   306.45578    1259.6978    2289.1507    3548.8486            0            0            0    2289.1507 
+   308.54582     1268.289    2284.8514    3553.1404            0            0            0    2284.8514 
+   302.17353    1242.0955    2262.5577    3504.6532            0            0            0    2262.5577 
+   295.30087    1213.8452    2315.8853    3529.7305            0            0            0    2315.8853 
+   308.59197    1268.4787    2291.8314    3560.3101            0            0            0    2291.8314 
+   297.75618    1223.9378    2287.2003    3511.1381            0            0            0    2287.2003 
+   303.43395    1247.2765    2297.7158    3544.9923            0            0            0    2297.7158 
+   307.16233    1262.6021    2255.9769    3518.5791            0            0            0    2255.9769 
+   301.34428    1238.6868     2284.416    3523.1028            0            0            0     2284.416 
+   295.43209    1214.3846    2294.1043    3508.4889            0            0            0    2294.1043 
+   287.86904    1183.2963    2257.0204    3440.3168            0            0            0    2257.0204 
+    297.2661    1221.9233    2251.4194    3473.3428            0            0            0    2251.4194 
+   298.90221    1228.6486     2261.834    3490.4826            0            0            0     2261.834 
+   288.07202    1184.1307    2284.1918    3468.3225            0            0            0    2284.1918 
+   300.41201    1234.8547    2303.9573     3538.812            0            0            0    2303.9573 
+   283.91279     1167.034    2329.7936    3496.8277            0            0            0    2329.7936 
+   297.27507    1221.9602    2337.0516    3559.0118            0            0            0    2337.0516 
+   296.22263    1217.6341    2335.6424    3553.2765            0            0            0    2335.6424 
+   296.13784    1217.2856    2364.7034     3581.989            0            0            0    2364.7034 
+   308.17642    1266.7706    2320.2753    3587.0459            0            0            0    2320.2753 
+   310.26592    1275.3596    2301.9318    3577.2914            0            0            0    2301.9318 
+   292.97391    1204.2801    2289.8116    3494.0917            0            0            0    2289.8116 
+   294.81231    1211.8369    2315.0388    3526.8757            0            0            0    2315.0388 
+   298.66155    1227.6594    2317.2844    3544.9437            0            0            0    2317.2844 
+   302.77939    1244.5859    2301.2063    3545.7922            0            0            0    2301.2063 
+   291.47597    1198.1228    2285.1757    3483.2985            0            0            0    2285.1757 
+   286.19045    1176.3964    2265.2665    3441.6629            0            0            0    2265.2665 
+   295.58144    1214.9984    2272.3165     3487.315            0            0            0    2272.3165 
+   283.86988    1166.8577    2320.6142    3487.4719            0            0            0    2320.6142 
+    300.0576    1233.3979    2330.8962    3564.2941            0            0            0    2330.8962 
+   299.86413    1232.6026    2321.2281    3553.8308            0            0            0    2321.2281 
+   292.79017    1203.5248    2334.2308    3537.7557            0            0            0    2334.2308 
+    291.5027    1198.2327    2335.2119    3533.4446            0            0            0    2335.2119 
+   299.55471    1231.3307    2332.5216    3563.8524            0            0            0    2332.5216 
+   293.29613    1205.6046    2295.3263    3500.9309            0            0            0    2295.3263 
+   303.13151    1246.0333    2310.0548    3556.0881            0            0            0    2310.0548 
+   298.83954     1228.391    2297.3117    3525.7027            0            0            0    2297.3117 
+   297.44775      1222.67    2307.2483    3529.9183            0            0            0    2307.2483 
+   309.59874    1272.6171    2309.2439     3581.861            0            0            0    2309.2439 
+   307.47844    1263.9015     2274.998    3538.8995            0            0            0     2274.998 
+Loop time of 11.2235 on 4 procs for 5000 steps with 1380 atoms
+
+Performance: 76.982 ns/day, 0.312 hours/ns, 445.495 timesteps/s
+98.5% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 7.1444     | 8.5074     | 10.534     |  44.9 | 75.80
+Bond    | 0.00017048 | 0.00020672 | 0.00030488 |   0.0 |  0.00
+Neigh   | 0.026174   | 0.12108    | 0.26052    |  28.2 |  1.08
+Comm    | 0.21788    | 1.8597     | 3.3375     |  81.2 | 16.57
+Output  | 0.0008989  | 0.0069895  | 0.021647   |  10.2 |  0.06
+Modify  | 0.19418    | 0.7044     | 2.1378     |  98.6 |  6.28
+Other   |            | 0.02368    |            |       |  0.21
+
+Nlocal:        345.000 ave         678 max         148 min
+Histogram: 2 0 0 0 1 0 0 0 0 1
+Nghost:        2854.25 ave        4464 max        1181 min
+Histogram: 1 0 0 1 0 0 1 0 0 1
+Neighs:        19366.8 ave       38533 max        7481 min
+Histogram: 2 0 0 0 0 1 0 0 0 1
+
+Total # of neighbors = 77467
+Ave neighs/atom = 56.135507
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 121
+Dangerous builds = 1
+
+# Turn off recentering during production phase
+unfix recentering
+
+# Setup trajectory output
+dump            myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element
+dump_modify     myDump element B W
+dump_modify     myDump sort id
+
+# Production (for realistic results, run for 10000000 steps)
+reset_timestep  0
+run             1000
+  generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Per MPI rank memory allocation (min/avg/max) = 8.640 | 8.791 | 8.894 Mbytes
+Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl 
+   307.47844    1263.9015     2274.998    3538.8995            0            0            0     2274.998 
+   309.46142    1272.0526    2274.8499    3546.9026            0            0            0    2274.8499 
+   300.70977    1236.0787    2301.0588    3537.1374            0            0            0    2301.0588 
+   300.53659    1235.3668    2316.1008    3551.4675            0            0            0    2316.1008 
+   300.48582    1235.1581    2296.3009     3531.459            0            0            0    2296.3009 
+    299.2618    1230.1267    2325.7501    3555.8768            0            0            0    2325.7501 
+   303.00905    1245.5299    2321.8238    3567.3537            0            0            0    2321.8238 
+   300.07018    1233.4496    2339.2833    3572.7329            0            0            0    2339.2833 
+   304.20292    1250.4374    2353.1018    3603.5392            0            0            0    2353.1018 
+   304.19487    1250.4043    2334.5087     3584.913            0            0            0    2334.5087 
+   294.24283    1209.4961    2335.0535    3544.5496            0            0            0    2335.0535 
+Loop time of 2.90512 on 4 procs for 1000 steps with 1380 atoms
+
+Performance: 59.481 ns/day, 0.403 hours/ns, 344.220 timesteps/s
+98.4% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 1.8627     | 2.2082     | 2.7289     |  22.6 | 76.01
+Bond    | 4.042e-05  | 5.3677e-05 | 8.4044e-05 |   0.0 |  0.00
+Neigh   | 0.0066184  | 0.030172   | 0.064523   |  13.9 |  1.04
+Comm    | 0.05914    | 0.51145    | 0.86887    |  40.7 | 17.61
+Output  | 0.0057814  | 0.0073478  | 0.011158   |   2.6 |  0.25
+Modify  | 0.0085337  | 0.020869   | 0.042248   |   9.4 |  0.72
+Other   |            | 0.127      |            |       |  4.37
+
+Nlocal:        345.000 ave         682 max         147 min
+Histogram: 2 0 0 0 1 0 0 0 0 1
+Nghost:        2836.25 ave        4427 max        1175 min
+Histogram: 1 0 0 1 0 0 1 0 0 1
+Neighs:        19249.8 ave       38683 max        7433 min
+Histogram: 2 0 0 0 1 0 0 0 0 1
+
+Total # of neighbors = 76999
+Ave neighs/atom = 55.796377
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 23
+Dangerous builds = 0
+
+
+Total wall time: 0:00:14
diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.in b/examples/PACKAGES/local_density/methanol_implicit_water/in.methanol_implicit_water
similarity index 86%
rename from examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.in
rename to examples/PACKAGES/local_density/methanol_implicit_water/in.methanol_implicit_water
index ef92fbe655..76038b2337 100644
--- a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.in
+++ b/examples/PACKAGES/local_density/methanol_implicit_water/in.methanol_implicit_water
@@ -1,6 +1,6 @@
 # LAMMPS input file for 50.0% methanol mole fraction solution
 # with 2500 methanol molecules in implicit water.
-# 
+#
 #
 # Author: David Rosenberger, van der Vegt Group, TU Darmstadt
 #
@@ -9,7 +9,7 @@
 
 # Initialize simulation box
 dimension       3
-boundary        p p p 
+boundary        p p p
 units           real
 atom_style      molecular
 
@@ -17,7 +17,7 @@ atom_style      molecular
 pair_style      hybrid/overlay table spline 500 local/density
 
 # Read molecule data and set initial velocities
-read_data       methanol_implicit_water.data 
+read_data       methanol_implicit_water.data
 velocity        all create  3.0000e+02 12142 rot yes dist gaussian
 
 # Assign potentials
@@ -31,7 +31,7 @@ pair_coeff          *     * local/density methanol_implicit_water.localdensity.t
 fix recentering all recenter 0.0 0.0 0.0 units box
 
 #Thermostat & time integration
-timestep        1.0 
+timestep        1.0
 thermo          100
 thermo_style    custom etotal ke pe temp evdwl
 
@@ -52,15 +52,14 @@ run             2000
 
 #turn off recentering during production run
 unfix recentering
-
+reset_timestep  0
 
 #setup trajectory output
-dump            myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element
-dump_modify     myDump element M
-dump_modify     myDump sort id
+#dump            myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element
+#dump_modify     myDump element M
+#dump_modify     myDump sort id
 
 #run production (for realistic results, run for 10000000 steps)
-reset_timestep  0
 thermo          1000
 thermo_style    custom etotal ke pe temp  evdwl
 run             10000
diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/log.04Sep19.g++.1 b/examples/PACKAGES/local_density/methanol_implicit_water/log.04Sep19.g++.1
deleted file mode 100644
index 618e994946..0000000000
--- a/examples/PACKAGES/local_density/methanol_implicit_water/log.04Sep19.g++.1
+++ /dev/null
@@ -1,226 +0,0 @@
-LAMMPS (7 Aug 2019)
-# LAMMPS input file for 50.0% methanol mole fraction solution
-# with 2500 methanol molecules in implicit water.
-#
-#
-# Author: David Rosenberger, van der Vegt Group, TU Darmstadt
-#
-# Refer: Rosenberger, Sanyal, Shell, van der Vegt, J. Chem. Theory Comput. 15, 2881-2895 (2019)
-
-
-# Initialize simulation box
-dimension       3
-boundary        p p p
-units           real
-atom_style      molecular
-
-# Set potential styles
-pair_style      hybrid/overlay table spline 500 local/density
-
-# Read molecule data and set initial velocities
-read_data       methanol_implicit_water.data
-  orthogonal box = (-31.123 -31.123 -31.123) to (31.123 31.123 31.123)
-  2 by 2 by 2 MPI processor grid
-  reading atoms ...
-  2500 atoms
-  0 = max # of 1-2 neighbors
-  0 = max # of 1-3 neighbors
-  0 = max # of 1-4 neighbors
-  1 = max # of special neighbors
-  special bonds CPU = 0.00063014 secs
-  read_data CPU = 0.00599909 secs
-velocity        all create  3.0000e+02 12142 rot yes dist gaussian
-
-# Assign potentials
-pair_coeff          1     1 table         methanol_implicit_water.pair.table PairMM
-WARNING: 93 of 500 force values in table are inconsistent with -dE/dr.
-  Should only be flagged at inflection points (../pair_table.cpp:483)
-WARNING: 254 of 500 distance values in table with relative error
-  over 1e-06 to re-computed values (../pair_table.cpp:492)
-pair_coeff          *     * local/density methanol_implicit_water.localdensity.table
-
-
-
-
-#Recentering during minimization and equilibration
-fix recentering all recenter 0.0 0.0 0.0 units box
-
-#Thermostat & time integration
-timestep        1.0
-thermo          100
-thermo_style    custom etotal ke pe temp evdwl
-
-#minimization
-minimize        1.e-4 0.0 1000 1000
-WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (../min.cpp:168)
-Neighbor list info ...
-  update every 1 steps, delay 0 steps, check yes
-  max neighbors/atom: 2000, page size: 100000
-  master list distance cutoff = 17
-  ghost atom cutoff = 17
-  binsize = 8.5, bins = 8 8 8
-  2 neighbor lists, perpetual/occasional/extra = 2 0 0
-  (1) pair table, perpetual
-      attributes: half, newton on
-      pair build: half/bin/newton
-      stencil: half/bin/3d/newton
-      bin: standard
-  (2) pair local/density, perpetual, copy from (1)
-      attributes: half, newton on
-      pair build: copy
-      stencil: none
-      bin: none
-Per MPI rank memory allocation (min/avg/max) = 7.411 | 7.411 | 7.412 Mbytes
-TotEng KinEng PotEng Temp E_vdwl 
-   1470.3564    2234.7133   -764.35689          300   -764.35689 
-   46.496766    2234.7133   -2188.2165          300   -2188.2165 
-   7.9030246    2234.7133   -2226.8103          300   -2226.8103 
-Loop time of 0.463996 on 8 procs for 121 steps with 2500 atoms
-
-91.4% CPU use with 8 MPI tasks x no OpenMP threads
-
-Minimization stats:
-  Stopping criterion = linesearch alpha is zero
-  Energy initial, next-to-last, final = 
-        -764.356892369     -2227.85589084     -2226.81026984
-  Force two-norm initial, final = 134.911 3.83896
-  Force max component initial, final = 14.1117 1.07422
-  Final line search alpha, max atom move = 5.06747e-10 5.44356e-10
-  Iterations, force evaluations = 121 154
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
----------------------------------------------------------------
-Pair    | 0.41442    | 0.41976    | 0.42434    |   0.5 | 90.47
-Bond    | 1.1683e-05 | 2.0713e-05 | 3.5048e-05 |   0.0 |  0.00
-Neigh   | 0.0084722  | 0.0090862  | 0.010038   |   0.5 |  1.96
-Comm    | 0.022712   | 0.028157   | 0.034072   |   1.9 |  6.07
-Output  | 3.1948e-05 | 3.6925e-05 | 6.6996e-05 |   0.0 |  0.01
-Modify  | 0          | 0          | 0          |   0.0 |  0.00
-Other   |            | 0.006937   |            |       |  1.50
-
-Nlocal:    312.5 ave 333 max 299 min
-Histogram: 2 2 0 0 1 0 2 0 0 1
-Nghost:    2546 ave 2580 max 2517 min
-Histogram: 1 1 0 3 0 1 0 0 0 2
-Neighs:    33215.4 ave 37251 max 29183 min
-Histogram: 1 0 0 1 2 2 0 1 0 1
-
-Total # of neighbors = 265723
-Ave neighs/atom = 106.289
-Ave special neighs/atom = 0
-Neighbor list builds = 6
-Dangerous builds = 0
-
-#set up integration parameters
-fix             timeintegration all nve
-fix             thermostat all langevin  3.0000e+02  3.0000e+02  1.0000e+02 59915
-
-#Equilibration (for realistic results, run for 2000000  steps)
-reset_timestep  0
-thermo          200
-thermo_style    custom etotal ke pe temp evdwl
-
-#run equilibration
-run             2000
-WARNING: Fix recenter should come after all other integration fixes (../fix_recenter.cpp:131)
-Per MPI rank memory allocation (min/avg/max) = 6.286 | 6.286 | 6.287 Mbytes
-TotEng KinEng PotEng Temp E_vdwl 
-   177.26822    2234.7133   -2057.4451          300   -2057.4451 
-   736.24287    2151.2608   -1415.0179    288.79688   -1415.0179 
-   963.07617    2090.6433   -1127.5671    280.65926   -1127.5671 
-   1148.9049    2173.1327   -1024.2279    291.73309   -1024.2279 
-   1303.6409    2279.8586   -976.21767    306.06055   -976.21767 
-     1355.42    2281.0383   -925.61826    306.21892   -925.61826 
-   1394.5206    2276.2093   -881.68863    305.57064   -881.68863 
-   1346.9764    2215.2973   -868.32091     297.3935   -868.32091 
-   1381.3654    2248.8061   -867.44063    301.89189   -867.44063 
-   1315.8059    2189.3193   -873.51332    293.90606   -873.51332 
-   1314.4456    2209.7431   -895.29752    296.64787   -895.29752 
-Loop time of 6.38989 on 8 procs for 2000 steps with 2500 atoms
-
-Performance: 27.043 ns/day, 0.887 hours/ns, 312.994 timesteps/s
-80.5% CPU use with 8 MPI tasks x no OpenMP threads
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
----------------------------------------------------------------
-Pair    | 5.2693     | 5.3572     | 5.457      |   2.1 | 83.84
-Bond    | 0.00028825 | 0.00033835 | 0.00039148 |   0.0 |  0.01
-Neigh   | 0.0296     | 0.032337   | 0.035071   |   0.9 |  0.51
-Comm    | 0.64679    | 0.73397    | 0.80847    |   5.2 | 11.49
-Output  | 0.00033498 | 0.00051582 | 0.0015228  |   0.0 |  0.01
-Modify  | 0.16395    | 0.18919    | 0.21056    |   3.9 |  2.96
-Other   |            | 0.07636    |            |       |  1.19
-
-Nlocal:    312.5 ave 337 max 295 min
-Histogram: 2 2 0 1 0 0 0 1 1 1
-Nghost:    2551.62 ave 2582 max 2525 min
-Histogram: 2 1 0 0 1 1 1 0 1 1
-Neighs:    33241.8 ave 37659 max 29705 min
-Histogram: 2 0 0 2 2 0 0 0 1 1
-
-Total # of neighbors = 265934
-Ave neighs/atom = 106.374
-Ave special neighs/atom = 0
-Neighbor list builds = 21
-Dangerous builds = 0
-
-#turn off recentering during production run
-unfix recentering
-
-
-#setup trajectory output
-dump            myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element
-dump_modify     myDump element M
-dump_modify     myDump sort id
-
-#run production (for realistic results, run for 10000000 steps)
-reset_timestep  0
-thermo          1000
-thermo_style    custom etotal ke pe temp  evdwl
-run             10000
-Per MPI rank memory allocation (min/avg/max) = 7.588 | 7.589 | 7.589 Mbytes
-TotEng KinEng PotEng Temp E_vdwl 
-   1442.5428    2209.7431   -767.20027    296.64787   -767.20027 
-   1391.8624    2262.6889   -870.82656     303.7556   -870.82656 
-    1375.914    2244.6176    -868.7036     301.3296    -868.7036 
-   1345.9064    2227.2324   -881.32599    298.99573   -881.32599 
-   1379.2334    2278.1156   -898.88222    305.82657   -898.88222 
-   1389.7928    2255.8062   -866.01341    302.83163   -866.01341 
-   1380.4549    2258.2108   -877.75582    303.15443   -877.75582 
-   1380.8489    2256.9432   -876.09428    302.98426   -876.09428 
-   1326.5151    2225.7408   -899.22577    298.79549   -899.22577 
-   1376.6025    2253.0128   -876.41028    302.45662   -876.41028 
-   1331.0008    2218.1033   -887.10258    297.77019   -887.10258 
-Loop time of 25.4591 on 8 procs for 10000 steps with 2500 atoms
-
-Performance: 33.937 ns/day, 0.707 hours/ns, 392.787 timesteps/s
-89.3% CPU use with 8 MPI tasks x no OpenMP threads
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
----------------------------------------------------------------
-Pair    | 21.635     | 21.916     | 22.237     |   3.9 | 86.08
-Bond    | 0.0011308  | 0.0013149  | 0.0016932  |   0.5 |  0.01
-Neigh   | 0.14593    | 0.15675    | 0.16667    |   1.9 |  0.62
-Comm    | 1.3789     | 1.7502     | 1.9558     |  13.7 |  6.87
-Output  | 0.34664    | 0.82927    | 1.2013     |  32.8 |  3.26
-Modify  | 0.24904    | 0.25842    | 0.26907    |   1.2 |  1.02
-Other   |            | 0.5475     |            |       |  2.15
-
-Nlocal:    312.5 ave 327 max 298 min
-Histogram: 2 0 0 1 1 0 1 1 1 1
-Nghost:    2575 ave 2601 max 2559 min
-Histogram: 2 0 3 1 0 0 0 0 1 1
-Neighs:    33223.2 ave 35920 max 30303 min
-Histogram: 1 1 1 1 0 1 0 0 0 3
-
-Total # of neighbors = 265786
-Ave neighs/atom = 106.314
-Ave special neighs/atom = 0
-Neighbor list builds = 103
-Dangerous builds = 0
-
-
-Total wall time: 0:00:32
diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.1 b/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.1
new file mode 100644
index 0000000000..3048264818
--- /dev/null
+++ b/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.1
@@ -0,0 +1,259 @@
+LAMMPS (27 Oct 2021)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
+  using 1 OpenMP thread(s) per MPI task
+# LAMMPS input file for 50.0% methanol mole fraction solution
+# with 2500 methanol molecules in implicit water.
+#
+#
+# Author: David Rosenberger, van der Vegt Group, TU Darmstadt
+#
+# Refer: Rosenberger, Sanyal, Shell, van der Vegt, J. Chem. Theory Comput. 15, 2881-2895 (2019)
+
+
+# Initialize simulation box
+dimension       3
+boundary        p p p
+units           real
+atom_style      molecular
+
+# Set potential styles
+pair_style      hybrid/overlay table spline 500 local/density
+
+# Read molecule data and set initial velocities
+read_data       methanol_implicit_water.data
+Reading data file ...
+  orthogonal box = (-31.123000 -31.123000 -31.123000) to (31.123000 31.123000 31.123000)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  2500 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.001 seconds
+  read_data CPU = 0.016 seconds
+velocity        all create  3.0000e+02 12142 rot yes dist gaussian
+
+# Assign potentials
+pair_coeff          1     1 table         methanol_implicit_water.pair.table PairMM
+WARNING: 93 of 500 force values in table PairMM are inconsistent with -dE/dr.
+WARNING:  Should only be flagged at inflection points (src/pair_table.cpp:465)
+WARNING: 254 of 500 distance values in table 1e-06 with relative error
+WARNING:  over PairMM to re-computed values (src/pair_table.cpp:473)
+pair_coeff          *     * local/density methanol_implicit_water.localdensity.table
+
+
+
+
+#Recentering during minimization and equilibration
+fix recentering all recenter 0.0 0.0 0.0 units box
+
+#Thermostat & time integration
+timestep        1.0
+thermo          100
+thermo_style    custom etotal ke pe temp evdwl
+
+#minimization
+minimize        1.e-4 0.0 1000 1000
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- pair_style  local/density  command:
+
+@Article{Sanyal16,
+ author =  {T.Sanyal and M.Scott Shell},
+ title =   {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation},
+ journal = {J.~Chem.~Phys.},
+ year =    2016,
+ DOI = doi.org/10.1063/1.4958629}
+
+@Article{Sanyal18,
+ author =  {T.Sanyal and M.Scott Shell},
+ title =   {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy},
+ journal = {J.~Phys.~Chem. B},
+ year =    2018,
+ DOI = doi.org/10.1021/acs.jpcb.7b12446}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187)
+  generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update every 1 steps, delay 0 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 17
+  ghost atom cutoff = 17
+  binsize = 8.5, bins = 8 8 8
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair table, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d
+      bin: standard
+  (2) pair local/density, perpetual, copy from (1)
+      attributes: half, newton on
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 9.535 | 9.535 | 9.535 Mbytes
+TotEng KinEng PotEng Temp E_vdwl 
+   1283.8556    2234.7133   -950.85771          300   -950.85771 
+  -10.187232    2234.7133   -2244.9005          300   -2244.9005 
+  -124.79406    2234.7133   -2359.5074          300   -2359.5074 
+   -126.7619    2234.7133   -2361.4752          300   -2361.4752 
+Loop time of 3.74581 on 1 procs for 205 steps with 2500 atoms
+
+99.5% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+Minimization stats:
+  Stopping criterion = energy tolerance
+  Energy initial, next-to-last, final = 
+     -950.857712502514  -2361.24417962983  -2361.47519428972
+  Force two-norm initial, final = 135.25170 2.8038329
+  Force max component initial, final = 14.083102 1.1154133
+  Final line search alpha, max atom move = 0.16981022 0.18940857
+  Iterations, force evaluations = 205 223
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 3.5678     | 3.5678     | 3.5678     |   0.0 | 95.25
+Bond    | 7.5831e-05 | 7.5831e-05 | 7.5831e-05 |   0.0 |  0.00
+Neigh   | 0.12962    | 0.12962    | 0.12962    |   0.0 |  3.46
+Comm    | 0.019204   | 0.019204   | 0.019204   |   0.0 |  0.51
+Output  | 0.00023948 | 0.00023948 | 0.00023948 |   0.0 |  0.01
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 0.02886    |            |       |  0.77
+
+Nlocal:        2500.00 ave        2500 max        2500 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:        6729.00 ave        6729 max        6729 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:        265637.0 ave      265637 max      265637 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 265637
+Ave neighs/atom = 106.25480
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 11
+Dangerous builds = 0
+
+#set up integration parameters
+fix             timeintegration all nve
+fix             thermostat all langevin  3.0000e+02  3.0000e+02  1.0000e+02 59915
+
+#Equilibration (for realistic results, run for 2000000  steps)
+reset_timestep  0
+thermo          200
+thermo_style    custom etotal ke pe temp evdwl
+
+#run equilibration
+run             2000
+  generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
+WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133)
+Per MPI rank memory allocation (min/avg/max) = 8.410 | 8.410 | 8.410 Mbytes
+TotEng KinEng PotEng Temp E_vdwl 
+   -126.7619    2234.7133   -2361.4752          300   -2361.4752 
+   517.05047    2015.8636   -1498.8131    270.62043   -1498.8131 
+   931.78263    2135.4332   -1203.6506     286.6721   -1203.6506 
+   1162.6209    2242.1662   -1079.5453    301.00051   -1079.5453 
+   1164.2129    2211.6204   -1047.4075    296.89989   -1047.4075 
+   1258.0085    2286.5942   -1028.5857    306.96477   -1028.5857 
+   1231.1937     2200.814   -969.62032    295.44917   -969.62032 
+   1251.2144    2245.0533   -993.83885     301.3881   -993.83885 
+   1237.2495    2239.8802   -1002.6307    300.69363   -1002.6307 
+   1232.3342    2224.3415   -992.00722    298.60763   -992.00722 
+   1235.3228     2197.191   -961.86817     294.9628   -961.86817 
+Loop time of 23.6478 on 1 procs for 2000 steps with 2500 atoms
+
+Performance: 7.307 ns/day, 3.284 hours/ns, 84.575 timesteps/s
+99.5% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 22.797     | 22.797     | 22.797     |   0.0 | 96.40
+Bond    | 0.00070412 | 0.00070412 | 0.00070412 |   0.0 |  0.00
+Neigh   | 0.2249     | 0.2249     | 0.2249     |   0.0 |  0.95
+Comm    | 0.12259    | 0.12259    | 0.12259    |   0.0 |  0.52
+Output  | 0.00088925 | 0.00088925 | 0.00088925 |   0.0 |  0.00
+Modify  | 0.46447    | 0.46447    | 0.46447    |   0.0 |  1.96
+Other   |            | 0.03711    |            |       |  0.16
+
+Nlocal:        2500.00 ave        2500 max        2500 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:        6752.00 ave        6752 max        6752 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:        265940.0 ave      265940 max      265940 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 265940
+Ave neighs/atom = 106.37600
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 20
+Dangerous builds = 0
+
+#turn off recentering during production run
+unfix recentering
+
+
+#setup trajectory output
+dump            myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element
+dump_modify     myDump element M
+dump_modify     myDump sort id
+
+#run production (for realistic results, run for 10000000 steps)
+reset_timestep  0
+thermo          1000
+thermo_style    custom etotal ke pe temp  evdwl
+run             10000
+  generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
+Per MPI rank memory allocation (min/avg/max) = 9.918 | 9.918 | 9.918 Mbytes
+TotEng KinEng PotEng Temp E_vdwl 
+   1235.3228     2197.191   -961.86817     294.9628   -961.86817 
+   1289.8463    2236.1425   -946.29622    300.19186   -946.29622 
+   1348.0825    2305.0295   -956.94703    309.43963   -956.94703 
+   1279.5478    2241.1582   -961.61041    300.86521   -961.61041 
+   1231.8597    2201.9591   -970.09949    295.60291   -970.09949 
+   1277.3424    2221.3696   -944.02725    298.20867   -944.02725 
+   1296.0116    2222.0998   -926.08818     298.3067   -926.08818 
+   1266.2849    2206.3727   -940.08782     296.1954   -940.08782 
+   1313.2808    2260.5077   -947.22683    303.46278   -947.22683 
+   1309.3076    2234.3895   -925.08198    299.95654   -925.08198 
+   1275.9792    2221.3037   -945.32449    298.19982   -945.32449 
+Loop time of 67.3224 on 1 procs for 10000 steps with 2500 atoms
+
+Performance: 12.834 ns/day, 1.870 hours/ns, 148.539 timesteps/s
+99.4% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 64.476     | 64.476     | 64.476     |   0.0 | 95.77
+Bond    | 0.0014504  | 0.0014504  | 0.0014504  |   0.0 |  0.00
+Neigh   | 0.71333    | 0.71333    | 0.71333    |   0.0 |  1.06
+Comm    | 0.32846    | 0.32846    | 0.32846    |   0.0 |  0.49
+Output  | 0.46997    | 0.46997    | 0.46997    |   0.0 |  0.70
+Modify  | 1.2336     | 1.2336     | 1.2336     |   0.0 |  1.83
+Other   |            | 0.09996    |            |       |  0.15
+
+Nlocal:        2500.00 ave        2500 max        2500 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:        6662.00 ave        6662 max        6662 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:        265774.0 ave      265774 max      265774 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 265774
+Ave neighs/atom = 106.30960
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 104
+Dangerous builds = 0
+
+
+Total wall time: 0:01:34
diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.4 b/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.4
new file mode 100644
index 0000000000..9467e7f9bf
--- /dev/null
+++ b/examples/PACKAGES/local_density/methanol_implicit_water/log.27Oct21.methanol_implicit_water.g++.4
@@ -0,0 +1,259 @@
+LAMMPS (27 Oct 2021)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
+  using 1 OpenMP thread(s) per MPI task
+# LAMMPS input file for 50.0% methanol mole fraction solution
+# with 2500 methanol molecules in implicit water.
+#
+#
+# Author: David Rosenberger, van der Vegt Group, TU Darmstadt
+#
+# Refer: Rosenberger, Sanyal, Shell, van der Vegt, J. Chem. Theory Comput. 15, 2881-2895 (2019)
+
+
+# Initialize simulation box
+dimension       3
+boundary        p p p
+units           real
+atom_style      molecular
+
+# Set potential styles
+pair_style      hybrid/overlay table spline 500 local/density
+
+# Read molecule data and set initial velocities
+read_data       methanol_implicit_water.data
+Reading data file ...
+  orthogonal box = (-31.123000 -31.123000 -31.123000) to (31.123000 31.123000 31.123000)
+  1 by 2 by 2 MPI processor grid
+  reading atoms ...
+  2500 atoms
+Finding 1-2 1-3 1-4 neighbors ...
+  special bond factors lj:    0        0        0       
+  special bond factors coul:  0        0        0       
+     0 = max # of 1-2 neighbors
+     0 = max # of 1-3 neighbors
+     0 = max # of 1-4 neighbors
+     1 = max # of special neighbors
+  special bonds CPU = 0.000 seconds
+  read_data CPU = 0.005 seconds
+velocity        all create  3.0000e+02 12142 rot yes dist gaussian
+
+# Assign potentials
+pair_coeff          1     1 table         methanol_implicit_water.pair.table PairMM
+WARNING: 93 of 500 force values in table PairMM are inconsistent with -dE/dr.
+WARNING:  Should only be flagged at inflection points (src/pair_table.cpp:465)
+WARNING: 254 of 500 distance values in table 1e-06 with relative error
+WARNING:  over PairMM to re-computed values (src/pair_table.cpp:473)
+pair_coeff          *     * local/density methanol_implicit_water.localdensity.table
+
+
+
+
+#Recentering during minimization and equilibration
+fix recentering all recenter 0.0 0.0 0.0 units box
+
+#Thermostat & time integration
+timestep        1.0
+thermo          100
+thermo_style    custom etotal ke pe temp evdwl
+
+#minimization
+minimize        1.e-4 0.0 1000 1000
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- pair_style  local/density  command:
+
+@Article{Sanyal16,
+ author =  {T.Sanyal and M.Scott Shell},
+ title =   {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation},
+ journal = {J.~Chem.~Phys.},
+ year =    2016,
+ DOI = doi.org/10.1063/1.4958629}
+
+@Article{Sanyal18,
+ author =  {T.Sanyal and M.Scott Shell},
+ title =   {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy},
+ journal = {J.~Phys.~Chem. B},
+ year =    2018,
+ DOI = doi.org/10.1021/acs.jpcb.7b12446}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187)
+  generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update every 1 steps, delay 0 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 17
+  ghost atom cutoff = 17
+  binsize = 8.5, bins = 8 8 8
+  2 neighbor lists, perpetual/occasional/extra = 2 0 0
+  (1) pair table, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d
+      bin: standard
+  (2) pair local/density, perpetual, copy from (1)
+      attributes: half, newton on
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 7.855 | 7.855 | 7.855 Mbytes
+TotEng KinEng PotEng Temp E_vdwl 
+   1283.8556    2234.7133   -950.85771          300   -950.85771 
+  -10.187232    2234.7133   -2244.9005          300   -2244.9005 
+   -124.3661    2234.7133   -2359.0794          300   -2359.0794 
+   -146.7158    2234.7133   -2381.4291          300   -2381.4291 
+Loop time of 0.528503 on 4 procs for 244 steps with 2500 atoms
+
+99.7% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+Minimization stats:
+  Stopping criterion = energy tolerance
+  Energy initial, next-to-last, final = 
+     -950.857712502527   -2381.2294195605  -2381.42909821383
+  Force two-norm initial, final = 135.25170 2.3117934
+  Force max component initial, final = 14.083102 0.60833889
+  Final line search alpha, max atom move = 0.18347073 0.11161238
+  Iterations, force evaluations = 244 278
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.48518    | 0.48843    | 0.49223    |   0.4 | 92.42
+Bond    | 1.0084e-05 | 1.0861e-05 | 1.1483e-05 |   0.0 |  0.00
+Neigh   | 0.018199   | 0.019153   | 0.020036   |   0.5 |  3.62
+Comm    | 0.010229   | 0.014832   | 0.018994   |   2.6 |  2.81
+Output  | 3.7985e-05 | 4.2069e-05 | 5.3874e-05 |   0.0 |  0.01
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 0.006032   |            |       |  1.14
+
+Nlocal:        625.000 ave         638 max         618 min
+Histogram: 2 0 0 0 1 0 0 0 0 1
+Nghost:        3613.75 ave        3640 max        3580 min
+Histogram: 1 0 0 0 1 0 0 0 1 1
+Neighs:        66411.2 ave       70713 max       62416 min
+Histogram: 1 0 1 0 0 0 1 0 0 1
+
+Total # of neighbors = 265645
+Ave neighs/atom = 106.25800
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 13
+Dangerous builds = 0
+
+#set up integration parameters
+fix             timeintegration all nve
+fix             thermostat all langevin  3.0000e+02  3.0000e+02  1.0000e+02 59915
+
+#Equilibration (for realistic results, run for 2000000  steps)
+reset_timestep  0
+thermo          200
+thermo_style    custom etotal ke pe temp evdwl
+
+#run equilibration
+run             2000
+  generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
+WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133)
+Per MPI rank memory allocation (min/avg/max) = 6.730 | 6.730 | 6.731 Mbytes
+TotEng KinEng PotEng Temp E_vdwl 
+   -146.7158    2234.7133   -2381.4291          300   -2381.4291 
+   540.68168      2041.44   -1500.7584    274.05395   -1500.7584 
+    945.4949    2163.7509    -1218.256    290.47363    -1218.256 
+   1118.7729    2195.7579    -1076.985    294.77042    -1076.985 
+   1215.0058    2233.2445   -1018.2387    299.80282   -1018.2387 
+   1251.8045    2240.8439   -989.03944      300.823   -989.03944 
+    1206.649    2149.5807   -942.93169    288.57134   -942.93169 
+   1290.6111    2248.3623   -957.75117    301.83231   -957.75117 
+   1312.8944     2219.147   -906.25264     297.9103   -906.25264 
+    1260.002    2211.4176   -951.41561    296.87266   -951.41561 
+   1335.0956    2270.1367   -935.04108    304.75543   -935.04108 
+Loop time of 3.56721 on 4 procs for 2000 steps with 2500 atoms
+
+Performance: 48.441 ns/day, 0.495 hours/ns, 560.663 timesteps/s
+99.8% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 3.3122     | 3.3399     | 3.3633     |   1.0 | 93.63
+Bond    | 7.5941e-05 | 8.062e-05  | 8.7627e-05 |   0.0 |  0.00
+Neigh   | 0.03524    | 0.036666   | 0.037864   |   0.6 |  1.03
+Comm    | 0.080116   | 0.10444    | 0.13373    |   6.1 |  2.93
+Output  | 0.00019977 | 0.00022502 | 0.00029007 |   0.0 |  0.01
+Modify  | 0.077781   | 0.078206   | 0.078752   |   0.1 |  2.19
+Other   |            | 0.007641   |            |       |  0.21
+
+Nlocal:        625.000 ave         637 max         616 min
+Histogram: 1 0 1 0 1 0 0 0 0 1
+Nghost:        3597.25 ave        3610 max        3586 min
+Histogram: 1 0 1 0 0 0 1 0 0 1
+Neighs:        66468.2 ave       69230 max       62721 min
+Histogram: 1 0 0 1 0 0 0 0 0 2
+
+Total # of neighbors = 265873
+Ave neighs/atom = 106.34920
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 20
+Dangerous builds = 0
+
+#turn off recentering during production run
+unfix recentering
+
+
+#setup trajectory output
+dump            myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element
+dump_modify     myDump element M
+dump_modify     myDump sort id
+
+#run production (for realistic results, run for 10000000 steps)
+reset_timestep  0
+thermo          1000
+thermo_style    custom etotal ke pe temp  evdwl
+run             10000
+  generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
+Per MPI rank memory allocation (min/avg/max) = 8.071 | 8.071 | 8.071 Mbytes
+TotEng KinEng PotEng Temp E_vdwl 
+   1335.0956    2270.1367   -935.04108    304.75543   -935.04108 
+   1266.2305    2227.2123   -960.98186    298.99303   -960.98186 
+   1304.2289    2238.1343   -933.90544    300.45925   -933.90544 
+   1311.3201    2232.0862    -920.7661    299.64733    -920.7661 
+   1289.9028    2241.3533   -951.45049    300.89139   -951.45049 
+   1314.2234    2244.8514   -930.62797      301.361   -930.62797 
+   1282.2744    2240.6716   -958.39719    300.79987   -958.39719 
+    1239.302    2181.5711    -942.2691    292.86591    -942.2691 
+   1327.0954    2242.6441   -915.54875    301.06468   -915.54875 
+   1334.9799    2239.6841   -904.70423    300.66731   -904.70423 
+   1320.6105    2263.4912   -942.88066     303.8633   -942.88066 
+Loop time of 23.3399 on 4 procs for 10000 steps with 2500 atoms
+
+Performance: 37.018 ns/day, 0.648 hours/ns, 428.451 timesteps/s
+99.5% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 21.343     | 21.606     | 21.766     |   3.7 | 92.57
+Bond    | 0.00045963 | 0.0004817  | 0.0005083  |   0.0 |  0.00
+Neigh   | 0.20708    | 0.22081    | 0.22733    |   1.7 |  0.95
+Comm    | 0.63014    | 0.80326    | 1.0801     |  19.8 |  3.44
+Output  | 0.11791    | 0.14443    | 0.22211    |  11.8 |  0.62
+Modify  | 0.37291    | 0.389      | 0.41719    |   2.7 |  1.67
+Other   |            | 0.1761     |            |       |  0.75
+
+Nlocal:        625.000 ave         636 max         613 min
+Histogram: 1 0 0 0 0 2 0 0 0 1
+Nghost:        3597.00 ave        3613 max        3580 min
+Histogram: 1 0 0 1 0 0 0 1 0 1
+Neighs:        66408.5 ave       69186 max       61728 min
+Histogram: 1 0 0 0 0 0 1 0 1 1
+
+Total # of neighbors = 265634
+Ave neighs/atom = 106.25360
+Ave special neighs/atom = 0.0000000
+Neighbor list builds = 102
+Dangerous builds = 0
+
+
+Total wall time: 0:00:27
diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.localdensity.table b/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.localdensity.table
index b9b4a082bc..af2d4304f7 100644
--- a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.localdensity.table
+++ b/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.localdensity.table
@@ -1,4 +1,4 @@
-#LOCAL DENSITY POTENTIALS
+#LOCAL DENSITY POTENTIALS  UNITS: real
 
 1 500
 
diff --git a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.pair.table b/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.pair.table
index b74fe398e8..6ec4a0a762 100644
--- a/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.pair.table
+++ b/examples/PACKAGES/local_density/methanol_implicit_water/methanol_implicit_water.pair.table
@@ -1,4 +1,4 @@
-
+# UNITS: real
 PairMM
 N 500 R  2.00000e-02  1.50000e+01
 
diff --git a/examples/plugins/CMakeLists.txt b/examples/plugins/CMakeLists.txt
index 0ca2c025e2..8bef055ad3 100644
--- a/examples/plugins/CMakeLists.txt
+++ b/examples/plugins/CMakeLists.txt
@@ -14,26 +14,29 @@ endif()
 
 project(plugins VERSION 1.0 LANGUAGES CXX)
 
-# ugly hacks for MSVC which by default always reports an old C++ standard in the __cplusplus macro
-# and prints lots of pointless warnings about "unsafe" functions
-if(MSVC)
-  add_compile_options(/Zc:__cplusplus)
-  add_compile_options(/wd4244)
-  add_compile_options(/wd4267)
-  add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
-endif()
-
-# NOTE: the next line should be commented out when used outside of the LAMMPS package
-get_filename_component(LAMMPS_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../src ABSOLUTE)
-set(LAMMPS_HEADER_DIR ${LAMMPS_SOURCE_DIR} CACHE PATH "Location of LAMMPS headers")
-if(NOT LAMMPS_HEADER_DIR)
-  message(FATAL_ERROR "Must set LAMMPS_HEADER_DIR")
-endif()
-
-# by default, install into $HOME/.local (not /usr/local),
-# so that no root access (and sudo) is needed
-if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
-  set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.local" CACHE PATH "Default install path" FORCE)
+# when this file is included as subdirectory in the LAMMPS build, many settings are directly imported
+if(LAMMPS_DIR)
+  set(LAMMPS_HEADER_DIR ${LAMMPS_SOURCE_DIR})
+else()
+  # NOTE: the next line should be commented out when used outside of the LAMMPS package
+  get_filename_component(LAMMPS_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../src ABSOLUTE)
+  set(LAMMPS_HEADER_DIR ${LAMMPS_SOURCE_DIR} CACHE PATH "Location of LAMMPS headers")
+  if(NOT LAMMPS_HEADER_DIR)
+    message(FATAL_ERROR "Must set LAMMPS_HEADER_DIR")
+  endif()
+  # by default, install into $HOME/.local (not /usr/local),
+  # so that no root access (and sudo) is needed
+  if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+    set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.local" CACHE PATH "Default install path" FORCE)
+  endif()
+  # ugly hacks for MSVC which by default always reports an old C++ standard in the __cplusplus macro
+  # and prints lots of pointless warnings about "unsafe" functions
+  if(MSVC)
+    add_compile_options(/Zc:__cplusplus)
+    add_compile_options(/wd4244)
+    add_compile_options(/wd4267)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+  endif()
 endif()
 
 # C++11 is required
@@ -45,9 +48,11 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")
 endif()
 
-set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR})
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR})
 include(CheckIncludeFileCXX)
-include(LAMMPSInterfaceCXX)
+if(NOT LAMMPS_DIR)
+  include(LAMMPSInterfaceCXX)
+endif()
 
 ##########################
 # building the plugins
@@ -66,7 +71,7 @@ add_library(zero2plugin MODULE zero2plugin.cpp pair_zero2.cpp bond_zero2.cpp
                                angle_zero2.cpp dihedral_zero2.cpp improper_zero2.cpp)
 target_link_libraries(zero2plugin PRIVATE lammps)
 
-set_target_properties(morse2plugin nve2plugin helloplugin zero2plugin PROPERTIES PREFIX "")
+set_target_properties(morse2plugin nve2plugin helloplugin zero2plugin PROPERTIES PREFIX "" SUFFIX ".so")
 
 # MacOS seems to need this
 if(CMAKE_SYSTEM_NAME STREQUAL Darwin)
@@ -84,3 +89,6 @@ else()
   set_target_properties(morse2plugin nve2plugin helloplugin zero2plugin PROPERTIES
     LINK_FLAGS "-rdynamic")
 endif()
+
+add_custom_target(plugins ALL ${CMAKE_COMMAND} -E echo "Building Plugins"
+  DEPENDS morse2plugin nve2plugin helloplugin zero2plugin morse2plugin)
diff --git a/examples/threebody/in.mos2.sw.mod b/examples/threebody/in.mos2.sw.mod
new file mode 100644
index 0000000000..d5ddd356e0
--- /dev/null
+++ b/examples/threebody/in.mos2.sw.mod
@@ -0,0 +1,30 @@
+# monolayer MoS2
+units      	metal
+boundary   	p p f
+processors	* * 1
+
+atom_style 	atomic
+read_data       single_layer_MoS2.data
+
+mass            * 32.065	# mass of sulphur atom , uint: a.u.=1.66X10^(-27)kg
+mass            1 95.94		# mass of molebdenum atom , uint: a.u.=1.66X10^(-27)kg
+
+########################## Define potentials ################################
+pair_style      sw/mod maxdelcs 0.25 0.35
+pair_coeff      * * tmd.sw.mod Mo S S
+#########################################################################
+
+### Simulation settings ####
+timestep        0.001
+velocity        all create 300.0 12345
+
+############################
+
+# Output
+thermo          500
+thermo_style    custom step etotal pe ke temp
+thermo_modify   lost warn
+
+###### Run molecular dynamics ######
+fix             thermostat all nve
+run             5000
diff --git a/examples/threebody/log.27Oct21.mos2_sw_mod.g++.1 b/examples/threebody/log.27Oct21.mos2_sw_mod.g++.1
new file mode 100644
index 0000000000..4dda8e9d1c
--- /dev/null
+++ b/examples/threebody/log.27Oct21.mos2_sw_mod.g++.1
@@ -0,0 +1,92 @@
+LAMMPS (27 Oct 2021)
+# monolayer MoS2
+units      	metal
+boundary   	p p f
+processors	* * 1
+
+atom_style 	atomic
+read_data       single_layer_MoS2.data
+Reading data file ...
+  triclinic box = (0.0000000 0.0000000 -100.00000) to (51.152320 44.299209 100.00000) with tilt (25.576160 0.0000000 0.0000000)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  768 atoms
+  read_data CPU = 0.043 seconds
+
+mass            * 32.065	# mass of sulphur atom , uint: a.u.=1.66X10^(-27)kg
+mass            1 95.94		# mass of molebdenum atom , uint: a.u.=1.66X10^(-27)kg
+
+########################## Define potentials ################################
+pair_style      sw/mod maxdelcs 0.25 0.35
+pair_coeff      * * tmd.sw.mod Mo S S
+Reading sw potential file tmd.sw.mod with DATE: 2018-03-26
+#########################################################################
+
+### Simulation settings ####
+timestep        0.001
+velocity        all create 300.0 12345
+
+############################
+
+# Output
+thermo          500
+thermo_style    custom step etotal pe ke temp
+thermo_modify   lost warn
+
+###### Run molecular dynamics ######
+fix             thermostat all nve
+run             5000
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 5.158796
+  ghost atom cutoff = 5.158796
+  binsize = 2.579398, bins = 30 18 78
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair sw/mod, perpetual
+      attributes: full, newton on
+      pair build: full/bin/atomonly
+      stencil: full/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 3.466 | 3.466 | 3.466 Mbytes
+Step TotEng PotEng KinEng Temp 
+       0   -899.28605   -929.02881    29.742759          300 
+     500   -899.28626   -922.45519    23.168929    233.69313 
+    1000   -899.29247   -925.86547    26.573002    268.02828 
+    1500   -899.27957   -916.95478    17.675214    178.28084 
+    2000   -899.28171   -918.38728    19.105573    192.70814 
+    2500   -899.28732   -922.50423     23.21691    234.17709 
+    3000   -899.28195   -918.74112    19.459174    196.27473 
+    3500   -899.27944   -918.03105    18.751604    189.13784 
+    4000   -899.28397   -920.50737    21.223397    214.06955 
+    4500   -899.28386   -919.79154    20.507685    206.85053 
+    5000   -899.28077   -918.78947    19.508698    196.77425 
+Loop time of 5.84317 on 1 procs for 5000 steps with 768 atoms
+
+Performance: 73.932 ns/day, 0.325 hours/ns, 855.700 timesteps/s
+99.8% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 5.6796     | 5.6796     | 5.6796     |   0.0 | 97.20
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0.026354   | 0.026354   | 0.026354   |   0.0 |  0.45
+Output  | 0.0014959  | 0.0014959  | 0.0014959  |   0.0 |  0.03
+Modify  | 0.090437   | 0.090437   | 0.090437   |   0.0 |  1.55
+Other   |            | 0.04524    |            |       |  0.77
+
+Nlocal:        768.000 ave         768 max         768 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:        354.000 ave         354 max         354 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:         0.00000 ave           0 max           0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+FullNghs:      20480.0 ave       20480 max       20480 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 20480
+Ave neighs/atom = 26.666667
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:06
diff --git a/examples/threebody/single_layer_MoS2.data b/examples/threebody/single_layer_MoS2.data
new file mode 100644
index 0000000000..e68230987c
--- /dev/null
+++ b/examples/threebody/single_layer_MoS2.data
@@ -0,0 +1,781 @@
+ Single layer MoS2
+
+   768  atoms
+
+ 3 atom types
+
+            0.0000000000000000            51.1523200000000177   xlo xhi
+            0.0000000000000000            44.2992085825108320   ylo yhi
+         -100.0000000000000000           100.0000000000000000   zlo zhi
+           25.5761600000000088             0.0000000000000000             0.0000000000000000 xy xz yz
+
+ Atoms
+
+      1    2    0.000000000000000    0.000000000000000   -1.596930000000000
+      2    3    0.000000000000000    0.000000000000000    1.596930000000000
+      3    1    0.000000000000000    1.845800357604618    0.000000000000000
+      4    2    1.598510000000001    2.768700536406927   -1.596930000000000
+      5    3    1.598510000000001    2.768700536406927    1.596930000000000
+      6    1    1.598510000000001    4.614500894011545    0.000000000000000
+      7    2    3.197020000000001    5.537401072813854   -1.596930000000000
+      8    3    3.197020000000001    5.537401072813854    1.596930000000000
+      9    1    3.197020000000001    7.383201430418472    0.000000000000000
+     10    2    4.795530000000002    8.306101609220781   -1.596930000000000
+     11    3    4.795530000000002    8.306101609220781    1.596930000000000
+     12    1    4.795530000000002   10.151901966825399    0.000000000000000
+     13    2    6.394040000000002   11.074802145627708   -1.596930000000000
+     14    3    6.394040000000002   11.074802145627708    1.596930000000000
+     15    1    6.394040000000002   12.920602503232326    0.000000000000000
+     16    2    7.992550000000003   13.843502682034635   -1.596930000000000
+     17    3    7.992550000000003   13.843502682034635    1.596930000000000
+     18    1    7.992550000000003   15.689303039639253    0.000000000000000
+     19    2    9.591060000000003   16.612203218441562   -1.596930000000000
+     20    3    9.591060000000003   16.612203218441562    1.596930000000000
+     21    1    9.591060000000003   18.458003576046180    0.000000000000000
+     22    2   11.189570000000004   19.380903754848489   -1.596930000000000
+     23    3   11.189570000000004   19.380903754848489    1.596930000000000
+     24    1   11.189570000000004   21.226704112453107    0.000000000000000
+     25    2   12.788080000000004   22.149604291255416   -1.596930000000000
+     26    3   12.788080000000004   22.149604291255416    1.596930000000000
+     27    1   12.788080000000004   23.995404648860034    0.000000000000000
+     28    2   14.386590000000005   24.918304827662343   -1.596930000000000
+     29    3   14.386590000000005   24.918304827662343    1.596930000000000
+     30    1   14.386590000000005   26.764105185266961    0.000000000000000
+     31    2   15.985100000000006   27.687005364069270   -1.596930000000000
+     32    3   15.985100000000006   27.687005364069270    1.596930000000000
+     33    1   15.985100000000006   29.532805721673888    0.000000000000000
+     34    2   17.583610000000006   30.455705900476197   -1.596930000000000
+     35    3   17.583610000000006   30.455705900476197    1.596930000000000
+     36    1   17.583610000000006   32.301506258080815    0.000000000000000
+     37    2   19.182120000000007   33.224406436883124   -1.596930000000000
+     38    3   19.182120000000007   33.224406436883124    1.596930000000000
+     39    1   19.182120000000007   35.070206794487742    0.000000000000000
+     40    2   20.780630000000007   35.993106973290051   -1.596930000000000
+     41    3   20.780630000000007   35.993106973290051    1.596930000000000
+     42    1   20.780630000000007   37.838907330894669    0.000000000000000
+     43    2   22.379140000000008   38.761807509696978   -1.596930000000000
+     44    3   22.379140000000008   38.761807509696978    1.596930000000000
+     45    1   22.379140000000008   40.607607867301596    0.000000000000000
+     46    2   23.977650000000008   41.530508046103905   -1.596930000000000
+     47    3   23.977650000000008   41.530508046103905    1.596930000000000
+     48    1   23.977650000000008   43.376308403708523    0.000000000000000
+     49    2    3.197020000000001    0.000000000000000   -1.596930000000000
+     50    3    3.197020000000001    0.000000000000000    1.596930000000000
+     51    1    3.197020000000001    1.845800357604618    0.000000000000000
+     52    2    4.795530000000002    2.768700536406927   -1.596930000000000
+     53    3    4.795530000000002    2.768700536406927    1.596930000000000
+     54    1    4.795530000000002    4.614500894011545    0.000000000000000
+     55    2    6.394040000000002    5.537401072813854   -1.596930000000000
+     56    3    6.394040000000002    5.537401072813854    1.596930000000000
+     57    1    6.394040000000002    7.383201430418472    0.000000000000000
+     58    2    7.992550000000003    8.306101609220781   -1.596930000000000
+     59    3    7.992550000000003    8.306101609220781    1.596930000000000
+     60    1    7.992550000000003   10.151901966825399    0.000000000000000
+     61    2    9.591060000000003   11.074802145627708   -1.596930000000000
+     62    3    9.591060000000003   11.074802145627708    1.596930000000000
+     63    1    9.591060000000003   12.920602503232326    0.000000000000000
+     64    2   11.189570000000004   13.843502682034635   -1.596930000000000
+     65    3   11.189570000000004   13.843502682034635    1.596930000000000
+     66    1   11.189570000000004   15.689303039639253    0.000000000000000
+     67    2   12.788080000000004   16.612203218441562   -1.596930000000000
+     68    3   12.788080000000004   16.612203218441562    1.596930000000000
+     69    1   12.788080000000004   18.458003576046180    0.000000000000000
+     70    2   14.386590000000005   19.380903754848489   -1.596930000000000
+     71    3   14.386590000000005   19.380903754848489    1.596930000000000
+     72    1   14.386590000000005   21.226704112453107    0.000000000000000
+     73    2   15.985100000000006   22.149604291255416   -1.596930000000000
+     74    3   15.985100000000006   22.149604291255416    1.596930000000000
+     75    1   15.985100000000006   23.995404648860034    0.000000000000000
+     76    2   17.583610000000006   24.918304827662343   -1.596930000000000
+     77    3   17.583610000000006   24.918304827662343    1.596930000000000
+     78    1   17.583610000000006   26.764105185266961    0.000000000000000
+     79    2   19.182120000000007   27.687005364069270   -1.596930000000000
+     80    3   19.182120000000007   27.687005364069270    1.596930000000000
+     81    1   19.182120000000007   29.532805721673888    0.000000000000000
+     82    2   20.780630000000007   30.455705900476197   -1.596930000000000
+     83    3   20.780630000000007   30.455705900476197    1.596930000000000
+     84    1   20.780630000000007   32.301506258080815    0.000000000000000
+     85    2   22.379140000000008   33.224406436883124   -1.596930000000000
+     86    3   22.379140000000008   33.224406436883124    1.596930000000000
+     87    1   22.379140000000008   35.070206794487742    0.000000000000000
+     88    2   23.977650000000008   35.993106973290051   -1.596930000000000
+     89    3   23.977650000000008   35.993106973290051    1.596930000000000
+     90    1   23.977650000000008   37.838907330894669    0.000000000000000
+     91    2   25.576160000000009   38.761807509696978   -1.596930000000000
+     92    3   25.576160000000009   38.761807509696978    1.596930000000000
+     93    1   25.576160000000009   40.607607867301596    0.000000000000000
+     94    2   27.174670000000009   41.530508046103905   -1.596930000000000
+     95    3   27.174670000000009   41.530508046103905    1.596930000000000
+     96    1   27.174670000000009   43.376308403708523    0.000000000000000
+     97    2    6.394040000000002    0.000000000000000   -1.596930000000000
+     98    3    6.394040000000002    0.000000000000000    1.596930000000000
+     99    1    6.394040000000002    1.845800357604618    0.000000000000000
+    100    2    7.992550000000003    2.768700536406927   -1.596930000000000
+    101    3    7.992550000000003    2.768700536406927    1.596930000000000
+    102    1    7.992550000000003    4.614500894011545    0.000000000000000
+    103    2    9.591060000000003    5.537401072813854   -1.596930000000000
+    104    3    9.591060000000003    5.537401072813854    1.596930000000000
+    105    1    9.591060000000003    7.383201430418472    0.000000000000000
+    106    2   11.189570000000004    8.306101609220781   -1.596930000000000
+    107    3   11.189570000000004    8.306101609220781    1.596930000000000
+    108    1   11.189570000000004   10.151901966825399    0.000000000000000
+    109    2   12.788080000000004   11.074802145627708   -1.596930000000000
+    110    3   12.788080000000004   11.074802145627708    1.596930000000000
+    111    1   12.788080000000004   12.920602503232326    0.000000000000000
+    112    2   14.386590000000005   13.843502682034635   -1.596930000000000
+    113    3   14.386590000000005   13.843502682034635    1.596930000000000
+    114    1   14.386590000000005   15.689303039639253    0.000000000000000
+    115    2   15.985100000000006   16.612203218441562   -1.596930000000000
+    116    3   15.985100000000006   16.612203218441562    1.596930000000000
+    117    1   15.985100000000006   18.458003576046180    0.000000000000000
+    118    2   17.583610000000006   19.380903754848489   -1.596930000000000
+    119    3   17.583610000000006   19.380903754848489    1.596930000000000
+    120    1   17.583610000000006   21.226704112453107    0.000000000000000
+    121    2   19.182120000000007   22.149604291255416   -1.596930000000000
+    122    3   19.182120000000007   22.149604291255416    1.596930000000000
+    123    1   19.182120000000007   23.995404648860034    0.000000000000000
+    124    2   20.780630000000007   24.918304827662343   -1.596930000000000
+    125    3   20.780630000000007   24.918304827662343    1.596930000000000
+    126    1   20.780630000000007   26.764105185266961    0.000000000000000
+    127    2   22.379140000000008   27.687005364069270   -1.596930000000000
+    128    3   22.379140000000008   27.687005364069270    1.596930000000000
+    129    1   22.379140000000008   29.532805721673888    0.000000000000000
+    130    2   23.977650000000008   30.455705900476197   -1.596930000000000
+    131    3   23.977650000000008   30.455705900476197    1.596930000000000
+    132    1   23.977650000000008   32.301506258080815    0.000000000000000
+    133    2   25.576160000000009   33.224406436883124   -1.596930000000000
+    134    3   25.576160000000009   33.224406436883124    1.596930000000000
+    135    1   25.576160000000009   35.070206794487742    0.000000000000000
+    136    2   27.174670000000009   35.993106973290051   -1.596930000000000
+    137    3   27.174670000000009   35.993106973290051    1.596930000000000
+    138    1   27.174670000000009   37.838907330894669    0.000000000000000
+    139    2   28.773180000000010   38.761807509696978   -1.596930000000000
+    140    3   28.773180000000010   38.761807509696978    1.596930000000000
+    141    1   28.773180000000010   40.607607867301596    0.000000000000000
+    142    2   30.371690000000011   41.530508046103905   -1.596930000000000
+    143    3   30.371690000000011   41.530508046103905    1.596930000000000
+    144    1   30.371690000000011   43.376308403708523    0.000000000000000
+    145    2    9.591060000000003    0.000000000000000   -1.596930000000000
+    146    3    9.591060000000003    0.000000000000000    1.596930000000000
+    147    1    9.591060000000003    1.845800357604618    0.000000000000000
+    148    2   11.189570000000004    2.768700536406927   -1.596930000000000
+    149    3   11.189570000000004    2.768700536406927    1.596930000000000
+    150    1   11.189570000000004    4.614500894011545    0.000000000000000
+    151    2   12.788080000000004    5.537401072813854   -1.596930000000000
+    152    3   12.788080000000004    5.537401072813854    1.596930000000000
+    153    1   12.788080000000004    7.383201430418472    0.000000000000000
+    154    2   14.386590000000005    8.306101609220781   -1.596930000000000
+    155    3   14.386590000000005    8.306101609220781    1.596930000000000
+    156    1   14.386590000000005   10.151901966825399    0.000000000000000
+    157    2   15.985100000000006   11.074802145627708   -1.596930000000000
+    158    3   15.985100000000006   11.074802145627708    1.596930000000000
+    159    1   15.985100000000006   12.920602503232326    0.000000000000000
+    160    2   17.583610000000006   13.843502682034635   -1.596930000000000
+    161    3   17.583610000000006   13.843502682034635    1.596930000000000
+    162    1   17.583610000000006   15.689303039639253    0.000000000000000
+    163    2   19.182120000000007   16.612203218441562   -1.596930000000000
+    164    3   19.182120000000007   16.612203218441562    1.596930000000000
+    165    1   19.182120000000007   18.458003576046180    0.000000000000000
+    166    2   20.780630000000007   19.380903754848489   -1.596930000000000
+    167    3   20.780630000000007   19.380903754848489    1.596930000000000
+    168    1   20.780630000000007   21.226704112453107    0.000000000000000
+    169    2   22.379140000000008   22.149604291255416   -1.596930000000000
+    170    3   22.379140000000008   22.149604291255416    1.596930000000000
+    171    1   22.379140000000008   23.995404648860034    0.000000000000000
+    172    2   23.977650000000008   24.918304827662343   -1.596930000000000
+    173    3   23.977650000000008   24.918304827662343    1.596930000000000
+    174    1   23.977650000000008   26.764105185266961    0.000000000000000
+    175    2   25.576160000000009   27.687005364069270   -1.596930000000000
+    176    3   25.576160000000009   27.687005364069270    1.596930000000000
+    177    1   25.576160000000009   29.532805721673888    0.000000000000000
+    178    2   27.174670000000009   30.455705900476197   -1.596930000000000
+    179    3   27.174670000000009   30.455705900476197    1.596930000000000
+    180    1   27.174670000000009   32.301506258080815    0.000000000000000
+    181    2   28.773180000000010   33.224406436883124   -1.596930000000000
+    182    3   28.773180000000010   33.224406436883124    1.596930000000000
+    183    1   28.773180000000010   35.070206794487742    0.000000000000000
+    184    2   30.371690000000011   35.993106973290051   -1.596930000000000
+    185    3   30.371690000000011   35.993106973290051    1.596930000000000
+    186    1   30.371690000000011   37.838907330894669    0.000000000000000
+    187    2   31.970200000000011   38.761807509696978   -1.596930000000000
+    188    3   31.970200000000011   38.761807509696978    1.596930000000000
+    189    1   31.970200000000011   40.607607867301596    0.000000000000000
+    190    2   33.568710000000012   41.530508046103905   -1.596930000000000
+    191    3   33.568710000000012   41.530508046103905    1.596930000000000
+    192    1   33.568710000000012   43.376308403708523    0.000000000000000
+    193    2   12.788080000000004    0.000000000000000   -1.596930000000000
+    194    3   12.788080000000004    0.000000000000000    1.596930000000000
+    195    1   12.788080000000004    1.845800357604618    0.000000000000000
+    196    2   14.386590000000005    2.768700536406927   -1.596930000000000
+    197    3   14.386590000000005    2.768700536406927    1.596930000000000
+    198    1   14.386590000000005    4.614500894011545    0.000000000000000
+    199    2   15.985100000000006    5.537401072813854   -1.596930000000000
+    200    3   15.985100000000006    5.537401072813854    1.596930000000000
+    201    1   15.985100000000006    7.383201430418472    0.000000000000000
+    202    2   17.583610000000006    8.306101609220781   -1.596930000000000
+    203    3   17.583610000000006    8.306101609220781    1.596930000000000
+    204    1   17.583610000000006   10.151901966825399    0.000000000000000
+    205    2   19.182120000000007   11.074802145627708   -1.596930000000000
+    206    3   19.182120000000007   11.074802145627708    1.596930000000000
+    207    1   19.182120000000007   12.920602503232326    0.000000000000000
+    208    2   20.780630000000007   13.843502682034635   -1.596930000000000
+    209    3   20.780630000000007   13.843502682034635    1.596930000000000
+    210    1   20.780630000000007   15.689303039639253    0.000000000000000
+    211    2   22.379140000000008   16.612203218441562   -1.596930000000000
+    212    3   22.379140000000008   16.612203218441562    1.596930000000000
+    213    1   22.379140000000008   18.458003576046180    0.000000000000000
+    214    2   23.977650000000008   19.380903754848489   -1.596930000000000
+    215    3   23.977650000000008   19.380903754848489    1.596930000000000
+    216    1   23.977650000000008   21.226704112453107    0.000000000000000
+    217    2   25.576160000000009   22.149604291255416   -1.596930000000000
+    218    3   25.576160000000009   22.149604291255416    1.596930000000000
+    219    1   25.576160000000009   23.995404648860034    0.000000000000000
+    220    2   27.174670000000009   24.918304827662343   -1.596930000000000
+    221    3   27.174670000000009   24.918304827662343    1.596930000000000
+    222    1   27.174670000000009   26.764105185266961    0.000000000000000
+    223    2   28.773180000000010   27.687005364069270   -1.596930000000000
+    224    3   28.773180000000010   27.687005364069270    1.596930000000000
+    225    1   28.773180000000010   29.532805721673888    0.000000000000000
+    226    2   30.371690000000011   30.455705900476197   -1.596930000000000
+    227    3   30.371690000000011   30.455705900476197    1.596930000000000
+    228    1   30.371690000000011   32.301506258080815    0.000000000000000
+    229    2   31.970200000000011   33.224406436883124   -1.596930000000000
+    230    3   31.970200000000011   33.224406436883124    1.596930000000000
+    231    1   31.970200000000011   35.070206794487742    0.000000000000000
+    232    2   33.568710000000012   35.993106973290051   -1.596930000000000
+    233    3   33.568710000000012   35.993106973290051    1.596930000000000
+    234    1   33.568710000000012   37.838907330894669    0.000000000000000
+    235    2   35.167220000000012   38.761807509696978   -1.596930000000000
+    236    3   35.167220000000012   38.761807509696978    1.596930000000000
+    237    1   35.167220000000012   40.607607867301596    0.000000000000000
+    238    2   36.765730000000013   41.530508046103905   -1.596930000000000
+    239    3   36.765730000000013   41.530508046103905    1.596930000000000
+    240    1   36.765730000000013   43.376308403708523    0.000000000000000
+    241    2   15.985100000000006    0.000000000000000   -1.596930000000000
+    242    3   15.985100000000006    0.000000000000000    1.596930000000000
+    243    1   15.985100000000006    1.845800357604618    0.000000000000000
+    244    2   17.583610000000006    2.768700536406927   -1.596930000000000
+    245    3   17.583610000000006    2.768700536406927    1.596930000000000
+    246    1   17.583610000000006    4.614500894011545    0.000000000000000
+    247    2   19.182120000000007    5.537401072813854   -1.596930000000000
+    248    3   19.182120000000007    5.537401072813854    1.596930000000000
+    249    1   19.182120000000007    7.383201430418472    0.000000000000000
+    250    2   20.780630000000007    8.306101609220781   -1.596930000000000
+    251    3   20.780630000000007    8.306101609220781    1.596930000000000
+    252    1   20.780630000000007   10.151901966825399    0.000000000000000
+    253    2   22.379140000000008   11.074802145627708   -1.596930000000000
+    254    3   22.379140000000008   11.074802145627708    1.596930000000000
+    255    1   22.379140000000008   12.920602503232326    0.000000000000000
+    256    2   23.977650000000008   13.843502682034635   -1.596930000000000
+    257    3   23.977650000000008   13.843502682034635    1.596930000000000
+    258    1   23.977650000000008   15.689303039639253    0.000000000000000
+    259    2   25.576160000000009   16.612203218441562   -1.596930000000000
+    260    3   25.576160000000009   16.612203218441562    1.596930000000000
+    261    1   25.576160000000009   18.458003576046180    0.000000000000000
+    262    2   27.174670000000009   19.380903754848489   -1.596930000000000
+    263    3   27.174670000000009   19.380903754848489    1.596930000000000
+    264    1   27.174670000000009   21.226704112453107    0.000000000000000
+    265    2   28.773180000000010   22.149604291255416   -1.596930000000000
+    266    3   28.773180000000010   22.149604291255416    1.596930000000000
+    267    1   28.773180000000010   23.995404648860034    0.000000000000000
+    268    2   30.371690000000011   24.918304827662343   -1.596930000000000
+    269    3   30.371690000000011   24.918304827662343    1.596930000000000
+    270    1   30.371690000000011   26.764105185266961    0.000000000000000
+    271    2   31.970200000000011   27.687005364069270   -1.596930000000000
+    272    3   31.970200000000011   27.687005364069270    1.596930000000000
+    273    1   31.970200000000011   29.532805721673888    0.000000000000000
+    274    2   33.568710000000012   30.455705900476197   -1.596930000000000
+    275    3   33.568710000000012   30.455705900476197    1.596930000000000
+    276    1   33.568710000000012   32.301506258080815    0.000000000000000
+    277    2   35.167220000000012   33.224406436883124   -1.596930000000000
+    278    3   35.167220000000012   33.224406436883124    1.596930000000000
+    279    1   35.167220000000012   35.070206794487742    0.000000000000000
+    280    2   36.765730000000013   35.993106973290051   -1.596930000000000
+    281    3   36.765730000000013   35.993106973290051    1.596930000000000
+    282    1   36.765730000000013   37.838907330894669    0.000000000000000
+    283    2   38.364240000000013   38.761807509696978   -1.596930000000000
+    284    3   38.364240000000013   38.761807509696978    1.596930000000000
+    285    1   38.364240000000013   40.607607867301596    0.000000000000000
+    286    2   39.962750000000014   41.530508046103905   -1.596930000000000
+    287    3   39.962750000000014   41.530508046103905    1.596930000000000
+    288    1   39.962750000000014   43.376308403708523    0.000000000000000
+    289    2   19.182120000000007    0.000000000000000   -1.596930000000000
+    290    3   19.182120000000007    0.000000000000000    1.596930000000000
+    291    1   19.182120000000007    1.845800357604618    0.000000000000000
+    292    2   20.780630000000007    2.768700536406927   -1.596930000000000
+    293    3   20.780630000000007    2.768700536406927    1.596930000000000
+    294    1   20.780630000000007    4.614500894011545    0.000000000000000
+    295    2   22.379140000000008    5.537401072813854   -1.596930000000000
+    296    3   22.379140000000008    5.537401072813854    1.596930000000000
+    297    1   22.379140000000008    7.383201430418472    0.000000000000000
+    298    2   23.977650000000008    8.306101609220781   -1.596930000000000
+    299    3   23.977650000000008    8.306101609220781    1.596930000000000
+    300    1   23.977650000000008   10.151901966825399    0.000000000000000
+    301    2   25.576160000000009   11.074802145627708   -1.596930000000000
+    302    3   25.576160000000009   11.074802145627708    1.596930000000000
+    303    1   25.576160000000009   12.920602503232326    0.000000000000000
+    304    2   27.174670000000009   13.843502682034635   -1.596930000000000
+    305    3   27.174670000000009   13.843502682034635    1.596930000000000
+    306    1   27.174670000000009   15.689303039639253    0.000000000000000
+    307    2   28.773180000000010   16.612203218441562   -1.596930000000000
+    308    3   28.773180000000010   16.612203218441562    1.596930000000000
+    309    1   28.773180000000010   18.458003576046180    0.000000000000000
+    310    2   30.371690000000011   19.380903754848489   -1.596930000000000
+    311    3   30.371690000000011   19.380903754848489    1.596930000000000
+    312    1   30.371690000000011   21.226704112453107    0.000000000000000
+    313    2   31.970200000000011   22.149604291255416   -1.596930000000000
+    314    3   31.970200000000011   22.149604291255416    1.596930000000000
+    315    1   31.970200000000011   23.995404648860034    0.000000000000000
+    316    2   33.568710000000012   24.918304827662343   -1.596930000000000
+    317    3   33.568710000000012   24.918304827662343    1.596930000000000
+    318    1   33.568710000000012   26.764105185266961    0.000000000000000
+    319    2   35.167220000000012   27.687005364069270   -1.596930000000000
+    320    3   35.167220000000012   27.687005364069270    1.596930000000000
+    321    1   35.167220000000012   29.532805721673888    0.000000000000000
+    322    2   36.765730000000013   30.455705900476197   -1.596930000000000
+    323    3   36.765730000000013   30.455705900476197    1.596930000000000
+    324    1   36.765730000000013   32.301506258080815    0.000000000000000
+    325    2   38.364240000000013   33.224406436883124   -1.596930000000000
+    326    3   38.364240000000013   33.224406436883124    1.596930000000000
+    327    1   38.364240000000013   35.070206794487742    0.000000000000000
+    328    2   39.962750000000014   35.993106973290051   -1.596930000000000
+    329    3   39.962750000000014   35.993106973290051    1.596930000000000
+    330    1   39.962750000000014   37.838907330894669    0.000000000000000
+    331    2   41.561260000000014   38.761807509696978   -1.596930000000000
+    332    3   41.561260000000014   38.761807509696978    1.596930000000000
+    333    1   41.561260000000014   40.607607867301596    0.000000000000000
+    334    2   43.159770000000015   41.530508046103905   -1.596930000000000
+    335    3   43.159770000000015   41.530508046103905    1.596930000000000
+    336    1   43.159770000000015   43.376308403708523    0.000000000000000
+    337    2   22.379140000000008    0.000000000000000   -1.596930000000000
+    338    3   22.379140000000008    0.000000000000000    1.596930000000000
+    339    1   22.379140000000008    1.845800357604618    0.000000000000000
+    340    2   23.977650000000008    2.768700536406927   -1.596930000000000
+    341    3   23.977650000000008    2.768700536406927    1.596930000000000
+    342    1   23.977650000000008    4.614500894011545    0.000000000000000
+    343    2   25.576160000000009    5.537401072813854   -1.596930000000000
+    344    3   25.576160000000009    5.537401072813854    1.596930000000000
+    345    1   25.576160000000009    7.383201430418472    0.000000000000000
+    346    2   27.174670000000009    8.306101609220781   -1.596930000000000
+    347    3   27.174670000000009    8.306101609220781    1.596930000000000
+    348    1   27.174670000000009   10.151901966825399    0.000000000000000
+    349    2   28.773180000000010   11.074802145627708   -1.596930000000000
+    350    3   28.773180000000010   11.074802145627708    1.596930000000000
+    351    1   28.773180000000010   12.920602503232326    0.000000000000000
+    352    2   30.371690000000011   13.843502682034635   -1.596930000000000
+    353    3   30.371690000000011   13.843502682034635    1.596930000000000
+    354    1   30.371690000000011   15.689303039639253    0.000000000000000
+    355    2   31.970200000000011   16.612203218441562   -1.596930000000000
+    356    3   31.970200000000011   16.612203218441562    1.596930000000000
+    357    1   31.970200000000011   18.458003576046180    0.000000000000000
+    358    2   33.568710000000012   19.380903754848489   -1.596930000000000
+    359    3   33.568710000000012   19.380903754848489    1.596930000000000
+    360    1   33.568710000000012   21.226704112453107    0.000000000000000
+    361    2   35.167220000000012   22.149604291255416   -1.596930000000000
+    362    3   35.167220000000012   22.149604291255416    1.596930000000000
+    363    1   35.167220000000012   23.995404648860034    0.000000000000000
+    364    2   36.765730000000013   24.918304827662343   -1.596930000000000
+    365    3   36.765730000000013   24.918304827662343    1.596930000000000
+    366    1   36.765730000000013   26.764105185266961    0.000000000000000
+    367    2   38.364240000000013   27.687005364069270   -1.596930000000000
+    368    3   38.364240000000013   27.687005364069270    1.596930000000000
+    369    1   38.364240000000013   29.532805721673888    0.000000000000000
+    370    2   39.962750000000014   30.455705900476197   -1.596930000000000
+    371    3   39.962750000000014   30.455705900476197    1.596930000000000
+    372    1   39.962750000000014   32.301506258080815    0.000000000000000
+    373    2   41.561260000000014   33.224406436883124   -1.596930000000000
+    374    3   41.561260000000014   33.224406436883124    1.596930000000000
+    375    1   41.561260000000014   35.070206794487742    0.000000000000000
+    376    2   43.159770000000015   35.993106973290051   -1.596930000000000
+    377    3   43.159770000000015   35.993106973290051    1.596930000000000
+    378    1   43.159770000000015   37.838907330894669    0.000000000000000
+    379    2   44.758280000000015   38.761807509696978   -1.596930000000000
+    380    3   44.758280000000015   38.761807509696978    1.596930000000000
+    381    1   44.758280000000015   40.607607867301596    0.000000000000000
+    382    2   46.356790000000016   41.530508046103905   -1.596930000000000
+    383    3   46.356790000000016   41.530508046103905    1.596930000000000
+    384    1   46.356790000000016   43.376308403708523    0.000000000000000
+    385    2   25.576160000000009    0.000000000000000   -1.596930000000000
+    386    3   25.576160000000009    0.000000000000000    1.596930000000000
+    387    1   25.576160000000009    1.845800357604618    0.000000000000000
+    388    2   27.174670000000009    2.768700536406927   -1.596930000000000
+    389    3   27.174670000000009    2.768700536406927    1.596930000000000
+    390    1   27.174670000000009    4.614500894011545    0.000000000000000
+    391    2   28.773180000000010    5.537401072813854   -1.596930000000000
+    392    3   28.773180000000010    5.537401072813854    1.596930000000000
+    393    1   28.773180000000010    7.383201430418472    0.000000000000000
+    394    2   30.371690000000011    8.306101609220781   -1.596930000000000
+    395    3   30.371690000000011    8.306101609220781    1.596930000000000
+    396    1   30.371690000000011   10.151901966825399    0.000000000000000
+    397    2   31.970200000000011   11.074802145627708   -1.596930000000000
+    398    3   31.970200000000011   11.074802145627708    1.596930000000000
+    399    1   31.970200000000011   12.920602503232326    0.000000000000000
+    400    2   33.568710000000012   13.843502682034635   -1.596930000000000
+    401    3   33.568710000000012   13.843502682034635    1.596930000000000
+    402    1   33.568710000000012   15.689303039639253    0.000000000000000
+    403    2   35.167220000000012   16.612203218441562   -1.596930000000000
+    404    3   35.167220000000012   16.612203218441562    1.596930000000000
+    405    1   35.167220000000012   18.458003576046180    0.000000000000000
+    406    2   36.765730000000013   19.380903754848489   -1.596930000000000
+    407    3   36.765730000000013   19.380903754848489    1.596930000000000
+    408    1   36.765730000000013   21.226704112453107    0.000000000000000
+    409    2   38.364240000000013   22.149604291255416   -1.596930000000000
+    410    3   38.364240000000013   22.149604291255416    1.596930000000000
+    411    1   38.364240000000013   23.995404648860034    0.000000000000000
+    412    2   39.962750000000014   24.918304827662343   -1.596930000000000
+    413    3   39.962750000000014   24.918304827662343    1.596930000000000
+    414    1   39.962750000000014   26.764105185266961    0.000000000000000
+    415    2   41.561260000000014   27.687005364069270   -1.596930000000000
+    416    3   41.561260000000014   27.687005364069270    1.596930000000000
+    417    1   41.561260000000014   29.532805721673888    0.000000000000000
+    418    2   43.159770000000015   30.455705900476197   -1.596930000000000
+    419    3   43.159770000000015   30.455705900476197    1.596930000000000
+    420    1   43.159770000000015   32.301506258080815    0.000000000000000
+    421    2   44.758280000000015   33.224406436883124   -1.596930000000000
+    422    3   44.758280000000015   33.224406436883124    1.596930000000000
+    423    1   44.758280000000015   35.070206794487742    0.000000000000000
+    424    2   46.356790000000016   35.993106973290051   -1.596930000000000
+    425    3   46.356790000000016   35.993106973290051    1.596930000000000
+    426    1   46.356790000000016   37.838907330894669    0.000000000000000
+    427    2   47.955300000000017   38.761807509696978   -1.596930000000000
+    428    3   47.955300000000017   38.761807509696978    1.596930000000000
+    429    1   47.955300000000017   40.607607867301596    0.000000000000000
+    430    2   49.553810000000017   41.530508046103905   -1.596930000000000
+    431    3   49.553810000000017   41.530508046103905    1.596930000000000
+    432    1   49.553810000000017   43.376308403708523    0.000000000000000
+    433    2   28.773180000000010    0.000000000000000   -1.596930000000000
+    434    3   28.773180000000010    0.000000000000000    1.596930000000000
+    435    1   28.773180000000010    1.845800357604618    0.000000000000000
+    436    2   30.371690000000011    2.768700536406927   -1.596930000000000
+    437    3   30.371690000000011    2.768700536406927    1.596930000000000
+    438    1   30.371690000000011    4.614500894011545    0.000000000000000
+    439    2   31.970200000000011    5.537401072813854   -1.596930000000000
+    440    3   31.970200000000011    5.537401072813854    1.596930000000000
+    441    1   31.970200000000011    7.383201430418472    0.000000000000000
+    442    2   33.568710000000012    8.306101609220781   -1.596930000000000
+    443    3   33.568710000000012    8.306101609220781    1.596930000000000
+    444    1   33.568710000000012   10.151901966825399    0.000000000000000
+    445    2   35.167220000000012   11.074802145627708   -1.596930000000000
+    446    3   35.167220000000012   11.074802145627708    1.596930000000000
+    447    1   35.167220000000012   12.920602503232326    0.000000000000000
+    448    2   36.765730000000013   13.843502682034635   -1.596930000000000
+    449    3   36.765730000000013   13.843502682034635    1.596930000000000
+    450    1   36.765730000000013   15.689303039639253    0.000000000000000
+    451    2   38.364240000000013   16.612203218441562   -1.596930000000000
+    452    3   38.364240000000013   16.612203218441562    1.596930000000000
+    453    1   38.364240000000013   18.458003576046180    0.000000000000000
+    454    2   39.962750000000014   19.380903754848489   -1.596930000000000
+    455    3   39.962750000000014   19.380903754848489    1.596930000000000
+    456    1   39.962750000000014   21.226704112453107    0.000000000000000
+    457    2   41.561260000000014   22.149604291255416   -1.596930000000000
+    458    3   41.561260000000014   22.149604291255416    1.596930000000000
+    459    1   41.561260000000014   23.995404648860034    0.000000000000000
+    460    2   43.159770000000015   24.918304827662343   -1.596930000000000
+    461    3   43.159770000000015   24.918304827662343    1.596930000000000
+    462    1   43.159770000000015   26.764105185266961    0.000000000000000
+    463    2   44.758280000000015   27.687005364069270   -1.596930000000000
+    464    3   44.758280000000015   27.687005364069270    1.596930000000000
+    465    1   44.758280000000015   29.532805721673888    0.000000000000000
+    466    2   46.356790000000016   30.455705900476197   -1.596930000000000
+    467    3   46.356790000000016   30.455705900476197    1.596930000000000
+    468    1   46.356790000000016   32.301506258080815    0.000000000000000
+    469    2   47.955300000000017   33.224406436883124   -1.596930000000000
+    470    3   47.955300000000017   33.224406436883124    1.596930000000000
+    471    1   47.955300000000017   35.070206794487742    0.000000000000000
+    472    2   49.553810000000017   35.993106973290051   -1.596930000000000
+    473    3   49.553810000000017   35.993106973290051    1.596930000000000
+    474    1   49.553810000000017   37.838907330894669    0.000000000000000
+    475    2   51.152320000000018   38.761807509696978   -1.596930000000000
+    476    3   51.152320000000018   38.761807509696978    1.596930000000000
+    477    1   51.152320000000018   40.607607867301596    0.000000000000000
+    478    2   52.750830000000018   41.530508046103905   -1.596930000000000
+    479    3   52.750830000000018   41.530508046103905    1.596930000000000
+    480    1   52.750830000000018   43.376308403708523    0.000000000000000
+    481    2   31.970200000000011    0.000000000000000   -1.596930000000000
+    482    3   31.970200000000011    0.000000000000000    1.596930000000000
+    483    1   31.970200000000011    1.845800357604618    0.000000000000000
+    484    2   33.568710000000012    2.768700536406927   -1.596930000000000
+    485    3   33.568710000000012    2.768700536406927    1.596930000000000
+    486    1   33.568710000000012    4.614500894011545    0.000000000000000
+    487    2   35.167220000000012    5.537401072813854   -1.596930000000000
+    488    3   35.167220000000012    5.537401072813854    1.596930000000000
+    489    1   35.167220000000012    7.383201430418472    0.000000000000000
+    490    2   36.765730000000013    8.306101609220781   -1.596930000000000
+    491    3   36.765730000000013    8.306101609220781    1.596930000000000
+    492    1   36.765730000000013   10.151901966825399    0.000000000000000
+    493    2   38.364240000000013   11.074802145627708   -1.596930000000000
+    494    3   38.364240000000013   11.074802145627708    1.596930000000000
+    495    1   38.364240000000013   12.920602503232326    0.000000000000000
+    496    2   39.962750000000014   13.843502682034635   -1.596930000000000
+    497    3   39.962750000000014   13.843502682034635    1.596930000000000
+    498    1   39.962750000000014   15.689303039639253    0.000000000000000
+    499    2   41.561260000000014   16.612203218441562   -1.596930000000000
+    500    3   41.561260000000014   16.612203218441562    1.596930000000000
+    501    1   41.561260000000014   18.458003576046180    0.000000000000000
+    502    2   43.159770000000015   19.380903754848489   -1.596930000000000
+    503    3   43.159770000000015   19.380903754848489    1.596930000000000
+    504    1   43.159770000000015   21.226704112453107    0.000000000000000
+    505    2   44.758280000000015   22.149604291255416   -1.596930000000000
+    506    3   44.758280000000015   22.149604291255416    1.596930000000000
+    507    1   44.758280000000015   23.995404648860034    0.000000000000000
+    508    2   46.356790000000016   24.918304827662343   -1.596930000000000
+    509    3   46.356790000000016   24.918304827662343    1.596930000000000
+    510    1   46.356790000000016   26.764105185266961    0.000000000000000
+    511    2   47.955300000000017   27.687005364069270   -1.596930000000000
+    512    3   47.955300000000017   27.687005364069270    1.596930000000000
+    513    1   47.955300000000017   29.532805721673888    0.000000000000000
+    514    2   49.553810000000017   30.455705900476197   -1.596930000000000
+    515    3   49.553810000000017   30.455705900476197    1.596930000000000
+    516    1   49.553810000000017   32.301506258080815    0.000000000000000
+    517    2   51.152320000000018   33.224406436883124   -1.596930000000000
+    518    3   51.152320000000018   33.224406436883124    1.596930000000000
+    519    1   51.152320000000018   35.070206794487742    0.000000000000000
+    520    2   52.750830000000018   35.993106973290051   -1.596930000000000
+    521    3   52.750830000000018   35.993106973290051    1.596930000000000
+    522    1   52.750830000000018   37.838907330894669    0.000000000000000
+    523    2   54.349340000000019   38.761807509696978   -1.596930000000000
+    524    3   54.349340000000019   38.761807509696978    1.596930000000000
+    525    1   54.349340000000019   40.607607867301596    0.000000000000000
+    526    2   55.947850000000019   41.530508046103905   -1.596930000000000
+    527    3   55.947850000000019   41.530508046103905    1.596930000000000
+    528    1   55.947850000000019   43.376308403708523    0.000000000000000
+    529    2   35.167220000000012    0.000000000000000   -1.596930000000000
+    530    3   35.167220000000012    0.000000000000000    1.596930000000000
+    531    1   35.167220000000012    1.845800357604618    0.000000000000000
+    532    2   36.765730000000013    2.768700536406927   -1.596930000000000
+    533    3   36.765730000000013    2.768700536406927    1.596930000000000
+    534    1   36.765730000000013    4.614500894011545    0.000000000000000
+    535    2   38.364240000000013    5.537401072813854   -1.596930000000000
+    536    3   38.364240000000013    5.537401072813854    1.596930000000000
+    537    1   38.364240000000013    7.383201430418472    0.000000000000000
+    538    2   39.962750000000014    8.306101609220781   -1.596930000000000
+    539    3   39.962750000000014    8.306101609220781    1.596930000000000
+    540    1   39.962750000000014   10.151901966825399    0.000000000000000
+    541    2   41.561260000000014   11.074802145627708   -1.596930000000000
+    542    3   41.561260000000014   11.074802145627708    1.596930000000000
+    543    1   41.561260000000014   12.920602503232326    0.000000000000000
+    544    2   43.159770000000015   13.843502682034635   -1.596930000000000
+    545    3   43.159770000000015   13.843502682034635    1.596930000000000
+    546    1   43.159770000000015   15.689303039639253    0.000000000000000
+    547    2   44.758280000000015   16.612203218441562   -1.596930000000000
+    548    3   44.758280000000015   16.612203218441562    1.596930000000000
+    549    1   44.758280000000015   18.458003576046180    0.000000000000000
+    550    2   46.356790000000016   19.380903754848489   -1.596930000000000
+    551    3   46.356790000000016   19.380903754848489    1.596930000000000
+    552    1   46.356790000000016   21.226704112453107    0.000000000000000
+    553    2   47.955300000000017   22.149604291255416   -1.596930000000000
+    554    3   47.955300000000017   22.149604291255416    1.596930000000000
+    555    1   47.955300000000017   23.995404648860034    0.000000000000000
+    556    2   49.553810000000017   24.918304827662343   -1.596930000000000
+    557    3   49.553810000000017   24.918304827662343    1.596930000000000
+    558    1   49.553810000000017   26.764105185266961    0.000000000000000
+    559    2   51.152320000000018   27.687005364069270   -1.596930000000000
+    560    3   51.152320000000018   27.687005364069270    1.596930000000000
+    561    1   51.152320000000018   29.532805721673888    0.000000000000000
+    562    2   52.750830000000018   30.455705900476197   -1.596930000000000
+    563    3   52.750830000000018   30.455705900476197    1.596930000000000
+    564    1   52.750830000000018   32.301506258080815    0.000000000000000
+    565    2   54.349340000000019   33.224406436883124   -1.596930000000000
+    566    3   54.349340000000019   33.224406436883124    1.596930000000000
+    567    1   54.349340000000019   35.070206794487742    0.000000000000000
+    568    2   55.947850000000019   35.993106973290051   -1.596930000000000
+    569    3   55.947850000000019   35.993106973290051    1.596930000000000
+    570    1   55.947850000000019   37.838907330894669    0.000000000000000
+    571    2   57.546360000000020   38.761807509696978   -1.596930000000000
+    572    3   57.546360000000020   38.761807509696978    1.596930000000000
+    573    1   57.546360000000020   40.607607867301596    0.000000000000000
+    574    2   59.144870000000020   41.530508046103905   -1.596930000000000
+    575    3   59.144870000000020   41.530508046103905    1.596930000000000
+    576    1   59.144870000000020   43.376308403708523    0.000000000000000
+    577    2   38.364240000000013    0.000000000000000   -1.596930000000000
+    578    3   38.364240000000013    0.000000000000000    1.596930000000000
+    579    1   38.364240000000013    1.845800357604618    0.000000000000000
+    580    2   39.962750000000014    2.768700536406927   -1.596930000000000
+    581    3   39.962750000000014    2.768700536406927    1.596930000000000
+    582    1   39.962750000000014    4.614500894011545    0.000000000000000
+    583    2   41.561260000000014    5.537401072813854   -1.596930000000000
+    584    3   41.561260000000014    5.537401072813854    1.596930000000000
+    585    1   41.561260000000014    7.383201430418472    0.000000000000000
+    586    2   43.159770000000015    8.306101609220781   -1.596930000000000
+    587    3   43.159770000000015    8.306101609220781    1.596930000000000
+    588    1   43.159770000000015   10.151901966825399    0.000000000000000
+    589    2   44.758280000000015   11.074802145627708   -1.596930000000000
+    590    3   44.758280000000015   11.074802145627708    1.596930000000000
+    591    1   44.758280000000015   12.920602503232326    0.000000000000000
+    592    2   46.356790000000016   13.843502682034635   -1.596930000000000
+    593    3   46.356790000000016   13.843502682034635    1.596930000000000
+    594    1   46.356790000000016   15.689303039639253    0.000000000000000
+    595    2   47.955300000000017   16.612203218441562   -1.596930000000000
+    596    3   47.955300000000017   16.612203218441562    1.596930000000000
+    597    1   47.955300000000017   18.458003576046180    0.000000000000000
+    598    2   49.553810000000017   19.380903754848489   -1.596930000000000
+    599    3   49.553810000000017   19.380903754848489    1.596930000000000
+    600    1   49.553810000000017   21.226704112453107    0.000000000000000
+    601    2   51.152320000000018   22.149604291255416   -1.596930000000000
+    602    3   51.152320000000018   22.149604291255416    1.596930000000000
+    603    1   51.152320000000018   23.995404648860034    0.000000000000000
+    604    2   52.750830000000018   24.918304827662343   -1.596930000000000
+    605    3   52.750830000000018   24.918304827662343    1.596930000000000
+    606    1   52.750830000000018   26.764105185266961    0.000000000000000
+    607    2   54.349340000000019   27.687005364069270   -1.596930000000000
+    608    3   54.349340000000019   27.687005364069270    1.596930000000000
+    609    1   54.349340000000019   29.532805721673888    0.000000000000000
+    610    2   55.947850000000019   30.455705900476197   -1.596930000000000
+    611    3   55.947850000000019   30.455705900476197    1.596930000000000
+    612    1   55.947850000000019   32.301506258080815    0.000000000000000
+    613    2   57.546360000000020   33.224406436883124   -1.596930000000000
+    614    3   57.546360000000020   33.224406436883124    1.596930000000000
+    615    1   57.546360000000020   35.070206794487742    0.000000000000000
+    616    2   59.144870000000020   35.993106973290051   -1.596930000000000
+    617    3   59.144870000000020   35.993106973290051    1.596930000000000
+    618    1   59.144870000000020   37.838907330894669    0.000000000000000
+    619    2   60.743380000000021   38.761807509696978   -1.596930000000000
+    620    3   60.743380000000021   38.761807509696978    1.596930000000000
+    621    1   60.743380000000021   40.607607867301596    0.000000000000000
+    622    2   62.341890000000022   41.530508046103905   -1.596930000000000
+    623    3   62.341890000000022   41.530508046103905    1.596930000000000
+    624    1   62.341890000000022   43.376308403708523    0.000000000000000
+    625    2   41.561260000000014    0.000000000000000   -1.596930000000000
+    626    3   41.561260000000014    0.000000000000000    1.596930000000000
+    627    1   41.561260000000014    1.845800357604618    0.000000000000000
+    628    2   43.159770000000015    2.768700536406927   -1.596930000000000
+    629    3   43.159770000000015    2.768700536406927    1.596930000000000
+    630    1   43.159770000000015    4.614500894011545    0.000000000000000
+    631    2   44.758280000000015    5.537401072813854   -1.596930000000000
+    632    3   44.758280000000015    5.537401072813854    1.596930000000000
+    633    1   44.758280000000015    7.383201430418472    0.000000000000000
+    634    2   46.356790000000016    8.306101609220781   -1.596930000000000
+    635    3   46.356790000000016    8.306101609220781    1.596930000000000
+    636    1   46.356790000000016   10.151901966825399    0.000000000000000
+    637    2   47.955300000000017   11.074802145627708   -1.596930000000000
+    638    3   47.955300000000017   11.074802145627708    1.596930000000000
+    639    1   47.955300000000017   12.920602503232326    0.000000000000000
+    640    2   49.553810000000017   13.843502682034635   -1.596930000000000
+    641    3   49.553810000000017   13.843502682034635    1.596930000000000
+    642    1   49.553810000000017   15.689303039639253    0.000000000000000
+    643    2   51.152320000000018   16.612203218441562   -1.596930000000000
+    644    3   51.152320000000018   16.612203218441562    1.596930000000000
+    645    1   51.152320000000018   18.458003576046180    0.000000000000000
+    646    2   52.750830000000018   19.380903754848489   -1.596930000000000
+    647    3   52.750830000000018   19.380903754848489    1.596930000000000
+    648    1   52.750830000000018   21.226704112453107    0.000000000000000
+    649    2   54.349340000000019   22.149604291255416   -1.596930000000000
+    650    3   54.349340000000019   22.149604291255416    1.596930000000000
+    651    1   54.349340000000019   23.995404648860034    0.000000000000000
+    652    2   55.947850000000019   24.918304827662343   -1.596930000000000
+    653    3   55.947850000000019   24.918304827662343    1.596930000000000
+    654    1   55.947850000000019   26.764105185266961    0.000000000000000
+    655    2   57.546360000000020   27.687005364069270   -1.596930000000000
+    656    3   57.546360000000020   27.687005364069270    1.596930000000000
+    657    1   57.546360000000020   29.532805721673888    0.000000000000000
+    658    2   59.144870000000020   30.455705900476197   -1.596930000000000
+    659    3   59.144870000000020   30.455705900476197    1.596930000000000
+    660    1   59.144870000000020   32.301506258080815    0.000000000000000
+    661    2   60.743380000000021   33.224406436883124   -1.596930000000000
+    662    3   60.743380000000021   33.224406436883124    1.596930000000000
+    663    1   60.743380000000021   35.070206794487742    0.000000000000000
+    664    2   62.341890000000022   35.993106973290051   -1.596930000000000
+    665    3   62.341890000000022   35.993106973290051    1.596930000000000
+    666    1   62.341890000000022   37.838907330894669    0.000000000000000
+    667    2   63.940400000000022   38.761807509696978   -1.596930000000000
+    668    3   63.940400000000022   38.761807509696978    1.596930000000000
+    669    1   63.940400000000022   40.607607867301596    0.000000000000000
+    670    2   65.538910000000023   41.530508046103905   -1.596930000000000
+    671    3   65.538910000000023   41.530508046103905    1.596930000000000
+    672    1   65.538910000000023   43.376308403708523    0.000000000000000
+    673    2   44.758280000000015    0.000000000000000   -1.596930000000000
+    674    3   44.758280000000015    0.000000000000000    1.596930000000000
+    675    1   44.758280000000015    1.845800357604618    0.000000000000000
+    676    2   46.356790000000016    2.768700536406927   -1.596930000000000
+    677    3   46.356790000000016    2.768700536406927    1.596930000000000
+    678    1   46.356790000000016    4.614500894011545    0.000000000000000
+    679    2   47.955300000000017    5.537401072813854   -1.596930000000000
+    680    3   47.955300000000017    5.537401072813854    1.596930000000000
+    681    1   47.955300000000017    7.383201430418472    0.000000000000000
+    682    2   49.553810000000017    8.306101609220781   -1.596930000000000
+    683    3   49.553810000000017    8.306101609220781    1.596930000000000
+    684    1   49.553810000000017   10.151901966825399    0.000000000000000
+    685    2   51.152320000000018   11.074802145627708   -1.596930000000000
+    686    3   51.152320000000018   11.074802145627708    1.596930000000000
+    687    1   51.152320000000018   12.920602503232326    0.000000000000000
+    688    2   52.750830000000018   13.843502682034635   -1.596930000000000
+    689    3   52.750830000000018   13.843502682034635    1.596930000000000
+    690    1   52.750830000000018   15.689303039639253    0.000000000000000
+    691    2   54.349340000000019   16.612203218441562   -1.596930000000000
+    692    3   54.349340000000019   16.612203218441562    1.596930000000000
+    693    1   54.349340000000019   18.458003576046180    0.000000000000000
+    694    2   55.947850000000019   19.380903754848489   -1.596930000000000
+    695    3   55.947850000000019   19.380903754848489    1.596930000000000
+    696    1   55.947850000000019   21.226704112453107    0.000000000000000
+    697    2   57.546360000000020   22.149604291255416   -1.596930000000000
+    698    3   57.546360000000020   22.149604291255416    1.596930000000000
+    699    1   57.546360000000020   23.995404648860034    0.000000000000000
+    700    2   59.144870000000020   24.918304827662343   -1.596930000000000
+    701    3   59.144870000000020   24.918304827662343    1.596930000000000
+    702    1   59.144870000000020   26.764105185266961    0.000000000000000
+    703    2   60.743380000000021   27.687005364069270   -1.596930000000000
+    704    3   60.743380000000021   27.687005364069270    1.596930000000000
+    705    1   60.743380000000021   29.532805721673888    0.000000000000000
+    706    2   62.341890000000022   30.455705900476197   -1.596930000000000
+    707    3   62.341890000000022   30.455705900476197    1.596930000000000
+    708    1   62.341890000000022   32.301506258080815    0.000000000000000
+    709    2   63.940400000000022   33.224406436883124   -1.596930000000000
+    710    3   63.940400000000022   33.224406436883124    1.596930000000000
+    711    1   63.940400000000022   35.070206794487742    0.000000000000000
+    712    2   65.538910000000023   35.993106973290051   -1.596930000000000
+    713    3   65.538910000000023   35.993106973290051    1.596930000000000
+    714    1   65.538910000000023   37.838907330894669    0.000000000000000
+    715    2   67.137420000000023   38.761807509696978   -1.596930000000000
+    716    3   67.137420000000023   38.761807509696978    1.596930000000000
+    717    1   67.137420000000023   40.607607867301596    0.000000000000000
+    718    2   68.735930000000024   41.530508046103905   -1.596930000000000
+    719    3   68.735930000000024   41.530508046103905    1.596930000000000
+    720    1   68.735930000000024   43.376308403708523    0.000000000000000
+    721    2   47.955300000000017    0.000000000000000   -1.596930000000000
+    722    3   47.955300000000017    0.000000000000000    1.596930000000000
+    723    1   47.955300000000017    1.845800357604618    0.000000000000000
+    724    2   49.553810000000017    2.768700536406927   -1.596930000000000
+    725    3   49.553810000000017    2.768700536406927    1.596930000000000
+    726    1   49.553810000000017    4.614500894011545    0.000000000000000
+    727    2   51.152320000000018    5.537401072813854   -1.596930000000000
+    728    3   51.152320000000018    5.537401072813854    1.596930000000000
+    729    1   51.152320000000018    7.383201430418472    0.000000000000000
+    730    2   52.750830000000018    8.306101609220781   -1.596930000000000
+    731    3   52.750830000000018    8.306101609220781    1.596930000000000
+    732    1   52.750830000000018   10.151901966825399    0.000000000000000
+    733    2   54.349340000000019   11.074802145627708   -1.596930000000000
+    734    3   54.349340000000019   11.074802145627708    1.596930000000000
+    735    1   54.349340000000019   12.920602503232326    0.000000000000000
+    736    2   55.947850000000019   13.843502682034635   -1.596930000000000
+    737    3   55.947850000000019   13.843502682034635    1.596930000000000
+    738    1   55.947850000000019   15.689303039639253    0.000000000000000
+    739    2   57.546360000000020   16.612203218441562   -1.596930000000000
+    740    3   57.546360000000020   16.612203218441562    1.596930000000000
+    741    1   57.546360000000020   18.458003576046180    0.000000000000000
+    742    2   59.144870000000020   19.380903754848489   -1.596930000000000
+    743    3   59.144870000000020   19.380903754848489    1.596930000000000
+    744    1   59.144870000000020   21.226704112453107    0.000000000000000
+    745    2   60.743380000000021   22.149604291255416   -1.596930000000000
+    746    3   60.743380000000021   22.149604291255416    1.596930000000000
+    747    1   60.743380000000021   23.995404648860034    0.000000000000000
+    748    2   62.341890000000022   24.918304827662343   -1.596930000000000
+    749    3   62.341890000000022   24.918304827662343    1.596930000000000
+    750    1   62.341890000000022   26.764105185266961    0.000000000000000
+    751    2   63.940400000000022   27.687005364069270   -1.596930000000000
+    752    3   63.940400000000022   27.687005364069270    1.596930000000000
+    753    1   63.940400000000022   29.532805721673888    0.000000000000000
+    754    2   65.538910000000023   30.455705900476197   -1.596930000000000
+    755    3   65.538910000000023   30.455705900476197    1.596930000000000
+    756    1   65.538910000000023   32.301506258080815    0.000000000000000
+    757    2   67.137420000000023   33.224406436883124   -1.596930000000000
+    758    3   67.137420000000023   33.224406436883124    1.596930000000000
+    759    1   67.137420000000023   35.070206794487742    0.000000000000000
+    760    2   68.735930000000024   35.993106973290051   -1.596930000000000
+    761    3   68.735930000000024   35.993106973290051    1.596930000000000
+    762    1   68.735930000000024   37.838907330894669    0.000000000000000
+    763    2   70.334440000000024   38.761807509696978   -1.596930000000000
+    764    3   70.334440000000024   38.761807509696978    1.596930000000000
+    765    1   70.334440000000024   40.607607867301596    0.000000000000000
+    766    2   71.932950000000025   41.530508046103905   -1.596930000000000
+    767    3   71.932950000000025   41.530508046103905    1.596930000000000
+    768    1   71.932950000000025   43.376308403708523    0.000000000000000
diff --git a/examples/threebody/tmd.sw.mod b/examples/threebody/tmd.sw.mod
new file mode 120000
index 0000000000..0affacdd40
--- /dev/null
+++ b/examples/threebody/tmd.sw.mod
@@ -0,0 +1 @@
+../../potentials/tmd.sw.mod
\ No newline at end of file
diff --git a/lib/gpu/Makefile.cuda_mps b/lib/gpu/Makefile.cuda_mps
index f52bd07fcf..d7820e4c34 100644
--- a/lib/gpu/Makefile.cuda_mps
+++ b/lib/gpu/Makefile.cuda_mps
@@ -1,5 +1,5 @@
 # /* ----------------------------------------------------------------------
-#  Generic Linux Makefile for CUDA
+#  Generic Linux Makefile for CUDA with the Multi-Process Service (MPS)
 #     - change CUDA_ARCH for your GPU
 # ------------------------------------------------------------------------- */
 
diff --git a/lib/gpu/Makefile.hip b/lib/gpu/Makefile.hip
index a736988596..d5391f7d6b 100644
--- a/lib/gpu/Makefile.hip
+++ b/lib/gpu/Makefile.hip
@@ -39,11 +39,9 @@ HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
 HIP_COMPILER=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
 
 ifeq (hcc,$(HIP_PLATFORM))
-	HIP_OPTS  += -ffast-math
 	# possible values: gfx803,gfx900,gfx906
 	HIP_ARCH = gfx906
 else ifeq (amd,$(HIP_PLATFORM))
-	HIP_OPTS  += -ffast-math
 	# possible values: gfx803,gfx900,gfx906
 	HIP_ARCH = gfx906
 else ifeq (nvcc,$(HIP_PLATFORM))
diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux
index 0b3084cbe9..bed6848980 100644
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@@ -1,5 +1,5 @@
 # /* ----------------------------------------------------------------------   
-#  Generic Linux Makefile for CUDA 
+#  Generic Linux Makefile for CUDA
 #     - Change CUDA_ARCH for your GPU
 # ------------------------------------------------------------------------- */
 
@@ -13,7 +13,7 @@ endif
 
 NVCC = nvcc
 
-# obsolete hardware. not supported by current drivers anymore.
+# obsolete hardware. not supported by current drivers and toolkits anymore.
 #CUDA_ARCH = -arch=sm_13
 #CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE
 
@@ -28,11 +28,11 @@ NVCC = nvcc
 #CUDA_ARCH = -arch=sm_37
 
 # Maxwell hardware
-CUDA_ARCH = -arch=sm_50
+#CUDA_ARCH = -arch=sm_50
 #CUDA_ARCH = -arch=sm_52
 
 # Pascal hardware
-#CUDA_ARCH = -arch=sm_60
+CUDA_ARCH = -arch=sm_60
 #CUDA_ARCH = -arch=sm_61
 
 # Volta hardware
@@ -70,7 +70,7 @@ LIB_DIR = ./
 AR = ar
 BSH = /bin/sh
 
-# GPU binning not recommended with modern GPUs
+# GPU binning not recommended for most modern GPUs
 CUDPP_OPT = #-DUSE_CUDPP -Icudpp_mini
 
 include Nvidia.makefile
diff --git a/lib/gpu/Makefile.linux_multi b/lib/gpu/Makefile.linux_multi
index 05b869879e..f3d89fd9f0 100644
--- a/lib/gpu/Makefile.linux_multi
+++ b/lib/gpu/Makefile.linux_multi
@@ -1,6 +1,6 @@
 # /* ----------------------------------------------------------------------   
-#  Generic Linux Makefile for CUDA 
-#     - Change CUDA_ARCH for your GPU
+#  Generic Linux Makefile for CUDA complied for multiple compute capabilities
+#     - Add your GPU to CUDA_CODE
 # ------------------------------------------------------------------------- */
 
 # which file will be copied to Makefile.lammps
diff --git a/lib/gpu/Makefile.mpi b/lib/gpu/Makefile.mpi
new file mode 120000
index 0000000000..8bad27d081
--- /dev/null
+++ b/lib/gpu/Makefile.mpi
@@ -0,0 +1 @@
+Makefile.linux
\ No newline at end of file
diff --git a/lib/gpu/Makefile.serial b/lib/gpu/Makefile.serial
index d24b03f8d6..6c94911f32 100644
--- a/lib/gpu/Makefile.serial
+++ b/lib/gpu/Makefile.serial
@@ -1,5 +1,5 @@
 # /* ----------------------------------------------------------------------   
-#  Generic Linux Makefile for CUDA 
+#  Generic Linux Makefile for CUDA without MPI libraries
 #     - Change CUDA_ARCH for your GPU
 # ------------------------------------------------------------------------- */
 
@@ -28,11 +28,11 @@ NVCC = nvcc
 #CUDA_ARCH = -arch=sm_37
 
 # Maxwell hardware
-CUDA_ARCH = -arch=sm_50
+#CUDA_ARCH = -arch=sm_50
 #CUDA_ARCH = -arch=sm_52
 
 # Pascal hardware
-#CUDA_ARCH = -arch=sm_60
+CUDA_ARCH = -arch=sm_60
 #CUDA_ARCH = -arch=sm_61
 
 # Volta hardware
@@ -41,6 +41,10 @@ CUDA_ARCH = -arch=sm_50
 # Turing hardware
 #CUDA_ARCH = -arch=sm_75
 
+# Ampere hardware
+#CUDA_ARCH = -arch=sm_80
+#CUDA_ARCH = -arch=sm_86
+
 # this setting should match LAMMPS Makefile
 # one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
 
diff --git a/lib/gpu/Makefile.turing b/lib/gpu/Makefile.turing
deleted file mode 100644
index 390de9c558..0000000000
--- a/lib/gpu/Makefile.turing
+++ /dev/null
@@ -1,23 +0,0 @@
-NVCC  = $(CUDA_HOME)/bin/nvcc
-EXTRAMAKE = Makefile.lammps.standard
-
-CUDA_ARCH = -arch=sm_75
-CUDA_PRECISION = -D_SINGLE_DOUBLE
-CUDA_INCLUDE = -I$(CUDA_HOME)/include
-CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64 -lcudart
-CUDA_OPTS = -DUNIX -O3 --use_fast_math --ftz=true
-
-CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include
-CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON -DLAMMPS_SMALLBIG
-
-BIN_DIR = .
-OBJ_DIR = obj
-LIB_DIR = .
-AR = ar
-BSH = /bin/sh
-
-# GPU binning not recommended with most modern GPUs
-CUDPP_OPT = #-DUSE_CUDPP -Icudpp_mini
-
-include Nvidia.makefile
-
diff --git a/lib/kim/Install.py b/lib/kim/Install.py
index ae4b356ba9..da0a4296d6 100644
--- a/lib/kim/Install.py
+++ b/lib/kim/Install.py
@@ -17,6 +17,8 @@ parser = ArgumentParser(prog='Install.py',
 
 # settings
 
+CMAKE = os.environ.get('CMAKE') or 'cmake'
+
 thisdir = fullpath('.')
 version = "2.2.1"
 
@@ -141,7 +143,7 @@ if buildflag:
   # configure kim-api
 
   print("Configuring kim-api ...")
-  cmd = 'cd "%s/kim-api-%s" && mkdir build && cd build && cmake .. -DCMAKE_INSTALL_PREFIX="%s" -DCMAKE_BUILD_TYPE=Release' % (thisdir,version,kimdir)
+  cmd = 'cd "%s/kim-api-%s" && mkdir build && cd build && %s .. -DCMAKE_INSTALL_PREFIX="%s" -DCMAKE_BUILD_TYPE=Release' % (thisdir,version,CMAKE,kimdir)
   txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
   if verboseflag: print(txt.decode("UTF-8"))
 
diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md
index 7bb6de4cd9..2e779791dd 100644
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@@ -1,5 +1,165 @@
 # Change Log
 
+## [3.5.00](https://github.com/kokkos/kokkos/tree/3.5.00) (2021-10-19)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.01...3.5.00)
+
+### Features:
+
+- Add support for quad-precision math functions/traits [\#4098](https://github.com/kokkos/kokkos/pull/4098)
+- Adding ExecutionSpace partitioning function [\#4096](https://github.com/kokkos/kokkos/pull/4096)
+- Improve Python Interop Capabilities [\#4065](https://github.com/kokkos/kokkos/pull/4065)
+- Add half_t Kokkos::rand specialization [\#3922](https://github.com/kokkos/kokkos/pull/3922)
+- Add math special functions: erf, erfcx, expint1, Bessel functions, Hankel functions [\#3920](https://github.com/kokkos/kokkos/pull/3920)
+- Add missing common mathematical functions [\#4043](https://github.com/kokkos/kokkos/pull/4043) [\#4036](https://github.com/kokkos/kokkos/pull/4036) [\#4034](https://github.com/kokkos/kokkos/pull/4034)
+- Let the numeric traits be SFINAE-friendly [\#4038](https://github.com/kokkos/kokkos/pull/4038)
+- Add Desul atomics - enabling memory-order and memory-scope parameters [\#3247](https://github.com/kokkos/kokkos/pull/3247)
+- Add detection idiom from the C++ standard library extension version 2 [\#3980](https://github.com/kokkos/kokkos/pull/3980)
+- Fence Profiling Support in all backends [\#3966](https://github.com/kokkos/kokkos/pull/3966) [\#4304](https://github.com/kokkos/kokkos/pull/4304) [\#4258](https://github.com/kokkos/kokkos/pull/4258) [\#4232](https://github.com/kokkos/kokkos/pull/4232)
+- Significant SYCL enhancements (see below)
+
+### Deprecations:
+
+- Deprecate CUDA_SAFE_CALL and HIP_SAFE_CALL [\#4249](https://github.com/kokkos/kokkos/pull/4249)
+- Deprecate Kokkos::Impl::Timer (Kokkos::Timer has been available for a long time) [\#4201](https://github.com/kokkos/kokkos/pull/4201)
+- Deprecate Experimental::MasterLock [\#4094](https://github.com/kokkos/kokkos/pull/4094)
+- Deprecate Kokkos_TaskPolicy.hpp (headers got reorganized, doesn't remove functionality) [\#4011](https://github.com/kokkos/kokkos/pull/4011)
+- Deprecate backward compatibility features [\#3978](https://github.com/kokkos/kokkos/pull/3978)
+- Update and deprecate is_space::host_memory/execution/mirror_space [\#3973](https://github.com/kokkos/kokkos/pull/3973)
+
+
+### Backends and Archs Enhancements:
+
+- Enabling constbitset constructors in kernels [\#4296](https://github.com/kokkos/kokkos/pull/4296)
+- Use ZeroMemset in View constructor to improve performance [\#4226](https://github.com/kokkos/kokkos/pull/4226)
+- Use memset in deep_copy [\#3944](https://github.com/kokkos/kokkos/pull/3944)
+- Add missing fence() calls in resize(View) that effectively do deep_copy(resized, orig) [\#4212](https://github.com/kokkos/kokkos/pull/4212)
+- Avoid allocations in resize and realloc [\#4207](https://github.com/kokkos/kokkos/pull/4207)
+- StaticCsrGraph: use device type instead of execution space to construct views [\#3991](https://github.com/kokkos/kokkos/pull/3991)
+- Consider std::sort when view is accessible from host [\#3929](https://github.com/kokkos/kokkos/pull/3929)
+- Fix CPP20 warnings except for volatile [\#4312](https://github.com/kokkos/kokkos/pull/4312)
+
+#### SYCL:
+- Introduce SYCLHostUSMSpace [\#4268](https://github.com/kokkos/kokkos/pull/4268)
+- Implement SYCL TeamPolicy for vector_size > 1 [\#4183](https://github.com/kokkos/kokkos/pull/4183)
+- Enable 64bit ranges for SYCL [\#4211](https://github.com/kokkos/kokkos/pull/4211)
+- Don't print SYCL device info in execution space intialization [\#4168](https://github.com/kokkos/kokkos/pull/4168)
+- Improve SYCL MDRangePolicy performance [\#4161](https://github.com/kokkos/kokkos/pull/4161)
+- Use sub_groups in SYCL parallel_scan [\#4147](https://github.com/kokkos/kokkos/pull/4147)
+- Implement subgroup reduction for SYCL RangePolicy parallel_reduce [\#3940](https://github.com/kokkos/kokkos/pull/3940)
+- Use DPC++ broadcast extension in SYCL team_broadcast [\#4103](https://github.com/kokkos/kokkos/pull/4103)
+- Only fence in SYCL parallel_reduce for non-device-accessible result_ptr [\#4089](https://github.com/kokkos/kokkos/pull/4089)
+- Improve fencing behavior in SYCL backend [\#4088](https://github.com/kokkos/kokkos/pull/4088)
+- Fence all registered SYCL queues before deallocating memory [\#4086](https://github.com/kokkos/kokkos/pull/4086)
+- Implement SYCL::print_configuration [\#3992](https://github.com/kokkos/kokkos/pull/3992)
+- Reuse scratch memory in parallel_scan and TeamPolicy (decreases memory footprint) [\#3899](https://github.com/kokkos/kokkos/pull/3899) [\#3889](https://github.com/kokkos/kokkos/pull/3889)
+
+#### CUDA:
+- Cuda improve heuristic for blocksize [\#4271](https://github.com/kokkos/kokkos/pull/4271)
+- Don't use [[deprecated]] for nvcc [\#4229](https://github.com/kokkos/kokkos/pull/4229)
+- Improve error message for NVHPC as host compiler [\#4227](https://github.com/kokkos/kokkos/pull/4227)
+- Update support for cuda reductions to work with types < 4bytes [\#4156](https://github.com/kokkos/kokkos/pull/4156)
+- Fix incompatible team size deduction in rare cases parallel_reduce [\#4142](https://github.com/kokkos/kokkos/pull/4142)
+- Remove UVM usage in DynamicView [\#4129](https://github.com/kokkos/kokkos/pull/4129)
+- Remove dependency between core and containers [\#4114](https://github.com/kokkos/kokkos/pull/4114)
+- Adding opt-in CudaMallocSync support when using CUDA version >= 11.2 [\#4026](https://github.com/kokkos/kokkos/pull/4026) [\#4233](https://github.com/kokkos/kokkos/pull/4233)
+- Fix a potential race condition in the CUDA backend [\#3999](https://github.com/kokkos/kokkos/pull/3999)
+
+#### HIP:
+- Implement new blocksize deduction method for HIP Backend [\#3953](https://github.com/kokkos/kokkos/pull/3953)
+- Add multiple LaunchMechanism [\#3820](https://github.com/kokkos/kokkos/pull/3820)
+- Make HIP backend thread-safe [\#4170](https://github.com/kokkos/kokkos/pull/4170)
+
+#### Serial:
+- Refactor Serial backend and fix thread-safety issue [\#4053](https://github.com/kokkos/kokkos/pull/4053)
+
+#### OpenMPTarget:
+- OpenMPTarget: support array reductions in RangePolicy [\#4040](https://github.com/kokkos/kokkos/pull/4040)
+- OpenMPTarget: add MDRange parallel_reduce [\#4032](https://github.com/kokkos/kokkos/pull/4032)
+- OpenMPTarget: Fix bug in for the case of a reducer. [\#4044](https://github.com/kokkos/kokkos/pull/4044)
+- OpenMPTarget: verify process fix [\#4041](https://github.com/kokkos/kokkos/pull/4041)
+
+### Implemented enhancements BuildSystem
+
+#### Important BuildSystem Updates:
+- Use hipcc architecture autodetection when Kokkos_ARCH is not set [\#3941](https://github.com/kokkos/kokkos/pull/3941)
+- Introduce Kokkos_ENABLE_DEPRECATION_WARNINGS and remove deprecated code with Kokkos_ENABLE_DEPRECATED_CODE_3 [\#4106](https://github.com/kokkos/kokkos/pull/4106) [\#3855](https://github.com/kokkos/kokkos/pull/3855)
+
+#### Other Improvements:
+- Add allow-unsupported-compiler flag to nvcc-wrapper [\#4298](https://github.com/kokkos/kokkos/pull/4298)
+- nvcc_wrapper: fix errors in argument handling [\#3993](https://github.com/kokkos/kokkos/pull/3993)
+- Adds support for -time=<file> and -time <file> in nvcc_wrapper [\#4015](https://github.com/kokkos/kokkos/pull/4015)
+- nvcc_wrapper: suppress duplicates of GPU architecture and RDC flags [\#3968](https://github.com/kokkos/kokkos/pull/3968)
+- Fix TMPDIR support in nvcc_wrapper [\#3792](https://github.com/kokkos/kokkos/pull/3792)
+- NVHPC: update PGI compiler arch flags [\#4133](https://github.com/kokkos/kokkos/pull/4133)
+- Replace PGI with NVHPC (works for both) [\#4196](https://github.com/kokkos/kokkos/pull/4196)
+- Make sure that KOKKOS_CXX_HOST_COMPILER_ID is defined [\#4235](https://github.com/kokkos/kokkos/pull/4235)
+- Add options to Makefile builds for deprecated code and warnings [\#4215](https://github.com/kokkos/kokkos/pull/4215)
+- Use KOKKOS_CXX_HOST_COMPILER_ID for identifying CPU arch flags [\#4199](https://github.com/kokkos/kokkos/pull/4199)
+- Added support for Cray Clang to Makefile.kokkos [\#4176](https://github.com/kokkos/kokkos/pull/4176)
+- Add XLClang as compiler [\#4120](https://github.com/kokkos/kokkos/pull/4120)
+- Keep quoted compiler flags when passing to Trilinos [\#3987](https://github.com/kokkos/kokkos/pull/3987)
+- Add support for AMD Zen3 CPU architecture [\#3972](https://github.com/kokkos/kokkos/pull/3972)
+- Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945)
+- Add cppcoreguidelines-pro-type-cstyle-cast to clang-tidy [\#3522](https://github.com/kokkos/kokkos/pull/3522)
+- Add sve bit size definition for A64FX [\#3947](https://github.com/kokkos/kokkos/pull/3947) [\#3946](https://github.com/kokkos/kokkos/pull/3946)
+- Remove KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES [\#4150](https://github.com/kokkos/kokkos/pull/4150)
+
+### Other Changes:
+
+#### Tool Enhancements:
+
+- Retrieve original value from a point in a MultidimensionalSparseTuningProblem [\#3977](https://github.com/kokkos/kokkos/pull/3977)
+- Allow extension of built-in tuners with additional tuning axes [\#3961](https://github.com/kokkos/kokkos/pull/3961)
+- Added a categorical tuner [\#3955](https://github.com/kokkos/kokkos/pull/3955)
+
+
+#### Miscellaneous:
+
+- hpcbind: Use double quotes around $@ when invoking user command [\#4284](https://github.com/kokkos/kokkos/pull/4284)
+- Add file and line to error message [\#3985](https://github.com/kokkos/kokkos/pull/3985)
+- Fix compiler warnings when compiling with nvc++ [\#4198](https://github.com/kokkos/kokkos/pull/4198)
+- Add OpenMPTarget CI build on AMD GPUs [\#4055](https://github.com/kokkos/kokkos/pull/4055)
+- CI: icpx is now part of intel container [\#4002](https://github.com/kokkos/kokkos/pull/4002)
+
+### Incompatibilities:
+
+- Remove pre CUDA 9 KOKKOS_IMPL_CUDA_* macros [\#4138](https://github.com/kokkos/kokkos/pull/4138)
+
+### Bug Fixes:
+- UnorderedMap::clear() should zero the size() [\#4130](https://github.com/kokkos/kokkos/pull/4130)
+- Add memory fence for HostSharedPtr::cleanup() [\#4144](https://github.com/kokkos/kokkos/pull/4144)
+- SYCL: Fix race conditions in TeamPolicy::parallel_reduce [\#4418](https://github.com/kokkos/kokkos/pull/4418)
+- Adding missing memory fence to serial exec space fence. [\#4292](https://github.com/kokkos/kokkos/pull/4292)
+- Fix using external SYCL queues in tests [\#4291](https://github.com/kokkos/kokkos/pull/4291)
+- Fix digits10 bug [\#4281](https://github.com/kokkos/kokkos/pull/4281)
+- Fixes constexpr errors with frounding-math on gcc < 10. [\#4278](https://github.com/kokkos/kokkos/pull/4278)
+- Fix compiler flags for PGI/NVHPC [\#4264](https://github.com/kokkos/kokkos/pull/4264)
+- Fix Zen2/3 also implying Zen Arch with Makefiles [\#4260](https://github.com/kokkos/kokkos/pull/4260)
+- Kokkos_Cuda.hpp: Fix shadow warning with cuda/11.0 [\#4252](https://github.com/kokkos/kokkos/pull/4252)
+- Fix issue w/ static initialization of function attributes [\#4242](https://github.com/kokkos/kokkos/pull/4242)
+- Disable long double hypot test on Power systems [\#4221](https://github.com/kokkos/kokkos/pull/4221)
+- Fix false sharing in random pool [\#4218](https://github.com/kokkos/kokkos/pull/4218)
+- Fix a missing memory_fence for debug shared alloc code [\#4216](https://github.com/kokkos/kokkos/pull/4216)
+- Fix two xl issues [\#4179](https://github.com/kokkos/kokkos/pull/4179)
+- Makefile.kokkos: fix (standard_in) 1: syntax error [\#4173](https://github.com/kokkos/kokkos/pull/4173)
+- Fixes for query_device example [\#4172](https://github.com/kokkos/kokkos/pull/4172)
+- Fix a bug when using HIP atomic with Kokkos::Complex [\#4159](https://github.com/kokkos/kokkos/pull/4159)
+- Fix mistaken logic in pthread creation [\#4157](https://github.com/kokkos/kokkos/pull/4157)
+- Define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION when requesting Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION=ON [\#4107](https://github.com/kokkos/kokkos/pull/4107)
+- Fix compilation with latest MSVC version [\#4102](https://github.com/kokkos/kokkos/pull/4102)
+- Fix incorrect macro definitions when compiling with Intel compiler on Windows [\#4087](https://github.com/kokkos/kokkos/pull/4087)
+- Fixup global buffer overflow in hand rolled string manipulation [\#4070](https://github.com/kokkos/kokkos/pull/4070)
+- Fixup heap buffer overflow in cmd line args parsing unit tests [\#4069](https://github.com/kokkos/kokkos/pull/4069)
+- Only add quotes in compiler flags for Trilinos if necessary [\#4067](https://github.com/kokkos/kokkos/pull/4067)
+- Fixed invocation of tools init callbacks [\#4061](https://github.com/kokkos/kokkos/pull/4061)
+- Work around SYCL JIT compiler issues with static variables [\#4013](https://github.com/kokkos/kokkos/pull/4013)
+- Fix TestDetectionIdiom.cpp test inclusion for Trilinos/TriBITS [\#4010](https://github.com/kokkos/kokkos/pull/4010)
+- Fixup allocation headers with OpenMPTarget backend [\#4003](https://github.com/kokkos/kokkos/pull/4003)
+- Add missing specialization for OMPT to Kokkos Random [\#3967](https://github.com/kokkos/kokkos/pull/3967)
+- Disable hypot long double test on power arches [\#3962](https://github.com/kokkos/kokkos/pull/3962)
+- Use different EBO workaround for MSVC (rebased) [\#3924](https://github.com/kokkos/kokkos/pull/3924)
+- Fix SYCL Kokkos::Profiling::(de)allocateData calls [\#3928](https://github.com/kokkos/kokkos/pull/3928)
+
 ## [3.4.01](https://github.com/kokkos/kokkos/tree/3.4.01) (2021-05-19)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.00...3.4.01)
 
diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt
index 9452027d8e..1b6753f983 100644
--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@@ -111,8 +111,8 @@ ENDIF()
 
 
 set(Kokkos_VERSION_MAJOR 3)
-set(Kokkos_VERSION_MINOR 4)
-set(Kokkos_VERSION_PATCH 01)
+set(Kokkos_VERSION_MINOR 5)
+set(Kokkos_VERSION_PATCH 00)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 
@@ -210,7 +210,12 @@ IF (KOKKOS_HAS_TRILINOS)
   # which needs another workaround.
   SET(KOKKOS_COMPILE_OPTIONS_TMP)
   FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
-    LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP \"${OPTION}\")
+    STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
+    IF(OPTION_HAS_WHITESPACE EQUAL -1)
+      LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}")
+    ELSE()
+      LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"")
+    ENDIF()
   ENDFOREACH()
   STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}")
   LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS})
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 2a984eefb6..7ffea5a62c 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -11,20 +11,21 @@ CXXFLAGS += $(SHFLAGS)
 endif
 
 KOKKOS_VERSION_MAJOR = 3
-KOKKOS_VERSION_MINOR = 4
-KOKKOS_VERSION_PATCH = 01
+KOKKOS_VERSION_MINOR = 5
+KOKKOS_VERSION_PATCH = 00
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
-# Options: Cuda,HIP,OpenMP,Pthread,Serial
+# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Pthread,Serial
 KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Pthread"
-# Options: 
+# Options:
 # Intel:    KNC,KNL,SNB,HSW,BDW,SKX
 # NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM:      BGQ,Power7,Power8,Power9
-# AMD-GPUS: Vega900,Vega906,Vega908
+# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
+# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP
 KOKKOS_ARCH ?= ""
 # Options: yes,no
 KOKKOS_DEBUG ?= "no"
@@ -32,8 +33,8 @@ KOKKOS_DEBUG ?= "no"
 KOKKOS_USE_TPLS ?= ""
 # Options: c++14,c++1y,c++17,c++1z,c++2a
 KOKKOS_CXX_STANDARD ?= "c++14"
-# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align
-KOKKOS_OPTIONS ?= ""
+# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings,enable_desul_atomics
+KOKKOS_OPTIONS ?= "enable_desul_atomics"
 KOKKOS_CMAKE ?= "no"
 KOKKOS_TRIBITS ?= "no"
 KOKKOS_STANDALONE_CMAKE ?= "no"
@@ -80,7 +81,7 @@ KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),exper
 
 # Check for advanced settings.
 KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings)
-KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
+KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
 KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning)
 KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align)
 KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
@@ -92,6 +93,9 @@ KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
 KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr)
 KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
+KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics)
+KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
+KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings)
 
 KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc)
 
@@ -112,6 +116,7 @@ endif
 # Check for other Execution Spaces.
 KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda)
 KOKKOS_INTERNAL_USE_HIP := $(call kokkos_has_string,$(KOKKOS_DEVICES),HIP)
+KOKKOS_INTERNAL_USE_SYCL := $(call kokkos_has_string,$(KOKKOS_DEVICES),SYCL)
 KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget)
 
 KOKKOS_DEVICELIST =
@@ -133,11 +138,18 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   KOKKOS_DEVICELIST += HIP
 endif
+KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
+                                                  + $(KOKKOS_INTERNAL_ENABLE_CXX20) \
+                                                  + $(KOKKOS_INTERNAL_ENABLE_CXX2A))
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+  KOKKOS_DEVICELIST += SYCL
+  ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
+    $(error SYCL backend requires C++17 or newer)
+  endif
+
+endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
   KOKKOS_DEVICELIST += OPENMPTARGET
-  KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
-                                                    + $(KOKKOS_INTERNAL_ENABLE_CXX20) \
-                                                    + $(KOKKOS_INTERNAL_ENABLE_CXX2A))
   ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
     $(error OpenMPTarget backend requires C++17 or newer)
   endif
@@ -168,6 +180,8 @@ KOKKOS_INTERNAL_COMPILER_XL          := $(strip $(shell $(CXX) -qversion       2
 KOKKOS_INTERNAL_COMPILER_CRAY        := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-"))
 KOKKOS_INTERNAL_COMPILER_NVCC        := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc))
 KOKKOS_INTERNAL_COMPILER_CLANG       := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
+KOKKOS_INTERNAL_COMPILER_CRAY_CLANG  := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++"))
+KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI)
 KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang)
 KOKKOS_INTERNAL_COMPILER_HCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
 KOKKOS_INTERNAL_COMPILER_GCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC)
@@ -247,7 +261,11 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
   KOKKOS_INTERNAL_OPENMP_FLAG := -mp
 else
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 1)
+    KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+    else
     KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
+    endif
   else
     ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
       KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
@@ -259,7 +277,11 @@ else
           # OpenMP is turned on by default in Cray compiler environment.
           KOKKOS_INTERNAL_OPENMP_FLAG :=
         else
-          KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+          ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1)
+            KOKKOS_INTERNAL_OPENMP_FLAG := -fiopenmp
+          else
+            KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+          endif
         endif
       endif
     endif
@@ -317,6 +339,13 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW)
 KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX)
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL)
 
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP)
+
 # NVIDIA based.
 NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper
 KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler30)
@@ -384,20 +413,25 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
 KOKKOS_INTERNAL_USE_ARCH_ZEN3 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen3)
 KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2)
-KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0)
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 0)
+    KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
+  endif
+endif
 KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900)
 KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906)
 KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908)
+KOKKOS_INTERNAL_USE_ARCH_VEGA90A := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega90A)
 
 # Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
 KOKKOS_INTERNAL_USE_ARCH_AVX        := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
-KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
+KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
 KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))
 
 # Decide what ISA level we are able to support.
-KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
+KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ISA_KNC       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
 KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
 KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
@@ -406,7 +440,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POW
 KOKKOS_INTERNAL_USE_TM            := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))
 
 # Incompatible flags?
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc )
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc)
 KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc)
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@@ -442,6 +476,10 @@ KOKKOS_LINK_FLAGS =
 KOKKOS_SRC =
 KOKKOS_HEADERS =
 
+#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
+  KOKKOS_LIBS += -latomic
+#endif
+
 # Generating the KokkosCore_config.h file.
 
 KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@@ -478,6 +516,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP')
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_SYCL')
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
   tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
   ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
@@ -533,6 +575,12 @@ endif
 
 #only add the c++ standard flags if this is not CMake
 tmp := $(call kokkos_append_header,"/* General Settings */")
+ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATED_CODE_3")
+endif
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATION_WARNINGS")
+endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1)
 ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
   KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG)
@@ -635,8 +683,10 @@ endif
 
 tmp := $(call kokkos_append_header,"/* Optimization Settings */")
 
-ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
+ifeq ($(KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION), 1)
+  # deprecated
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION")
 endif
 
 tmp := $(call kokkos_append_header,"/* Cuda Settings */")
@@ -1166,6 +1216,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908")
     KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908
   endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 90A")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A")
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx90a
+  endif
 
 
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp)
@@ -1184,6 +1239,52 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   endif
 endif
 
+# Figure out the architecture flag for SYCL.
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+  # Lets start with adding architecture defines
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp"
+  endif
+
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.hpp)
+
+  KOKKOS_CXXFLAGS+=-fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda
+  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+  KOKKOS_LDFLAGS+=-fsycl
+  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS")
+endif
 
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
 
@@ -1196,56 +1297,62 @@ endif
 ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
   tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
 
-# Functions for generating config header file
-kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1)
-kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3)
-kokkos_append_config_header = $(shell echo $1 >> $2))
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp")
-tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")
-ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
-   ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
-   else
-   endif
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
+  # Functions for generating config header file
+  kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1)
+  kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3)
+  kokkos_append_config_header = $(shell echo $1 >> $2))
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp")
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp")
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp")
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")
+  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
+    ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
+    else
+    endif
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SYCL.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SYCL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_SYCL.hpp>","KokkosCore_Config_SetupBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
 endif
+
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
@@ -1257,6 +1364,9 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+  ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
+    KOKKOS_SRC += $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+  endif
   KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
   ifneq ($(CUDA_PATH),)
     KOKKOS_CPPLAGS += -I$(CUDA_PATH)/include
diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets
index cf9fc24242..93854d0cf1 100644
--- a/lib/kokkos/Makefile.targets
+++ b/lib/kokkos/Makefile.targets
@@ -48,6 +48,17 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+Kokkos_SYCL.o : $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp
+Kokkos_SYCL_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp
+Kokkos_SYCL_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md
index d55ef2caac..673f462712 100644
--- a/lib/kokkos/README.md
+++ b/lib/kokkos/README.md
@@ -7,7 +7,7 @@ applications targeting all major HPC platforms. For that purpose it provides
 abstractions for both parallel execution of code and data management.
 Kokkos is designed to target complex node architectures with N-level memory
 hierarchies and multiple types of execution resources. It currently can use
-CUDA, HPX, OpenMP and Pthreads as backend programming models with several other
+CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other
 backends in development.
 
 Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem,
@@ -16,29 +16,19 @@ profiling and debugging tools (https://github.com/kokkos/kokkos-tools).
 
 # Learning about Kokkos
 
-A programming guide can be found on the Wiki, the API reference is under development.
+The best way to start learning about Kokkos is going through the Kokkos Lectures.
+They are online available at https://kokkos.link/the-lectures and contain a mix
+of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem
+capabilities.
+
+A programming guide and API reference can be found on the Wiki
+(https://github.com/kokkos/kokkos/wiki).
 
 For questions find us on Slack: https://kokkosteam.slack.com or open a github issue.
 
 For non-public questions send an email to
 crtrott(at)sandia.gov
 
-A separate repository with extensive tutorial material can be found under
-https://github.com/kokkos/kokkos-tutorials.
-
-Furthermore, the 'example/tutorial' directory provides step by step tutorial
-examples which explain many of the features of Kokkos. They work with
-simple Makefiles. To build with g++ and OpenMP simply type 'make'
-in the 'example/tutorial' directory. This will build all examples in the
-subfolders. To change the build options refer to the Programming Guide
-in the compilation section.
-
-To learn more about Kokkos consider watching one of our presentations:
-* GTC 2015:
-  - http://on-demand.gputechconf.com/gtc/2015/video/S5166.html
-  - http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf
-
-
 # Contributing to Kokkos
 
 We are open and try to encourage contributions from external developers.
@@ -53,57 +43,40 @@ For specifics see the LICENSE file contained in the repository or distribution.
 
 # Requirements
 
-### Primary tested compilers on X86 are:
-* GCC 5.3.0
-* GCC 5.4.0
-* GCC 5.5.0
-* GCC 6.1.0
-* GCC 7.2.0
-* GCC 7.3.0
-* GCC 8.1.0
-* Intel 17.0.1
-* Intel 17.4.196
-* Intel 18.2.128
-* Clang 4.0.0
-* Clang 6.0.0 for CUDA (CUDA Toolkit 9.0)
-* Clang 7.0.0 for CUDA (CUDA Toolkit 9.1)
-* Clang 8.0.0 for CUDA (CUDA Toolkit 9.2)
-* PGI 18.7
-* NVCC 9.1 for CUDA (with gcc 6.1.0)
-* NVCC 9.2 for CUDA (with gcc 7.2.0)
-* NVCC 10.0 for CUDA (with gcc 7.4.0)
-* NVCC 10.1 for CUDA (with gcc 7.4.0)
-* NVCC 11.0 for CUDA (with gcc 8.4.0)
+### Minimum Compiler Versions
 
-### Primary tested compilers on Power 8 are:
-* GCC 6.4.0 (OpenMP,Serial)
-* GCC 7.2.0 (OpenMP,Serial)
-* IBM XL 16.1.0 (OpenMP, Serial)
-* NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0)
+Generally Kokkos should work with all compiler versions newer than the minimum.
+However as in all sufficiently complex enough code, we have to work around compiler
+bugs with almost all compilers. So compiler versions we don't test may have issues
+we are unaware off.
 
-### Primary tested compilers on Intel KNL are:
-* Intel 17.2.174 (with gcc 6.2.0 and 6.4.0)
-* Intel 18.2.199 (with gcc 6.2.0 and 6.4.0)
+* GCC: 5.3.0
+* Clang: 4.0.0
+* Intel: 17.0.1
+* NVCC: 9.2.88
+* NVC++: 21.5
+* ROCM: 4.3
+* MSVC: 19.29
+* IBM XL: 16.1.1
+* Fujitsu: 4.5.0
+* ARM/Clang 20.1
 
-### Primary tested compilers on ARM (Cavium ThunderX2)
-* GCC 7.2.0
-* ARM/Clang 18.4.0
+### Primary Tested Compilers
 
-### Other compilers working:
-* X86:
-    * Cygwin 2.1.0 64bit with gcc 4.9.3
-    * GCC 8.1.0 (not warning free)
-
-### Known non-working combinations:
-* Power8:
-    * Pthreads backend
-* ARM
-    * Pthreads backend
+* GCC: 5.3.0, 6.1.0, 7.3.0, 8.3, 9.2, 10.0
+* NVCC: 9.2.88, 10.1, 11.0
+* Clang: 8.0.0, 9.0.0, 10.0.0, 12.0.0
+* Intel 17.4, 18.1, 19.5
+* MSVC: 19.29
+* ARM/Clang: 20.1
+* IBM XL: 16.1.1
+* ROCM: 4.3.0
 
 ### Build system:
-* CMake >= 3.10: required
-* CMake >= 3.13: recommended
+
+* CMake >= 3.16: required
 * CMake >= 3.18: Fortran linkage. This does not affect most mixed Fortran/Kokkos builds. See [build issues](BUILD.md#KnownIssues).
+* CMake >= 3.21.1 for NVC++
 
 Primary tested compiler are passing in release mode
 with warnings as errors. They also are tested with a comprehensive set of
@@ -153,7 +126,6 @@ cmake $srcdir \
   -DCMAKE_INSTALL_PREFIX=$path_to_install \
   -DKokkos_ENABLE_OPENMP=On \
   -DKokkos_ARCH_HSW=On \
-  -DKokkos_ENABLE_HWLOC=On \
   -DKokkos_HWLOC_DIR=$path_to_hwloc
 ````
 then simply type `make install`. The Kokkos CMake package will then be installed in `$path_to_install` to be used by downstream packages.
@@ -212,23 +184,8 @@ where `...` is the unique spec identifying the particular Kokkos configuration a
 Some more details can found in the Kokkos spack [documentation](Spack.md) or the Spack [website](https://spack.readthedocs.io/en/latest).
 
 ## Raw Makefile
-A bash script is provided to generate raw makefiles.
-To install Kokkos as a library create a build directory and run the following
-````bash
-> $KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install
-````
-Once the Makefile is generated, run:
-````bash
-> make kokkoslib
-> make install
-````
-To additionally run the unit tests:
-````bash
-> make build-test
-> make test
-````
-Run `generate_makefile.bash --help` for more detailed options such as
-changing the device type for which to build.
+
+Raw Makefiles are only supported via inline builds. See below.
 
 ## Inline Builds vs. Installed Package
 For individual projects, it may be preferable to build Kokkos inline rather than link to an installed package.
@@ -268,6 +225,35 @@ more than a single GPU is used by a single process.
 
 If you publish work which mentions Kokkos, please cite the following paper:
 
+````BibTex
+@ARTICLE{9485033,
+  author={Trott, Christian R. and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S. and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah},
+  journal={IEEE Transactions on Parallel and Distributed Systems},
+  title={Kokkos 3: Programming Model Extensions for the Exascale Era},
+  year={2022},
+  volume={33},
+  number={4},
+  pages={805-817},
+  doi={10.1109/TPDS.2021.3097283}}
+````
+
+If you use more than one Kokkos EcoSystem package, please also cite:
+
+````BibTex
+@ARTICLE{9502936,
+  author={Trott, Christian and Berger-Vergiat, Luc and Poliakoff, David and Rajamanickam, Sivasankaran and Lebrun-Grandie, Damien and Madsen, Jonathan and Al Awar, Nader and Gligoric, Milos and Shipman, Galen and Womeldorff, Geoff},
+  journal={Computing in Science   Engineering},
+  title={The Kokkos EcoSystem: Comprehensive Performance Portability for High Performance Computing},
+  year={2021},
+  volume={23},
+  number={5},
+  pages={10-18},
+  doi={10.1109/MCSE.2021.3098509}}
+````
+
+
+And if you feel generous: feel free to cite the original Kokkos paper which describes most of the basic Kokkos concepts:
+
 ````BibTeX
 @article{CarterEdwards20143202,
   title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ",
diff --git a/lib/kokkos/algorithms/CMakeLists.txt b/lib/kokkos/algorithms/CMakeLists.txt
index 4df76a1dbb..eb54db8a55 100644
--- a/lib/kokkos/algorithms/CMakeLists.txt
+++ b/lib/kokkos/algorithms/CMakeLists.txt
@@ -5,9 +5,7 @@ KOKKOS_SUBPACKAGE(Algorithms)
 IF (NOT Kokkos_INSTALL_TESTING)
   ADD_SUBDIRECTORY(src)
 ENDIF()
-IF(NOT (KOKKOS_ENABLE_OPENMPTARGET
-        AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR
-             KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)))
+IF(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
   KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
 ENDIF()
 
diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index 55ce19971f..46b8ab87fa 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -447,6 +447,25 @@ struct rand<Generator, unsigned long long> {
   }
 };
 
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+template <class Generator>
+struct rand<Generator, Kokkos::Experimental::half_t> {
+  using half = Kokkos::Experimental::half_t;
+  KOKKOS_INLINE_FUNCTION
+  static half max() { return half(1.0); }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen) { return half(gen.frand()); }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen, const half& range) {
+    return half(gen.frand(float(range)));
+  }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen, const half& start, const half& end) {
+    return half(gen.frand(float(start), float(end)));
+  }
+};
+#endif  // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+
 template <class Generator>
 struct rand<Generator, float> {
   KOKKOS_INLINE_FUNCTION
@@ -600,7 +619,7 @@ struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::OpenMPTarget>
 
 template <class ExecutionSpace>
 struct Random_UniqueIndex {
-  using locks_view_type = View<int*, ExecutionSpace>;
+  using locks_view_type = View<int**, ExecutionSpace>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type) {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
@@ -615,7 +634,7 @@ struct Random_UniqueIndex {
 #ifdef KOKKOS_ENABLE_CUDA
 template <>
 struct Random_UniqueIndex<Kokkos::Cuda> {
-  using locks_view_type = View<int*, Kokkos::Cuda>;
+  using locks_view_type = View<int**, Kokkos::Cuda>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks_) {
 #ifdef __CUDA_ARCH__
@@ -625,7 +644,7 @@ struct Random_UniqueIndex<Kokkos::Cuda> {
                  blockDim.x * blockDim.y * blockDim.z +
              i_offset) %
             locks_.extent(0);
-    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
       i += blockDim.x * blockDim.y * blockDim.z;
       if (i >= static_cast<int>(locks_.extent(0))) {
         i = i_offset;
@@ -643,7 +662,7 @@ struct Random_UniqueIndex<Kokkos::Cuda> {
 #ifdef KOKKOS_ENABLE_HIP
 template <>
 struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
-  using locks_view_type = View<int*, Kokkos::Experimental::HIP>;
+  using locks_view_type = View<int**, Kokkos::Experimental::HIP>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks_) {
 #ifdef __HIP_DEVICE_COMPILE__
@@ -653,7 +672,7 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
                  blockDim.x * blockDim.y * blockDim.z +
              i_offset) %
             locks_.extent(0);
-    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
       i += blockDim.x * blockDim.y * blockDim.z;
       if (i >= static_cast<int>(locks_.extent(0))) {
         i = i_offset;
@@ -671,15 +690,15 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
 #ifdef KOKKOS_ENABLE_SYCL
 template <>
 struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
-  using locks_view_type = View<int*, Kokkos::Experimental::SYCL>;
+  using locks_view_type = View<int**, Kokkos::Experimental::SYCL>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks_) {
-#ifdef KOKKOS_ARCH_INTEL_GEN
+#ifdef KOKKOS_ARCH_INTEL_GPU
     int i = Kokkos::Impl::clock_tic() % locks_.extent(0);
 #else
     int i = 0;
 #endif
-    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
       i = (i + 1) % static_cast<int>(locks_.extent(0));
     }
     return i;
@@ -690,14 +709,14 @@ struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
 template <>
 struct Random_UniqueIndex<Kokkos::Experimental::OpenMPTarget> {
-  using locks_view_type = View<int*, Kokkos::Experimental::OpenMPTarget>;
+  using locks_view_type = View<int**, Kokkos::Experimental::OpenMPTarget>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks) {
     const int team_size = omp_get_num_threads();
     int i               = omp_get_team_num() * team_size + omp_get_thread_num();
     const int lock_size = locks.extent_int(0);
 
-    while (Kokkos::atomic_compare_exchange(&locks(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) {
       i = (i + 1) % lock_size;
     }
     return i;
@@ -856,18 +875,22 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
 class Random_XorShift64_Pool {
  private:
   using execution_space = typename DeviceType::execution_space;
-  using locks_type      = View<int*, execution_space>;
-  using state_data_type = View<uint64_t*, DeviceType>;
+  using locks_type      = View<int**, execution_space>;
+  using state_data_type = View<uint64_t**, DeviceType>;
   locks_type locks_;
   state_data_type state_;
   int num_states_;
+  int padding_;
 
  public:
   using generator_type = Random_XorShift64<DeviceType>;
   using device_type    = DeviceType;
 
   KOKKOS_INLINE_FUNCTION
-  Random_XorShift64_Pool() { num_states_ = 0; }
+  Random_XorShift64_Pool() {
+    num_states_ = 0;
+    padding_    = 0;
+  }
   Random_XorShift64_Pool(uint64_t seed) {
     num_states_ = 0;
 
@@ -883,16 +906,22 @@ class Random_XorShift64_Pool {
     locks_      = src.locks_;
     state_      = src.state_;
     num_states_ = src.num_states_;
+    padding_    = src.padding_;
     return *this;
   }
 
   void init(uint64_t seed, int num_states) {
     if (seed == 0) seed = uint64_t(1318319);
-
+    // I only want to pad on CPU like archs (less than 1000 threads). 64 is a
+    // magic number, or random number I just wanted something not too large and
+    // not too small. 64 sounded fine.
+    padding_    = num_states < 1000 ? 64 : 1;
     num_states_ = num_states;
 
-    locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_);
-    state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_);
+    locks_ =
+        locks_type("Kokkos::Random_XorShift64::locks", num_states, padding_);
+    state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_,
+                             padding_);
 
     typename state_data_type::HostMirror h_state = create_mirror_view(state_);
     typename locks_type::HostMirror h_lock       = create_mirror_view(locks_);
@@ -902,15 +931,15 @@ class Random_XorShift64_Pool {
         gen(seed, 0);
     for (int i = 0; i < 17; i++) gen.rand();
     for (int i = 0; i < num_states_; i++) {
-      int n1     = gen.rand();
-      int n2     = gen.rand();
-      int n3     = gen.rand();
-      int n4     = gen.rand();
-      h_state(i) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
-                   (((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
-                   (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
-                   (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
-      h_lock(i) = 0;
+      int n1        = gen.rand();
+      int n2        = gen.rand();
+      int n3        = gen.rand();
+      int n4        = gen.rand();
+      h_state(i, 0) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
+                      (((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
+                      (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
+                      (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
+      h_lock(i, 0) = 0;
     }
     deep_copy(state_, h_state);
     deep_copy(locks_, h_lock);
@@ -920,19 +949,19 @@ class Random_XorShift64_Pool {
   Random_XorShift64<DeviceType> get_state() const {
     const int i =
         Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
-    return Random_XorShift64<DeviceType>(state_(i), i);
+    return Random_XorShift64<DeviceType>(state_(i, 0), i);
   }
 
   // NOTE: state_idx MUST be unique and less than num_states
   KOKKOS_INLINE_FUNCTION
   Random_XorShift64<DeviceType> get_state(const int state_idx) const {
-    return Random_XorShift64<DeviceType>(state_(state_idx), state_idx);
+    return Random_XorShift64<DeviceType>(state_(state_idx, 0), state_idx);
   }
 
   KOKKOS_INLINE_FUNCTION
   void free_state(const Random_XorShift64<DeviceType>& state) const {
-    state_(state.state_idx_) = state.state_;
-    locks_(state.state_idx_) = 0;
+    state_(state.state_idx_, 0) = state.state_;
+    locks_(state.state_idx_, 0) = 0;
   }
 };
 
@@ -1092,14 +1121,15 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
 class Random_XorShift1024_Pool {
  private:
   using execution_space = typename DeviceType::execution_space;
-  using locks_type      = View<int*, execution_space>;
-  using int_view_type   = View<int*, DeviceType>;
+  using locks_type      = View<int**, execution_space>;
+  using int_view_type   = View<int**, DeviceType>;
   using state_data_type = View<uint64_t * [16], DeviceType>;
 
   locks_type locks_;
   state_data_type state_;
   int_view_type p_;
   int num_states_;
+  int padding_;
   friend class Random_XorShift1024<DeviceType>;
 
  public:
@@ -1129,15 +1159,21 @@ class Random_XorShift1024_Pool {
     state_      = src.state_;
     p_          = src.p_;
     num_states_ = src.num_states_;
+    padding_    = src.padding_;
     return *this;
   }
 
   inline void init(uint64_t seed, int num_states) {
     if (seed == 0) seed = uint64_t(1318319);
+    // I only want to pad on CPU like archs (less than 1000 threads). 64 is a
+    // magic number, or random number I just wanted something not too large and
+    // not too small. 64 sounded fine.
+    padding_    = num_states < 1000 ? 64 : 1;
     num_states_ = num_states;
-    locks_      = locks_type("Kokkos::Random_XorShift1024::locks", num_states_);
+    locks_ =
+        locks_type("Kokkos::Random_XorShift1024::locks", num_states_, padding_);
     state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_);
-    p_     = int_view_type("Kokkos::Random_XorShift1024::p", num_states_);
+    p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_, padding_);
 
     typename state_data_type::HostMirror h_state = create_mirror_view(state_);
     typename locks_type::HostMirror h_lock       = create_mirror_view(locks_);
@@ -1158,8 +1194,8 @@ class Random_XorShift1024_Pool {
                         (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
                         (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
       }
-      h_p(i)    = 0;
-      h_lock(i) = 0;
+      h_p(i, 0)    = 0;
+      h_lock(i, 0) = 0;
     }
     deep_copy(state_, h_state);
     deep_copy(locks_, h_lock);
@@ -1169,20 +1205,20 @@ class Random_XorShift1024_Pool {
   Random_XorShift1024<DeviceType> get_state() const {
     const int i =
         Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
-    return Random_XorShift1024<DeviceType>(state_, p_(i), i);
+    return Random_XorShift1024<DeviceType>(state_, p_(i, 0), i);
   };
 
   // NOTE: state_idx MUST be unique and less than num_states
   KOKKOS_INLINE_FUNCTION
   Random_XorShift1024<DeviceType> get_state(const int state_idx) const {
-    return Random_XorShift1024<DeviceType>(state_, p_(state_idx), state_idx);
+    return Random_XorShift1024<DeviceType>(state_, p_(state_idx, 0), state_idx);
   }
 
   KOKKOS_INLINE_FUNCTION
   void free_state(const Random_XorShift1024<DeviceType>& state) const {
     for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
-    p_(state.state_idx_)     = state.p_;
-    locks_(state.state_idx_) = 0;
+    p_(state.state_idx_, 0)     = state.p_;
+    locks_(state.state_idx_, 0) = 0;
   }
 };
 
diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
index d17c02776f..9c2e8b978b 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -206,8 +206,10 @@ class BinSort {
   //----------------------------------------
   // Constructor: takes the keys, the binning_operator and optionally whether to
   // sort within bins (default false)
-  BinSort(const_key_view_type keys_, int range_begin_, int range_end_,
-          BinSortOp bin_op_, bool sort_within_bins_ = false)
+  template <typename ExecutionSpace>
+  BinSort(const ExecutionSpace& exec, const_key_view_type keys_,
+          int range_begin_, int range_end_, BinSortOp bin_op_,
+          bool sort_within_bins_ = false)
       : keys(keys_),
         keys_rnd(keys_),
         bin_op(bin_op_),
@@ -222,50 +224,63 @@ class BinSort {
         "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins());
     bin_count_const = bin_count_atomic;
     bin_offsets =
-        offset_type(view_alloc(WithoutInitializing,
+        offset_type(view_alloc(exec, WithoutInitializing,
                                "Kokkos::SortImpl::BinSortFunctor::bin_offsets"),
                     bin_op.max_bins());
     sort_order =
-        offset_type(view_alloc(WithoutInitializing,
+        offset_type(view_alloc(exec, WithoutInitializing,
                                "Kokkos::SortImpl::BinSortFunctor::sort_order"),
                     range_end - range_begin);
   }
 
+  BinSort(const_key_view_type keys_, int range_begin_, int range_end_,
+          BinSortOp bin_op_, bool sort_within_bins_ = false)
+      : BinSort(execution_space{}, keys_, range_begin_, range_end_, bin_op_,
+                sort_within_bins_) {}
+
+  template <typename ExecutionSpace>
+  BinSort(const ExecutionSpace& exec, const_key_view_type keys_,
+          BinSortOp bin_op_, bool sort_within_bins_ = false)
+      : BinSort(exec, keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {}
+
   BinSort(const_key_view_type keys_, BinSortOp bin_op_,
           bool sort_within_bins_ = false)
-      : BinSort(keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {}
+      : BinSort(execution_space{}, keys_, bin_op_, sort_within_bins_) {}
 
   //----------------------------------------
   // Create the permutation vector, the bin_offset array and the bin_count
   // array. Can be called again if keys changed
-  void create_permute_vector() {
+  template <class ExecutionSpace = execution_space>
+  void create_permute_vector(const ExecutionSpace& exec = execution_space{}) {
     const size_t len = range_end - range_begin;
     Kokkos::parallel_for(
         "Kokkos::Sort::BinCount",
-        Kokkos::RangePolicy<execution_space, bin_count_tag>(0, len), *this);
+        Kokkos::RangePolicy<ExecutionSpace, bin_count_tag>(exec, 0, len),
+        *this);
     Kokkos::parallel_scan("Kokkos::Sort::BinOffset",
-                          Kokkos::RangePolicy<execution_space, bin_offset_tag>(
-                              0, bin_op.max_bins()),
+                          Kokkos::RangePolicy<ExecutionSpace, bin_offset_tag>(
+                              exec, 0, bin_op.max_bins()),
                           *this);
 
-    Kokkos::deep_copy(bin_count_atomic, 0);
+    Kokkos::deep_copy(exec, bin_count_atomic, 0);
     Kokkos::parallel_for(
         "Kokkos::Sort::BinBinning",
-        Kokkos::RangePolicy<execution_space, bin_binning_tag>(0, len), *this);
+        Kokkos::RangePolicy<ExecutionSpace, bin_binning_tag>(exec, 0, len),
+        *this);
 
     if (sort_within_bins)
       Kokkos::parallel_for(
           "Kokkos::Sort::BinSort",
-          Kokkos::RangePolicy<execution_space, bin_sort_bins_tag>(
-              0, bin_op.max_bins()),
+          Kokkos::RangePolicy<ExecutionSpace, bin_sort_bins_tag>(
+              exec, 0, bin_op.max_bins()),
           *this);
   }
 
   // Sort a subset of a view with respect to the first dimension using the
   // permutation array
-  template <class ValuesViewType>
-  void sort(ValuesViewType const& values, int values_range_begin,
-            int values_range_end) const {
+  template <class ExecutionSpace, class ValuesViewType>
+  void sort(const ExecutionSpace& exec, ValuesViewType const& values,
+            int values_range_begin, int values_range_end) const {
     using scratch_view_type =
         Kokkos::View<typename ValuesViewType::data_type,
                      typename ValuesViewType::array_layout,
@@ -279,7 +294,7 @@ class BinSort {
     }
 
     scratch_view_type sorted_values(
-        view_alloc(WithoutInitializing,
+        view_alloc(exec, WithoutInitializing,
                    "Kokkos::SortImpl::BinSortFunctor::sorted_values"),
         values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
         values.rank_dynamic > 1 ? values.extent(1)
@@ -308,7 +323,7 @@ class BinSort {
                   values_range_begin - range_begin);
 
       parallel_for("Kokkos::Sort::CopyPermute",
-                   Kokkos::RangePolicy<execution_space>(0, len), functor);
+                   Kokkos::RangePolicy<ExecutionSpace>(exec, 0, len), functor);
     }
 
     {
@@ -316,10 +331,23 @@ class BinSort {
           values, range_begin, sorted_values);
 
       parallel_for("Kokkos::Sort::Copy",
-                   Kokkos::RangePolicy<execution_space>(0, len), functor);
+                   Kokkos::RangePolicy<ExecutionSpace>(exec, 0, len), functor);
     }
+  }
 
-    execution_space().fence();
+  // Sort a subset of a view with respect to the first dimension using the
+  // permutation array
+  template <class ValuesViewType>
+  void sort(ValuesViewType const& values, int values_range_begin,
+            int values_range_end) const {
+    execution_space exec;
+    sort(exec, values, values_range_begin, values_range_end);
+    exec.fence("Kokkos::Sort: fence after sorting");
+  }
+
+  template <class ExecutionSpace, class ValuesViewType>
+  void sort(ExecutionSpace const& exec, ValuesViewType const& values) const {
+    this->sort(exec, values, 0, /*values.extent(0)*/ range_end - range_begin);
   }
 
   template <class ValuesViewType>
@@ -485,17 +513,19 @@ struct BinOp3D {
 
 namespace Impl {
 
-template <class ViewType>
-bool try_std_sort(ViewType view) {
+template <class ViewType, class ExecutionSpace>
+bool try_std_sort(ViewType view, const ExecutionSpace& exec) {
   bool possible    = true;
   size_t stride[8] = {view.stride_0(), view.stride_1(), view.stride_2(),
                       view.stride_3(), view.stride_4(), view.stride_5(),
                       view.stride_6(), view.stride_7()};
   possible         = possible &&
-             std::is_same<typename ViewType::memory_space, HostSpace>::value;
+             SpaceAccessibility<HostSpace,
+                                typename ViewType::memory_space>::accessible;
   possible = possible && (ViewType::Rank == 1);
   possible = possible && (stride[0] == 1);
   if (possible) {
+    exec.fence("Kokkos::sort: Fence before sorting on the host");
     std::sort(view.data(), view.data() + view.extent(0));
   }
   return possible;
@@ -518,10 +548,12 @@ struct min_max_functor {
 
 }  // namespace Impl
 
-template <class ViewType>
-void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
+template <class ExecutionSpace, class ViewType>
+std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
+    const ExecutionSpace& exec, ViewType const& view,
+    bool const always_use_kokkos_sort = false) {
   if (!always_use_kokkos_sort) {
-    if (Impl::try_std_sort(view)) return;
+    if (Impl::try_std_sort(view, exec)) return;
   }
   using CompType = BinOp1D<ViewType>;
 
@@ -529,34 +561,50 @@ void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
   Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
   parallel_reduce("Kokkos::Sort::FindExtent",
                   Kokkos::RangePolicy<typename ViewType::execution_space>(
-                      0, view.extent(0)),
+                      exec, 0, view.extent(0)),
                   Impl::min_max_functor<ViewType>(view), reducer);
   if (result.min_val == result.max_val) return;
   BinSort<ViewType, CompType> bin_sort(
       view, CompType(view.extent(0) / 2, result.min_val, result.max_val), true);
-  bin_sort.create_permute_vector();
-  bin_sort.sort(view);
+  bin_sort.create_permute_vector(exec);
+  bin_sort.sort(exec, view);
 }
 
 template <class ViewType>
-void sort(ViewType view, size_t const begin, size_t const end) {
+void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
+  typename ViewType::execution_space exec;
+  sort(exec, view, always_use_kokkos_sort);
+  exec.fence("Kokkos::Sort: fence after sorting");
+}
+
+template <class ExecutionSpace, class ViewType>
+std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
+    const ExecutionSpace& exec, ViewType view, size_t const begin,
+    size_t const end) {
   using range_policy = Kokkos::RangePolicy<typename ViewType::execution_space>;
   using CompType     = BinOp1D<ViewType>;
 
   Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
   Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
 
-  parallel_reduce("Kokkos::Sort::FindExtent", range_policy(begin, end),
+  parallel_reduce("Kokkos::Sort::FindExtent", range_policy(exec, begin, end),
                   Impl::min_max_functor<ViewType>(view), reducer);
 
   if (result.min_val == result.max_val) return;
 
   BinSort<ViewType, CompType> bin_sort(
-      view, begin, end,
+      exec, view, begin, end,
       CompType((end - begin) / 2, result.min_val, result.max_val), true);
 
-  bin_sort.create_permute_vector();
-  bin_sort.sort(view, begin, end);
+  bin_sort.create_permute_vector(exec);
+  bin_sort.sort(exec, view, begin, end);
+}
+
+template <class ViewType>
+void sort(ViewType view, size_t const begin, size_t const end) {
+  typename ViewType::execution_space exec;
+  sort(exec, view, begin, end);
+  exec.fence("Kokkos::Sort: fence after sorting");
 }
 
 }  // namespace Kokkos
diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
index c37e779c99..3dffce7df4 100644
--- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
+++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -47,7 +47,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <cstdio>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <cmath>
@@ -198,11 +198,50 @@ struct test_random_functor {
           static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp2 / theMax);
       const uint64_t ind3_3d =
           static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp3 / theMax);
-
+// Workaround Intel 17 compiler bug which sometimes add random
+// instruction alignment which makes the lock instruction
+// illegal. Seems to be mostly just for unsigned int atomics.
+// Looking at the assembly the compiler
+// appears to insert cache line alignment for the instruction.
+// Isn't restricted to specific archs. Seen it on SNB and SKX, but for
+// different code. Another occurrence was with Desul atomics in
+// a different unit test. This one here happens without desul atomics.
+// Inserting an assembly nop instruction changes the alignment and
+// works round this.
+//
+// 17.0.4 for 64bit Random works with 1/1/1/2/1
+// 17.0.4 for 1024bit Random works with 1/1/1/1/1
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_1d(ind1_1d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_1d(ind2_1d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_1d(ind3_1d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      if (std::is_same<rnd_type, Kokkos::Random_XorShift64<device_type>>::value)
+        asm volatile("nop\n");
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_3d(ind1_3d, ind2_3d, ind3_3d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
     }
     rand_pool.free_state(rand_gen);
   }
@@ -338,9 +377,11 @@ struct test_random_scalar {
       using functor_type =
           test_histogram1d_functor<typename RandomGenerator::device_type>;
       parallel_reduce(HIST_DIM1D, functor_type(density_1d, num_draws), result);
-
-      double tolerance   = 6 * std::sqrt(1.0 / HIST_DIM1D);
-      double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D;
+      double mean_eps_expect       = 0.0001;
+      double variance_eps_expect   = 0.07;
+      double covariance_eps_expect = 0.06;
+      double tolerance             = 6 * std::sqrt(1.0 / HIST_DIM1D);
+      double mean_expect           = 1.0 * num_draws * 3 / HIST_DIM1D;
       double variance_expect =
           1.0 * num_draws * 3 / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D);
       double covariance_expect = -1.0 * num_draws * 3 / HIST_DIM1D / HIST_DIM1D;
@@ -349,11 +390,26 @@ struct test_random_scalar {
           variance_expect / (result.variance / HIST_DIM1D) - 1.0;
       double covariance_eps =
           (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect;
-      pass_hist1d_mean = ((-0.0001 < mean_eps) && (0.0001 > mean_eps)) ? 1 : 0;
-      pass_hist1d_var =
-          ((-0.07 < variance_eps) && (0.07 > variance_eps)) ? 1 : 0;
-      pass_hist1d_covar =
-          ((-0.06 < covariance_eps) && (0.06 > covariance_eps)) ? 1 : 0;
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+      if (std::is_same<Scalar, Kokkos::Experimental::half_t>::value) {
+        mean_eps_expect       = 0.0003;
+        variance_eps_expect   = 1.0;
+        covariance_eps_expect = 5.0e4;
+      }
+#endif
+
+      pass_hist1d_mean =
+          ((-mean_eps_expect < mean_eps) && (mean_eps_expect > mean_eps)) ? 1
+                                                                          : 0;
+      pass_hist1d_var = ((-variance_eps_expect < variance_eps) &&
+                         (variance_eps_expect > variance_eps))
+                            ? 1
+                            : 0;
+      pass_hist1d_covar = ((-covariance_eps_expect < covariance_eps) &&
+                           (covariance_eps_expect > covariance_eps))
+                              ? 1
+                              : 0;
 
       cout << "Density 1D: " << mean_eps << " " << variance_eps << " "
            << (result.covariance / HIST_DIM1D / HIST_DIM1D) << " || "
@@ -371,8 +427,9 @@ struct test_random_scalar {
           test_histogram3d_functor<typename RandomGenerator::device_type>;
       parallel_reduce(HIST_DIM1D, functor_type(density_3d, num_draws), result);
 
-      double tolerance   = 6 * std::sqrt(1.0 / HIST_DIM1D);
-      double mean_expect = 1.0 * num_draws / HIST_DIM1D;
+      double variance_factor = 1.2;
+      double tolerance       = 6 * std::sqrt(1.0 / HIST_DIM1D);
+      double mean_expect     = 1.0 * num_draws / HIST_DIM1D;
       double variance_expect =
           1.0 * num_draws / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D);
       double covariance_expect = -1.0 * num_draws / HIST_DIM1D / HIST_DIM1D;
@@ -381,15 +438,23 @@ struct test_random_scalar {
           variance_expect / (result.variance / HIST_DIM1D) - 1.0;
       double covariance_eps =
           (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect;
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+      if (std::is_same<Scalar, Kokkos::Experimental::half_t>::value) {
+        variance_factor = 7;
+      }
+#endif
+
       pass_hist3d_mean =
           ((-tolerance < mean_eps) && (tolerance > mean_eps)) ? 1 : 0;
-      pass_hist3d_var = ((-1.2 * tolerance < variance_eps) &&
-                         (1.2 * tolerance > variance_eps))
+      pass_hist3d_var = ((-variance_factor * tolerance < variance_eps) &&
+                         (variance_factor * tolerance > variance_eps))
                             ? 1
                             : 0;
-      pass_hist3d_covar =
-          ((-tolerance < covariance_eps) && (tolerance > covariance_eps)) ? 1
-                                                                          : 0;
+      pass_hist3d_covar = ((-variance_factor * tolerance < covariance_eps) &&
+                           (variance_factor * tolerance > covariance_eps))
+                              ? 1
+                              : 0;
 
       cout << "Density 3D: " << mean_eps << " " << variance_eps << " "
            << result.covariance / HIST_DIM1D / HIST_DIM1D << " || " << tolerance
@@ -471,6 +536,21 @@ void test_random(unsigned int num_draws) {
   deep_copy(density_1d, 0);
   deep_copy(density_3d, 0);
 
+  cout << "Test Scalar=half" << endl;
+  test_random_scalar<RandomGenerator, Kokkos::Experimental::half_t> test_half(
+      density_1d, density_3d, pool, num_draws);
+  ASSERT_EQ(test_half.pass_mean, 1);
+  ASSERT_EQ(test_half.pass_var, 1);
+  ASSERT_EQ(test_half.pass_covar, 1);
+  ASSERT_EQ(test_half.pass_hist1d_mean, 1);
+  ASSERT_EQ(test_half.pass_hist1d_var, 1);
+  ASSERT_EQ(test_half.pass_hist1d_covar, 1);
+  ASSERT_EQ(test_half.pass_hist3d_mean, 1);
+  ASSERT_EQ(test_half.pass_hist3d_var, 1);
+  ASSERT_EQ(test_half.pass_hist3d_covar, 1);
+  deep_copy(density_1d, 0);
+  deep_copy(density_3d, 0);
+
   cout << "Test Scalar=float" << endl;
   test_random_scalar<RandomGenerator, float> test_float(density_1d, density_3d,
                                                         pool, num_draws);
diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
index 9c6308c843..de1e6b3c31 100644
--- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp
+++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
@@ -135,8 +135,9 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) {
   KeyViewType keys("Keys", n);
 
   // Test sorting array with all numbers equal
-  Kokkos::deep_copy(keys, KeyType(1));
-  Kokkos::sort(keys, force_kokkos);
+  ExecutionSpace exec;
+  Kokkos::deep_copy(exec, keys, KeyType(1));
+  Kokkos::sort(exec, keys, force_kokkos);
 
   Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
   Kokkos::fill_random(keys, g,
@@ -147,13 +148,16 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) {
   double sum_after        = 0.0;
   unsigned int sort_fails = 0;
 
-  Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys), sum_before);
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
+                          sum<ExecutionSpace, KeyType>(keys), sum_before);
 
-  Kokkos::sort(keys, force_kokkos);
+  Kokkos::sort(exec, keys, force_kokkos);
 
-  Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys), sum_after);
-  Kokkos::parallel_reduce(
-      n - 1, is_sorted_struct<ExecutionSpace, KeyType>(keys), sort_fails);
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
+                          sum<ExecutionSpace, KeyType>(keys), sum_after);
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n - 1),
+                          is_sorted_struct<ExecutionSpace, KeyType>(keys),
+                          sort_fails);
 
   double ratio   = sum_before / sum_after;
   double epsilon = 1e-10;
@@ -177,8 +181,10 @@ void test_3D_sort_impl(unsigned int n) {
   double sum_after        = 0.0;
   unsigned int sort_fails = 0;
 
-  Kokkos::parallel_reduce(keys.extent(0), sum3D<ExecutionSpace, KeyType>(keys),
-                          sum_before);
+  ExecutionSpace exec;
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0)),
+      sum3D<ExecutionSpace, KeyType>(keys), sum_before);
 
   int bin_1d = 1;
   while (bin_1d * bin_1d * bin_1d * 4 < (int)keys.extent(0)) bin_1d *= 2;
@@ -189,15 +195,17 @@ void test_3D_sort_impl(unsigned int n) {
   using BinOp = Kokkos::BinOp3D<KeyViewType>;
   BinOp bin_op(bin_max, min, max);
   Kokkos::BinSort<KeyViewType, BinOp> Sorter(keys, bin_op, false);
-  Sorter.create_permute_vector();
-  Sorter.template sort<KeyViewType>(keys);
+  Sorter.create_permute_vector(exec);
+  Sorter.sort(exec, keys);
 
-  Kokkos::parallel_reduce(keys.extent(0), sum3D<ExecutionSpace, KeyType>(keys),
-                          sum_after);
-  Kokkos::parallel_reduce(keys.extent(0) - 1,
-                          bin3d_is_sorted_struct<ExecutionSpace, KeyType>(
-                              keys, bin_1d, min[0], max[0]),
-                          sort_fails);
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0)),
+      sum3D<ExecutionSpace, KeyType>(keys), sum_after);
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0) - 1),
+      bin3d_is_sorted_struct<ExecutionSpace, KeyType>(keys, bin_1d, min[0],
+                                                      max[0]),
+      sort_fails);
 
   double ratio   = sum_before / sum_after;
   double epsilon = 1e-10;
@@ -229,36 +237,36 @@ void test_dynamic_view_sort_impl(unsigned int n) {
   KeyViewType keys_view("KeysTmp", n);
 
   // Test sorting array with all numbers equal
-  Kokkos::deep_copy(keys_view, KeyType(1));
+  ExecutionSpace exec;
+  Kokkos::deep_copy(exec, keys_view, KeyType(1));
   Kokkos::deep_copy(keys, keys_view);
-  Kokkos::sort(keys, 0 /* begin */, n /* end */);
+  Kokkos::sort(exec, keys, 0 /* begin */, n /* end */);
 
   Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
   Kokkos::fill_random(keys_view, g,
                       Kokkos::Random_XorShift64_Pool<
                           ExecutionSpace>::generator_type::MAX_URAND);
 
-  ExecutionSpace().fence();
+  exec.fence();
   Kokkos::deep_copy(keys, keys_view);
-  // ExecutionSpace().fence();
 
   double sum_before       = 0.0;
   double sum_after        = 0.0;
   unsigned int sort_fails = 0;
 
-  Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys_view),
-                          sum_before);
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
+                          sum<ExecutionSpace, KeyType>(keys_view), sum_before);
 
-  Kokkos::sort(keys, 0 /* begin */, n /* end */);
+  Kokkos::sort(exec, keys, 0 /* begin */, n /* end */);
 
-  ExecutionSpace().fence();  // Need this fence to prevent BusError with Cuda
+  exec.fence();  // Need this fence to prevent BusError with Cuda
   Kokkos::deep_copy(keys_view, keys);
-  // ExecutionSpace().fence();
 
-  Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys_view),
-                          sum_after);
-  Kokkos::parallel_reduce(
-      n - 1, is_sorted_struct<ExecutionSpace, KeyType>(keys_view), sort_fails);
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
+                          sum<ExecutionSpace, KeyType>(keys_view), sum_after);
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n - 1),
+                          is_sorted_struct<ExecutionSpace, KeyType>(keys_view),
+                          sort_fails);
 
   double ratio   = sum_before / sum_after;
   double epsilon = 1e-10;
@@ -301,9 +309,10 @@ void test_issue_1160_impl() {
   for (int i = 0; i < 10; ++i) {
     h_v.access(i, 0) = h_x.access(i, 0) = double(h_element(i));
   }
-  Kokkos::deep_copy(element_, h_element);
-  Kokkos::deep_copy(x_, h_x);
-  Kokkos::deep_copy(v_, h_v);
+  ExecutionSpace exec;
+  Kokkos::deep_copy(exec, element_, h_element);
+  Kokkos::deep_copy(exec, x_, h_x);
+  Kokkos::deep_copy(exec, v_, h_v);
 
   using KeyViewType = decltype(element_);
   using BinOp       = Kokkos::BinOp1D<KeyViewType>;
@@ -316,15 +325,16 @@ void test_issue_1160_impl() {
 
   Kokkos::BinSort<KeyViewType, BinOp> Sorter(element_, begin, end, binner,
                                              false);
-  Sorter.create_permute_vector();
-  Sorter.sort(element_, begin, end);
+  Sorter.create_permute_vector(exec);
+  Sorter.sort(exec, element_, begin, end);
 
-  Sorter.sort(x_, begin, end);
-  Sorter.sort(v_, begin, end);
+  Sorter.sort(exec, x_, begin, end);
+  Sorter.sort(exec, v_, begin, end);
 
-  Kokkos::deep_copy(h_element, element_);
-  Kokkos::deep_copy(h_x, x_);
-  Kokkos::deep_copy(h_v, v_);
+  Kokkos::deep_copy(exec, h_element, element_);
+  Kokkos::deep_copy(exec, h_x, x_);
+  Kokkos::deep_copy(exec, h_v, v_);
+  exec.fence();
 
   ASSERT_EQ(h_element(0), 9);
   ASSERT_EQ(h_element(1), 8);
diff --git a/lib/kokkos/appveyor.yml b/lib/kokkos/appveyor.yml
index e8763c0b66..73a0d31875 100644
--- a/lib/kokkos/appveyor.yml
+++ b/lib/kokkos/appveyor.yml
@@ -3,4 +3,8 @@ image:
 clone_folder: c:\projects\source
 build_script:
 - cmd: >-
-    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake
+    mkdir build &&
+    cd build &&
+    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_3=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF &&
+    cmake --build . --target install &&
+    ctest -C Debug --output-on-failure
diff --git a/lib/kokkos/benchmarks/atomic/main.cpp b/lib/kokkos/benchmarks/atomic/main.cpp
index 7b5caa1aee..cc0d3e41e8 100644
--- a/lib/kokkos/benchmarks/atomic/main.cpp
+++ b/lib/kokkos/benchmarks/atomic/main.cpp
@@ -1,12 +1,12 @@
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_Random.hpp>
 
 template <class Scalar>
 double test_atomic(int L, int N, int M, int K, int R,
                    Kokkos::View<const int*> offsets) {
   Kokkos::View<Scalar*> output("Output", N);
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 
   for (int r = 0; r < R; r++)
     Kokkos::parallel_for(
@@ -28,7 +28,7 @@ template <class Scalar>
 double test_no_atomic(int L, int N, int M, int K, int R,
                       Kokkos::View<const int*> offsets) {
   Kokkos::View<Scalar*> output("Output", N);
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   for (int r = 0; r < R; r++)
     Kokkos::parallel_for(
         L, KOKKOS_LAMBDA(const int& i) {
diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp
index 62d7ef4a4c..4fc6ca2c68 100644
--- a/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 template <class Scalar, int Unroll, int Stride>
 struct Run {
diff --git a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp
index 6da2407a08..75f30a3409 100644
--- a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <bench.hpp>
 #include <cstdlib>
 
diff --git a/lib/kokkos/benchmarks/gather/main.cpp b/lib/kokkos/benchmarks/gather/main.cpp
index 5f10e4dcc1..dd502faaa4 100644
--- a/lib/kokkos/benchmarks/gather/main.cpp
+++ b/lib/kokkos/benchmarks/gather/main.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <gather.hpp>
 #include <cstdlib>
 
diff --git a/lib/kokkos/benchmarks/stream/stream-kokkos.cpp b/lib/kokkos/benchmarks/stream/stream-kokkos.cpp
index e7ef67e080..311947c197 100644
--- a/lib/kokkos/benchmarks/stream/stream-kokkos.cpp
+++ b/lib/kokkos/benchmarks/stream/stream-kokkos.cpp
@@ -52,35 +52,33 @@
 
 #define HLINE "-------------------------------------------------------------\n"
 
-#if defined(KOKKOS_ENABLE_CUDA)
-using StreamHostArray   = Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror;
-using StreamDeviceArray = Kokkos::View<double*, Kokkos::CudaSpace>;
-#else
-using StreamHostArray   = Kokkos::View<double*, Kokkos::HostSpace>::HostMirror;
-using StreamDeviceArray = Kokkos::View<double*, Kokkos::HostSpace>;
-#endif
+using StreamDeviceArray =
+    Kokkos::View<double*, Kokkos::MemoryTraits<Kokkos::Restrict>>;
+using StreamHostArray = typename StreamDeviceArray::HostMirror;
 
 using StreamIndex = int;
+using Policy      = Kokkos::RangePolicy<Kokkos::IndexType<StreamIndex>>;
 
-double now() {
-  struct timeval now;
-  gettimeofday(&now, nullptr);
-
-  return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
-}
-
-void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b,
-                  StreamDeviceArray& c) {
+void perform_set(StreamDeviceArray& a, const double scalar) {
   Kokkos::parallel_for(
-      "copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i]; });
+      "set", Policy(0, a.extent(0)),
+      KOKKOS_LAMBDA(const StreamIndex i) { a[i] = scalar; });
 
   Kokkos::fence();
 }
 
-void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
-                   StreamDeviceArray& c, const double scalar) {
+void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b) {
   Kokkos::parallel_for(
-      "copy", a.extent(0),
+      "copy", Policy(0, a.extent(0)),
+      KOKKOS_LAMBDA(const StreamIndex i) { b[i] = a[i]; });
+
+  Kokkos::fence();
+}
+
+void perform_scale(StreamDeviceArray& b, StreamDeviceArray& c,
+                   const double scalar) {
+  Kokkos::parallel_for(
+      "scale", Policy(0, b.extent(0)),
       KOKKOS_LAMBDA(const StreamIndex i) { b[i] = scalar * c[i]; });
 
   Kokkos::fence();
@@ -89,7 +87,7 @@ void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
 void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
                  StreamDeviceArray& c) {
   Kokkos::parallel_for(
-      "add", a.extent(0),
+      "add", Policy(0, a.extent(0)),
       KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i] + b[i]; });
 
   Kokkos::fence();
@@ -98,7 +96,7 @@ void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
 void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b,
                    StreamDeviceArray& c, const double scalar) {
   Kokkos::parallel_for(
-      "triad", a.extent(0),
+      "triad", Policy(0, a.extent(0)),
       KOKKOS_LAMBDA(const StreamIndex i) { a[i] = b[i] + scalar * c[i]; });
 
   Kokkos::fence();
@@ -184,6 +182,7 @@ int run_benchmark() {
 
   const double scalar = 3.0;
 
+  double setTime   = std::numeric_limits<double>::max();
   double copyTime  = std::numeric_limits<double>::max();
   double scaleTime = std::numeric_limits<double>::max();
   double addTime   = std::numeric_limits<double>::max();
@@ -191,13 +190,10 @@ int run_benchmark() {
 
   printf("Initializing Views...\n");
 
-#if defined(KOKKOS_HAVE_OPENMP)
   Kokkos::parallel_for(
-      "init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
-#else
-  Kokkos::parallel_for(
-      "init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
-#endif
+      "init",
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0,
+                                                             STREAM_ARRAY_SIZE),
       KOKKOS_LAMBDA(const int i) {
         a[i] = 1.0;
         b[i] = 2.0;
@@ -209,26 +205,30 @@ int run_benchmark() {
   Kokkos::deep_copy(dev_b, b);
   Kokkos::deep_copy(dev_c, c);
 
-  double start;
-
   printf("Starting benchmarking...\n");
 
+  Kokkos::Timer timer;
+
   for (StreamIndex k = 0; k < STREAM_NTIMES; ++k) {
-    start = now();
-    perform_copy(dev_a, dev_b, dev_c);
-    copyTime = std::min(copyTime, (now() - start));
+    timer.reset();
+    perform_set(dev_c, 1.5);
+    setTime = std::min(setTime, timer.seconds());
 
-    start = now();
-    perform_scale(dev_a, dev_b, dev_c, scalar);
-    scaleTime = std::min(scaleTime, (now() - start));
+    timer.reset();
+    perform_copy(dev_a, dev_c);
+    copyTime = std::min(copyTime, timer.seconds());
 
-    start = now();
+    timer.reset();
+    perform_scale(dev_b, dev_c, scalar);
+    scaleTime = std::min(scaleTime, timer.seconds());
+
+    timer.reset();
     perform_add(dev_a, dev_b, dev_c);
-    addTime = std::min(addTime, (now() - start));
+    addTime = std::min(addTime, timer.seconds());
 
-    start = now();
+    timer.reset();
     perform_triad(dev_a, dev_b, dev_c, scalar);
-    triadTime = std::min(triadTime, (now() - start));
+    triadTime = std::min(triadTime, timer.seconds());
   }
 
   Kokkos::deep_copy(a, dev_a);
@@ -240,6 +240,9 @@ int run_benchmark() {
 
   printf(HLINE);
 
+  printf("Set             %11.2f MB/s\n",
+         (1.0e-06 * 1.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
+             setTime);
   printf("Copy            %11.2f MB/s\n",
          (1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
              copyTime);
diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind
index 6af091a7d8..43f8a745da 100755
--- a/lib/kokkos/bin/hpcbind
+++ b/lib/kokkos/bin/hpcbind
@@ -634,15 +634,15 @@ elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
   > ${HPCBIND_OUT}
   if [[ ${HPCBIND_TEE} -eq 0 ]]; then
     if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
-      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
     else
-      eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
+      eval "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
     fi
   else
     if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
-      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
     else
-      eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+      eval "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
     fi
   fi
 fi
diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper
index 4e52e4d09f..27e7d15b9d 100755
--- a/lib/kokkos/bin/nvcc_wrapper
+++ b/lib/kokkos/bin/nvcc_wrapper
@@ -96,10 +96,10 @@ replace_pragma_ident=0
 first_xcompiler_arg=1
 
 # Allow for setting temp dir without setting TMPDIR in parent (see https://docs.olcf.ornl.gov/systems/summit_user_guide.html#setting-tmpdir-causes-jsm-jsrun-errors-job-state-flip-flop)
-if [[ ! -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then
+if [[ -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then
   temp_dir=${TMPDIR:-/tmp}
 else
-  temp_dir=${NVCC_WRAPPER_TMPDIR+x}
+  temp_dir=${NVCC_WRAPPER_TMPDIR}
 fi
 
 # optimization flag added as a command-line argument
@@ -226,14 +226,14 @@ do
     cuda_args="$cuda_args $1"
     ;;
   #Handle more known nvcc args
-  --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets)
+  --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler)
     cuda_args="$cuda_args $1"
     ;;
   #Handle known nvcc args that have an argument
-  -maxrregcount=*|--maxrregcount=*)
+  -maxrregcount=*|--maxrregcount=*|-time=*)
     cuda_args="$cuda_args $1"
     ;;
-  -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include)
+  -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include|-time)
     cuda_args="$cuda_args $1 $2"
     shift
     ;;
@@ -552,14 +552,14 @@ if [ $host_only -eq 1 ]; then
   $host_command
 elif [ -n "$nvcc_depfile_command" ]; then
   if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
-    echo "$nvcc_command && $nvcc_depfile_command"
+    echo "TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command"
   fi
-  $nvcc_command && $nvcc_depfile_command
+  TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command
 else
   if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
-    echo "$nvcc_command"
+    echo "TMPDIR=${temp_dir} $nvcc_command"
   fi
-  $nvcc_command
+  TMPDIR=${temp_dir} $nvcc_command
 fi
 error_code=$?
 
diff --git a/lib/kokkos/cmake/CTestConfig.cmake.in b/lib/kokkos/cmake/CTestConfig.cmake.in
deleted file mode 100644
index 1f82c0d64d..0000000000
--- a/lib/kokkos/cmake/CTestConfig.cmake.in
+++ /dev/null
@@ -1,91 +0,0 @@
-#----------------------------------------------------------------------------------------#
-#
-#   CTestConfig.cmake template for Kokkos
-#
-#----------------------------------------------------------------------------------------#
-
-#
-#   dash-board related
-#
-set(CTEST_PROJECT_NAME "Kokkos")
-set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC")
-set(CTEST_DROP_METHOD "https")
-set(CTEST_DROP_SITE "cdash.nersc.gov")
-set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}")
-set(CTEST_CDASH_VERSION "1.6")
-set(CTEST_CDASH_QUERY_VERSION TRUE)
-set(CTEST_SUBMIT_RETRY_COUNT "1")
-set(CTEST_SUBMIT_RETRY_DELAY "30")
-
-#
-#   configure/build related
-#
-set(CTEST_BUILD_NAME "@BUILD_NAME@")
-set(CTEST_MODEL "@MODEL@")
-set(CTEST_SITE "@SITE@")
-set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@")
-set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@")
-set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@")
-
-#
-#   configure/build related
-#
-set(CTEST_UPDATE_TYPE "git")
-set(CTEST_UPDATE_VERSION_ONLY ON)
-# set(CTEST_GENERATOR "")
-# set(CTEST_GENERATOR_PLATFORM "")
-
-#
-#   testing related
-#
-set(CTEST_TIMEOUT "7200")
-set(CTEST_TEST_TIMEOUT "7200")
-set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100")
-set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100")
-set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576")
-
-#
-#   coverage related
-#
-set(CTEST_CUSTOM_COVERAGE_EXCLUDE ".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*")
-
-#
-#   commands
-#
-if(NOT "@CHECKOUT_COMMAND@" STREQUAL "")
-    set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@")
-endif()
-set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@")
-set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ @SOURCE_REALDIR@")
-set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ --build @BINARY_REALDIR@ --target @TARGET@")
-if(NOT WIN32)
-    set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@")
-endif()
-set(CTEST_COVERAGE_COMMAND "gcov")
-set(CTEST_MEMORYCHECK_COMMAND "valgrind")
-set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@")
-
-#
-#   various configs
-#
-set(APPEND_VALUE @APPEND@)
-if(APPEND_VALUE)
-    set(APPEND_CTEST APPEND)
-endif()
-
-macro(SET_TEST_PROP VAR)
-    if(NOT "${ARGS}" STREQUAL "")
-        set(${VAR}_CTEST ${VAR} ${ARGN})
-    endif()
-endmacro()
-
-set_test_prop(START           @START@)
-set_test_prop(END             @END@)
-set_test_prop(STRIDE          @STRIDE@)
-set_test_prop(INCLUDE         @INCLUDE@)
-set_test_prop(EXCLUDE         @EXCLUDE@)
-set_test_prop(INCLUDE_LABEL   @INCLUDE_LABEL@)
-set_test_prop(EXCLUDE_LABEL   @EXCLUDE_LABEL@)
-set_test_prop(PARALLEL_LEVEL  @PARALLEL_LEVEL@)
-set_test_prop(STOP_TIME       @STOP_TIME@)
-set_test_prop(COVERAGE_LABELS @LABELS@)
diff --git a/lib/kokkos/cmake/KokkosCI.cmake b/lib/kokkos/cmake/KokkosCI.cmake
deleted file mode 100644
index e8c9af37ad..0000000000
--- a/lib/kokkos/cmake/KokkosCI.cmake
+++ /dev/null
@@ -1,350 +0,0 @@
-cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
-
-message(STATUS "")
-
-get_cmake_property(_cached_vars CACHE_VARIABLES)
-set(KOKKOS_CMAKE_ARGS)
-set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT"
-                       "CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE")
-list(SORT _cached_vars)
-foreach(_var ${_cached_vars})
-    if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES)
-        list(APPEND KOKKOS_CMAKE_ARGS ${_var})
-        if("${_var}" STREQUAL "CMAKE_BUILD_TYPE")
-            set(BUILD_TYPE "${CMAKE_BUILD_TYPE}")
-        endif()
-    endif()
-endforeach()
-
-
-#----------------------------------------------------------------------------------------#
-#
-#   Macros and variables
-#
-#----------------------------------------------------------------------------------------#
-
-macro(CHECK_REQUIRED VAR)
-    if(NOT DEFINED ${VAR})
-        message(FATAL_ERROR "Error! Variable '${VAR}' must be defined")
-    endif()
-endmacro()
-
-# require the build name variable
-CHECK_REQUIRED(BUILD_NAME)
-
-# uses all args
-macro(SET_DEFAULT VAR)
-    if(NOT DEFINED ${VAR})
-        set(${VAR} ${ARGN})
-    endif()
-    # remove these ctest configuration variables from the defines
-    # passed to the Kokkos configuration
-    if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS)
-        list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}")
-    endif()
-endmacro()
-
-# uses first arg -- useful for selecting via priority from multiple
-# potentially defined variables, e.g.:
-#
-#   set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME})
-#
-macro(SET_DEFAULT_ARG1 VAR)
-    if(NOT DEFINED ${VAR})
-        foreach(_ARG ${ARGN})
-            if(NOT "${_ARG}" STREQUAL "")
-                set(${VAR} ${_ARG})
-                break()
-            endif()
-        endforeach()
-    endif()
-    # remove these ctest configuration variables from the defines
-    # passed to the Kokkos configuration
-    if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS)
-        list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}")
-    endif()
-endmacro()
-
-# determine the default working directory
-if(NOT "$ENV{WORKSPACE}" STREQUAL "")
-    set(WORKING_DIR "$ENV{WORKSPACE}")
-else()
-    get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
-endif()
-
-# determine the hostname
-execute_process(COMMAND hostname
-    OUTPUT_VARIABLE HOSTNAME
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}")
-
-# get the number of processors
-include(ProcessorCount)
-ProcessorCount(NUM_PROCESSORS)
-
-# find git
-find_package(Git QUIET)
-if(NOT GIT_EXECUTABLE)
-    unset(GIT_EXECUTABLE CACHE)
-    unset(GIT_EXECUTABLE)
-endif()
-
-function(EXECUTE_GIT_COMMAND VAR)
-    set(${VAR} "" PARENT_SCOPE)
-    execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN}
-        OUTPUT_VARIABLE VAL
-        RESULT_VARIABLE RET
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-        WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
-        ERROR_QUIET)
-    string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}")
-    set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE)
-    if(RET EQUAL 0)
-        set(${VAR} "${VAL}" PARENT_SCOPE)
-    endif()
-endfunction()
-
-# just gets the git branch name if available
-function(GET_GIT_BRANCH_NAME VAR)
-    execute_git_command(GIT_BRANCH branch --show-current)
-    set(_INVALID "%D" "HEAD")
-    if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID)
-        execute_git_command(GIT_BRANCH show -s --format=%D)
-        if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID)
-            execute_git_command(GIT_BRANCH --describe all)
-        endif()
-    endif()
-    #
-    if(GIT_BRANCH)
-        string(REPLACE " " ";" _DESC "${GIT_BRANCH}")
-        # just set it to last one via loop instead of wonky cmake index manip
-        foreach(_ITR ${_DESC})
-            set(GIT_BRANCH "${_ITR}")
-        endforeach()
-        set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE)
-        message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}")
-    endif()
-endfunction()
-
-# just gets the git branch name if available
-function(GET_GIT_AUTHOR_NAME VAR)
-    execute_git_command(GIT_AUTHOR show -s --format=%an)
-    if(GIT_AUTHOR)
-        string(LENGTH "${GIT_AUTHOR}" STRLEN)
-        # if the build name gets too long, this can cause submission errors
-        if(STRLEN GREATER 24)
-            # remove middle initial
-            string(REGEX REPLACE " [A-Z]\. " " " GIT_AUTHOR "${GIT_AUTHOR}")
-            # get first and sur name
-            string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}")
-            string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}")
-            if(S_NAME)
-                set(GIT_AUTHOR "${S_NAME}")
-            elseif(F_NAME)
-                set(GIT_AUTHOR "${F_NAME}")
-            endif()
-        endif()
-        # remove any spaces, quotes, periods, etc.
-        string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}")
-        set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE)
-        message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}")
-    endif()
-endfunction()
-
-# get the name of the branch
-GET_GIT_BRANCH_NAME(GIT_BRANCH)
-# get the name of the author
-GET_GIT_AUTHOR_NAME(GIT_AUTHOR)
-# author, prefer git method for consistency
-SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR})
-# SLUG == owner_name/repo_name
-SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG})
-# branch name
-SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH})
-# pull request number
-SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM})
-# get the event type, e.g. push, pull_request, api, cron, etc.
-SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE})
-
-if("${BRANCH}" STREQUAL "")
-    message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'")
-    message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=<name>")
-endif()
-
-#----------------------------------------------------------------------------------------#
-#
-#   Set default values if not provided on command-line
-#
-#----------------------------------------------------------------------------------------#
-
-SET_DEFAULT(SOURCE_DIR      "${WORKING_DIR}")           # source directory
-SET_DEFAULT(BINARY_DIR      "${WORKING_DIR}/build")     # build directory
-SET_DEFAULT(BUILD_TYPE      "${CMAKE_BUILD_TYPE}")      # Release, Debug, etc.
-SET_DEFAULT(MODEL           "Continuous")               # Continuous, Nightly, or Experimental
-SET_DEFAULT(JOBS            1)                          # number of parallel ctests
-SET_DEFAULT(CTEST_COMMAND   "${CMAKE_CTEST_COMMAND}")   # just in case
-SET_DEFAULT(CTEST_ARGS      "-V --output-on-failure")   # extra arguments when ctest is called
-SET_DEFAULT(GIT_EXECUTABLE  "git")                      # ctest_update
-SET_DEFAULT(TARGET          "all")                      # build target
-SET_DEFAULT_ARG1(SITE       "$ENV{SITE}"
-                            "${HOSTNAME}")              # update site
-SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}"
-                            "${NUM_PROCESSORS}")        # number of parallel compile jobs
-#
-#   The variable below correspond to ctest arguments, i.e. START,END,STRIDE are
-#   '-I START,END,STRIDE'
-#
-SET_DEFAULT(START           "")
-SET_DEFAULT(END             "")
-SET_DEFAULT(STRIDE          "")
-SET_DEFAULT(INCLUDE         "")
-SET_DEFAULT(EXCLUDE         "")
-SET_DEFAULT(INCLUDE_LABEL   "")
-SET_DEFAULT(EXCLUDE_LABEL   "")
-SET_DEFAULT(PARALLEL_LEVEL  "")
-SET_DEFAULT(STOP_TIME       "")
-SET_DEFAULT(LABELS          "")
-SET_DEFAULT(NOTES           "")
-
-# default static build tag for Nightly
-set(BUILD_TAG "${BRANCH}")
-
-if(NOT BUILD_TYPE)
-    # default for kokkos if not specified
-    set(BUILD_TYPE "RelWithDebInfo")
-endif()
-
-# generate dynamic name if continuous or experimental model
-if(NOT "${MODEL}" STREQUAL "Nightly")
-    if(EVENT_TYPE AND PULL_REQUEST_NUM)
-        # e.g. pull_request/123
-        if(AUTHOR)
-            set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}")
-        else()
-            set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}")
-        endif()
-    elseif(SLUG)
-        # e.g. owner_name/repo_name
-        set(BUILD_TAG "${SLUG}")
-    elseif(AUTHOR)
-        set(BUILD_TAG "${AUTHOR}/${BRANCH}")
-    endif()
-    if(EVENT_TYPE AND NOT PULL_REQUEST_NUM)
-        set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}")
-    endif()
-endif()
-
-# unnecessary
-string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}")
-string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}")
-
-message(STATUS "BUILD_TAG: ${BUILD_TAG}")
-
-set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]")
-
-# colons in build name create extra (empty) entries in CDash
-string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}")
-# unnecessary info
-string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}")
-# consistency
-string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}")
-string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}")
-# miscellaneous from missing fields
-string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}")
-string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}")
-
-# check binary directory
-if(EXISTS ${BINARY_DIR})
-    if(NOT IS_DIRECTORY "${BINARY_DIR}")
-        message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!")
-    endif()
-    file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*")
-    if(NOT "${BINARY_DIR_FILES}" STREQUAL "")
-        message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not empty!")
-    endif()
-endif()
-
-get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH)
-get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH)
-
-#----------------------------------------------------------------------------------------#
-#
-#   Generate the CTestConfig.cmake
-#
-#----------------------------------------------------------------------------------------#
-
-set(CONFIG_ARGS)
-foreach(_ARG ${KOKKOS_CMAKE_ARGS})
-    if(NOT "${${_ARG}}" STREQUAL "")
-        get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE)
-        if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED")
-            if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF")
-                set(_ARG_TYPE "BOOL")
-            elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}")
-                set(_ARG_TYPE "FILEPATH")
-            elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}")
-                set(_ARG_TYPE "PATH")
-            elseif(NOT "${${_ARG}}" STREQUAL "")
-                set(_ARG_TYPE "STRING")
-            endif()
-        endif()
-        set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n")
-    endif()
-endforeach()
-
-file(WRITE ${BINARY_REALDIR}/initial-cache.cmake
-"
-set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\")
-${CONFIG_ARGS}
-")
-
-file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO)
-message(STATUS "Initial cache:\n${_CACHE_INFO}")
-
-# initialize the cache
-set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake")
-
-
-# generate the CTestConfig.cmake
-configure_file(
-    ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in
-    ${BINARY_REALDIR}/CTestConfig.cmake
-    @ONLY)
-
-# copy/generate the dashboard script
-configure_file(
-    ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in
-    ${BINARY_REALDIR}/KokkosCTest.cmake
-    @ONLY)
-
-# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake
-execute_process(
-    COMMAND             ${CMAKE_COMMAND} -E touch CTestCustom.cmake
-    WORKING_DIRECTORY   ${BINARY_REALDIR}
-    )
-
-#----------------------------------------------------------------------------------------#
-#
-#   Execute CTest
-#
-#----------------------------------------------------------------------------------------#
-
-message(STATUS "")
-message(STATUS "BUILD_NAME: ${BUILD_NAME}")
-message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...")
-message(STATUS "")
-
-# e.g. -DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV"
-string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}")
-
-execute_process(
-    COMMAND             ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}
-    RESULT_VARIABLE     RET
-    WORKING_DIRECTORY   ${BINARY_REALDIR}
-    )
-
-# ensure that any non-zero result variable gets propagated
-if(NOT RET EQUAL 0)
-    message(FATAL_ERROR "CTest return non-zero exit code: ${RET}")
-endif()
diff --git a/lib/kokkos/cmake/KokkosCTest.cmake.in b/lib/kokkos/cmake/KokkosCTest.cmake.in
deleted file mode 100644
index b6917f3cc1..0000000000
--- a/lib/kokkos/cmake/KokkosCTest.cmake.in
+++ /dev/null
@@ -1,261 +0,0 @@
-cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
-
-if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake")
-    include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake")
-endif()
-
-include(ProcessorCount)
-ProcessorCount(CTEST_PROCESSOR_COUNT)
-
-cmake_policy(SET CMP0009 NEW)
-cmake_policy(SET CMP0011 NEW)
-
-# ---------------------------------------------------------------------------- #
-# -- Commands
-# ---------------------------------------------------------------------------- #
-find_program(CTEST_CMAKE_COMMAND    NAMES cmake)
-find_program(CTEST_UNAME_COMMAND    NAMES uname)
-
-find_program(CTEST_BZR_COMMAND      NAMES bzr)
-find_program(CTEST_CVS_COMMAND      NAMES cvs)
-find_program(CTEST_GIT_COMMAND      NAMES git)
-find_program(CTEST_HG_COMMAND       NAMES hg)
-find_program(CTEST_P4_COMMAND       NAMES p4)
-find_program(CTEST_SVN_COMMAND      NAMES svn)
-
-find_program(VALGRIND_COMMAND       NAMES valgrind)
-find_program(GCOV_COMMAND           NAMES gcov)
-find_program(LCOV_COMMAND           NAMES llvm-cov)
-find_program(MEMORYCHECK_COMMAND    NAMES valgrind )
-
-set(MEMORYCHECK_TYPE Valgrind)
-# set(MEMORYCHECK_TYPE Purify)
-# set(MEMORYCHECK_TYPE BoundsChecker)
-# set(MEMORYCHECK_TYPE ThreadSanitizer)
-# set(MEMORYCHECK_TYPE AddressSanitizer)
-# set(MEMORYCHECK_TYPE LeakSanitizer)
-# set(MEMORYCHECK_TYPE MemorySanitizer)
-# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer)
-set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full")
-
-# ---------------------------------------------------------------------------- #
-# -- Settings
-# ---------------------------------------------------------------------------- #
-## -- Process timeout in seconds
-set(CTEST_TIMEOUT           "7200")
-## -- Set output to English
-set(ENV{LC_MESSAGES}        "en_EN" )
-
-
-# ---------------------------------------------------------------------------- #
-# -- Copy ctest configuration file
-# ---------------------------------------------------------------------------- #
-macro(COPY_CTEST_CONFIG_FILES)
-
-    foreach(_FILE CTestConfig.cmake CTestCustom.cmake)
-
-        # if current directory is not binary or source directory
-        if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND
-           NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}")
-
-            # if file exists in current directory
-            if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE})
-                configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE}
-                    ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY)
-            endif()
-
-        # if source and binary differ
-        elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}")
-
-            # if file exists in source directory but not in binary directory
-            if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND
-               NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE})
-                configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE}
-                    ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY)
-            endif()
-
-        endif()
-    endforeach()
-
-endmacro()
-
-ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}")
-
-message(STATUS "CTEST_MODEL: ${CTEST_MODEL}")
-
-#-------------------------------------------------------------------------#
-# Start
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...")
-message(STATUS "")
-
-ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST}
-    ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY})
-
-
-#-------------------------------------------------------------------------#
-# Config
-#
-copy_ctest_config_files()
-ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}")
-
-
-#-------------------------------------------------------------------------#
-# Update
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...")
-message(STATUS "")
-
-ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}"
-    RETURN_VALUE up_ret)
-
-
-#-------------------------------------------------------------------------#
-# Configure
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...")
-message(STATUS "")
-
-ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}"
-    SOURCE ${CTEST_SOURCE_DIRECTORY}
-    ${APPEND_CTEST}
-    OPTIONS "${CTEST_CONFIGURE_OPTIONS}"
-    RETURN_VALUE config_ret)
-
-
-#-------------------------------------------------------------------------#
-# Echo configure log bc Damien wants to delay merging this PR for eternity
-#
-file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log")
-# should only have one but loop just for safety
-foreach(_LOG ${_configure_log})
-    file(READ ${_LOG} _LOG_MESSAGE)
-    message(STATUS "Configure Log: ${_LOG}")
-    message(STATUS "\n${_LOG_MESSAGE}\n")
-endforeach()
-
-
-#-------------------------------------------------------------------------#
-# Build
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...")
-message(STATUS "")
-
-ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}"
-    ${APPEND_CTEST}
-    RETURN_VALUE build_ret)
-
-
-#-------------------------------------------------------------------------#
-# Echo build log bc Damien wants to delay merging this PR for eternity
-#
-file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log")
-# should only have one but loop just for safety
-foreach(_LOG ${_build_log})
-    file(READ ${_LOG} _LOG_MESSAGE)
-    message(STATUS "Build Log: ${_LOG}")
-    message(STATUS "\n${_LOG_MESSAGE}\n")
-endforeach()
-
-
-#-------------------------------------------------------------------------#
-# Test
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...")
-message(STATUS "")
-
-ctest_test(RETURN_VALUE test_ret
-    ${APPEND_CTEST}
-    ${START_CTEST}
-    ${END_CTEST}
-    ${STRIDE_CTEST}
-    ${INCLUDE_CTEST}
-    ${EXCLUDE_CTEST}
-    ${INCLUDE_LABEL_CTEST}
-    ${EXCLUDE_LABEL_CTEST}
-    ${PARALLEL_LEVEL_CTEST}
-    ${STOP_TIME_CTEST}
-    SCHEDULE_RANDOM OFF)
-
-
-#-------------------------------------------------------------------------#
-# Coverage
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...")
-message(STATUS "")
-
-execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS}
-    WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY}
-    ERROR_QUIET)
-
-ctest_coverage(${APPEND_CTEST}
-    ${CTEST_COVERAGE_LABELS}
-    RETURN_VALUE cov_ret)
-
-
-#-------------------------------------------------------------------------#
-# MemCheck
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK stage...")
-message(STATUS "")
-
-ctest_memcheck(RETURN_VALUE mem_ret
-    ${APPEND_CTEST}
-    ${START_CTEST}
-    ${END_CTEST}
-    ${STRIDE_CTEST}
-    ${INCLUDE_CTEST}
-    ${EXCLUDE_CTEST}
-    ${INCLUDE_LABEL_CTEST}
-    ${EXCLUDE_LABEL_CTEST}
-    ${PARALLEL_LEVEL_CTEST})
-
-
-#-------------------------------------------------------------------------#
-# Submit
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...")
-message(STATUS "")
-
-file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake")
-foreach(_FILE ${NOTE_FILES})
-    message(STATUS "Including CTest notes files: \"${_FILE}\"...")
-    include("${_FILE}")
-endforeach()
-
-# capture submit error so it doesn't fail because of a submission error
-ctest_submit(RETURN_VALUE submit_ret
-    RETRY_COUNT 2
-    RETRY_DELAY 10
-    CAPTURE_CMAKE_ERROR submit_err)
-
-#-------------------------------------------------------------------------#
-# Submit
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})")
-message(STATUS "")
-
-
-#-------------------------------------------------------------------------#
-# Non-zero exit codes for important errors
-#
-if(NOT config_ret EQUAL 0)
-    message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}")
-endif()
-
-if(NOT build_ret EQUAL 0)
-    message(FATAL_ERROR "Error during build! Exit code: ${build_ret}")
-endif()
-
-if(NOT test_ret EQUAL 0)
-    message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}")
-endif()
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
index 3455b0cb42..07baa0a5f0 100644
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@@ -41,6 +41,7 @@
 #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA
 #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
 #cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
+#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
 #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #cmakedefine KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
 #cmakedefine KOKKOS_ENABLE_DEBUG
@@ -49,17 +50,21 @@
 #cmakedefine KOKKOS_ENABLE_COMPILER_WARNINGS
 #cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
 #cmakedefine KOKKOS_ENABLE_TUNING
-#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE
+#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3
+#cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS
 #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS
 #cmakedefine KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK
 #cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN
-#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+#cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION  // deprecated
+#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
 
 /* TPL Settings */
 #cmakedefine KOKKOS_ENABLE_HWLOC
 #cmakedefine KOKKOS_USE_LIBRT
 #cmakedefine KOKKOS_ENABLE_HBWSPACE
 #cmakedefine KOKKOS_ENABLE_LIBDL
+#cmakedefine KOKKOS_ENABLE_LIBQUADMATH
 #cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
 
 #cmakedefine KOKKOS_COMPILER_CUDA_VERSION @KOKKOS_COMPILER_CUDA_VERSION@
@@ -79,6 +84,12 @@
 #cmakedefine KOKKOS_ARCH_POWER8
 #cmakedefine KOKKOS_ARCH_POWER9
 #cmakedefine KOKKOS_ARCH_INTEL_GEN
+#cmakedefine KOKKOS_ARCH_INTEL_DG1
+#cmakedefine KOKKOS_ARCH_INTEL_GEN9
+#cmakedefine KOKKOS_ARCH_INTEL_GEN11
+#cmakedefine KOKKOS_ARCH_INTEL_GEN12LP
+#cmakedefine KOKKOS_ARCH_INTEL_XEHP
+#cmakedefine KOKKOS_ARCH_INTEL_GPU
 #cmakedefine KOKKOS_ARCH_KEPLER
 #cmakedefine KOKKOS_ARCH_KEPLER30
 #cmakedefine KOKKOS_ARCH_KEPLER32
@@ -95,6 +106,7 @@
 #cmakedefine KOKKOS_ARCH_VOLTA70
 #cmakedefine KOKKOS_ARCH_VOLTA72
 #cmakedefine KOKKOS_ARCH_TURING75
+#cmakedefine KOKKOS_ARCH_AMPERE
 #cmakedefine KOKKOS_ARCH_AMPERE80
 #cmakedefine KOKKOS_ARCH_AMPERE86
 #cmakedefine KOKKOS_ARCH_AMD_ZEN
diff --git a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake
index 8d58d96415..0c825c59e0 100644
--- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake
+++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake
@@ -29,7 +29,12 @@ ELSE()
 ENDIF()
 
 include(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA DEFAULT_MSG FOUND_CUDART FOUND_CUDA_DRIVER)
+IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI)
+  SET(KOKKOS_CUDA_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1")
+ELSE()
+  SET(KOKKOS_CUDA_ERROR DEFAULT_MSG)
+ENDIF()
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${KOKKOS_CUDA_ERROR} FOUND_CUDART FOUND_CUDA_DRIVER)
 IF (FOUND_CUDA_DRIVER AND FOUND_CUDART)
   KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE
     LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart
diff --git a/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake
new file mode 100644
index 0000000000..be70b711e0
--- /dev/null
+++ b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake
@@ -0,0 +1 @@
+KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath)
diff --git a/lib/kokkos/cmake/deps/quadmath.cmake b/lib/kokkos/cmake/deps/quadmath.cmake
new file mode 100644
index 0000000000..826f5021d3
--- /dev/null
+++ b/lib/kokkos/cmake/deps/quadmath.cmake
@@ -0,0 +1,46 @@
+# @HEADER
+# ************************************************************************
+#
+#                        Kokkos v. 3.0
+#       Copyright (2020) National Technology & Engineering
+#               Solutions of Sandia, LLC (NTESS).
+#
+# Under the terms of Contract DE-NA0003525 with NTESS,
+# the U.S. Government retains certain rights in this software.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+#
+# ************************************************************************
+# @HEADER
+
+KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath
+  REQUIRED_HEADERS quadmath.h
+  REQUIRED_LIBS_NAMES quadmath
+)
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index e8b85542c6..c4637339f3 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -67,8 +67,13 @@ KOKKOS_ARCH_OPTION(ZEN3            HOST "AMD Zen3 architecture")
 KOKKOS_ARCH_OPTION(VEGA900         GPU  "AMD GPU MI25 GFX900")
 KOKKOS_ARCH_OPTION(VEGA906         GPU  "AMD GPU MI50/MI60 GFX906")
 KOKKOS_ARCH_OPTION(VEGA908         GPU  "AMD GPU MI100 GFX908")
+KOKKOS_ARCH_OPTION(VEGA90A         GPU  "" )
 KOKKOS_ARCH_OPTION(INTEL_GEN       GPU  "Intel GPUs Gen9+")
-
+KOKKOS_ARCH_OPTION(INTEL_DG1       GPU  "Intel Iris XeMAX GPU")
+KOKKOS_ARCH_OPTION(INTEL_GEN9      GPU  "Intel GPU Gen9")
+KOKKOS_ARCH_OPTION(INTEL_GEN11     GPU  "Intel GPU Gen11")
+KOKKOS_ARCH_OPTION(INTEL_GEN12LP   GPU  "Intel GPU Gen12LP")
+KOKKOS_ARCH_OPTION(INTEL_XEHP      GPU  "Intel GPU Xe-HP")
 
 
 IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
@@ -76,6 +81,12 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
     "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic"
     "-Wsign-compare" "-Wtype-limits" "-Wuninitialized")
 
+  # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH
+  IF(Kokkos_ENABLE_LIBQUADMATH)
+    # warning: non-standard suffix on floating constant [-Wpedantic]
+    LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic")
+  ENDIF()
+
   # OpenMPTarget compilers give erroneous warnings about sign comparison in loops
   IF(KOKKOS_ENABLE_OPENMPTARGET)
     LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare")
@@ -86,7 +97,7 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
 
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID CMAKE_CXX_COMPILER_ID
-    PGI         NO-VALUE-SPECIFIED
+    NVHPC       NO-VALUE-SPECIFIED
     GNU         ${GNU_WARNINGS}
     DEFAULT     ${COMMON_WARNINGS}
   )
@@ -158,16 +169,18 @@ ENDIF()
 
 IF (KOKKOS_ARCH_ARMV80)
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8-a
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_ARMV81)
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8.1-a
   )
 ENDIF()
@@ -175,8 +188,9 @@ ENDIF()
 IF (KOKKOS_ARCH_ARMV8_THUNDERX)
   SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8-a -mtune=thunderx
   )
 ENDIF()
@@ -184,23 +198,28 @@ ENDIF()
 IF (KOKKOS_ARCH_ARMV8_THUNDERX2)
   SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_A64FX)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8.2-a+sve
-    Clang -march=armv8.2-a+sve -msve-vector-bits=512
-    GCC -march=armv8.2-a+sve -msve-vector-bits=512
+    Clang   -march=armv8.2-a+sve -msve-vector-bits=512
+    GCC     -march=armv8.2-a+sve -msve-vector-bits=512
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_ZEN)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx2
+    NVHPC   -tp=zen
     DEFAULT -march=znver1 -mtune=znver1
   )
   SET(KOKKOS_ARCH_AMD_ZEN  ON)
@@ -209,7 +228,9 @@ ENDIF()
 
 IF (KOKKOS_ARCH_ZEN2)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx2
+    NVHPC   -tp=zen2
     DEFAULT -march=znver2 -mtune=znver2
   )
   SET(KOKKOS_ARCH_AMD_ZEN2 ON)
@@ -218,7 +239,9 @@ ENDIF()
 
 IF (KOKKOS_ARCH_ZEN3)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx2
+    NVHPC   -tp=zen2
     DEFAULT -march=znver3 -mtune=znver3
   )
   SET(KOKKOS_ARCH_AMD_ZEN3 ON)
@@ -227,8 +250,9 @@ ENDIF()
 
 IF (KOKKOS_ARCH_WSM)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xSSE4.2
-    PGI     -tp=nehalem
+    NVHPC   -tp=px
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -msse4.2
   )
@@ -238,8 +262,9 @@ ENDIF()
 IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX)
   SET(KOKKOS_ARCH_AVX ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx
-    PGI     -tp=sandybridge
+    NVHPC   -tp=sandybridge
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -mavx
   )
@@ -248,8 +273,9 @@ ENDIF()
 IF (KOKKOS_ARCH_HSW)
   SET(KOKKOS_ARCH_AVX2 ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xCORE-AVX2
-    PGI     -tp=haswell
+    NVHPC   -tp=haswell
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=core-avx2 -mtune=core-avx2
   )
@@ -258,8 +284,9 @@ ENDIF()
 IF (KOKKOS_ARCH_BDW)
   SET(KOKKOS_ARCH_AVX2 ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xCORE-AVX2
-    PGI     -tp=haswell
+    NVHPC   -tp=haswell
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm
   )
@@ -269,8 +296,9 @@ IF (KOKKOS_ARCH_KNL)
   #avx512-mic
   SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xMIC-AVX512
-    PGI     NO-VALUE-SPECIFIED
+    NVHPC   -tp=knl
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=knl -mtune=knl
   )
@@ -279,6 +307,7 @@ ENDIF()
 IF (KOKKOS_ARCH_KNC)
   SET(KOKKOS_USE_ISA_KNC ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     DEFAULT -mmic
   )
 ENDIF()
@@ -287,8 +316,9 @@ IF (KOKKOS_ARCH_SKX)
   #avx512-xeon
   SET(KOKKOS_ARCH_AVX512XEON ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xCORE-AVX512
-    PGI     NO-VALUE-SPECIFIED
+    NVHPC   -tp=skylake
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
   )
@@ -304,7 +334,8 @@ ENDIF()
 
 IF (KOKKOS_ARCH_POWER7)
   COMPILER_SPECIFIC_FLAGS(
-    PGI     NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -mcpu=power7 -mtune=power7
   )
   SET(KOKKOS_USE_ISA_POWERPCBE ON)
@@ -312,16 +343,16 @@ ENDIF()
 
 IF (KOKKOS_ARCH_POWER8)
   COMPILER_SPECIFIC_FLAGS(
-    PGI     NO-VALUE-SPECIFIED
-    NVIDIA  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   -tp=pwr8
     DEFAULT -mcpu=power8 -mtune=power8
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_POWER9)
   COMPILER_SPECIFIC_FLAGS(
-    PGI     NO-VALUE-SPECIFIED
-    NVIDIA  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   -tp=pwr9
     DEFAULT -mcpu=power9 -mtune=power9
   )
 ENDIF()
@@ -368,7 +399,7 @@ ENDIF()
 
 IF (KOKKOS_ENABLE_SYCL)
   COMPILER_SPECIFIC_FLAGS(
-    DEFAULT -fsycl
+    DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int
   )
   COMPILER_SPECIFIC_OPTIONS(
     DEFAULT -fsycl-unnamed-lambda
@@ -443,20 +474,58 @@ ENDFUNCTION()
 CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25
 CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60
 CHECK_AMDGPU_ARCH(VEGA908 gfx908)
+CHECK_AMDGPU_ARCH(VEGA90A gfx90a)
 
 IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED)
-  MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
-                     "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
+  IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
+    FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator)
+    EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS)
+    STRING(LENGTH "${GPU_ARCHS}" len_str)
+    # enumerator always output gfx000 as the first line
+    IF(${len_str} LESS 8)
+      MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
+                         "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
+    ENDIF()
+  ELSE()
+    MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
+                       "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
+  ENDIF()
+ENDIF()
+
+MACRO(CHECK_MULTIPLE_INTEL_ARCH)
+  IF(KOKKOS_ARCH_INTEL_GPU)
+    MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!")
+  ENDIF()
+  SET(KOKKOS_ARCH_INTEL_GPU ON)
+ENDMACRO()
+
+IF(KOKKOS_ARCH_INTEL_GEN)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_DG1)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_GEN9)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_GEN11)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_GEN12LP)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_XEHP)
+  CHECK_MULTIPLE_INTEL_ARCH()
 ENDIF()
 
 IF (KOKKOS_ENABLE_OPENMPTARGET)
   SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
   IF (CLANG_CUDA_ARCH)
-    STRING(REPLACE "sm_" "cc" PGI_CUDA_ARCH ${CLANG_CUDA_ARCH})
+    STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH})
     COMPILER_SPECIFIC_FLAGS(
       Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64-nvidia-cuda
-      XL -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
-      PGI -gpu=${PGI_CUDA_ARCH}
+      XL    -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
+      NVHPC -gpu=${NVHPC_CUDA_ARCH}
     )
   ENDIF()
   SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG})
@@ -465,7 +534,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
       Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa
     )
   ENDIF()
-  IF (KOKKOS_ARCH_INTEL_GEN)
+  IF (KOKKOS_ARCH_INTEL_GPU)
     COMPILER_SPECIFIC_FLAGS(
       IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__
     )
@@ -485,7 +554,27 @@ IF (KOKKOS_ENABLE_SYCL)
     ENDIF()
   ELSEIF(KOKKOS_ARCH_INTEL_GEN)
     COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl"
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN9)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_DG1)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp"
     )
   ENDIF()
 ENDIF()
diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake
index 23847263a9..5afed4fb0e 100644
--- a/lib/kokkos/cmake/kokkos_compiler_id.cmake
+++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake
@@ -137,7 +137,7 @@ SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Clang      4.0.0 or higher"
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    GCC        5.3.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Intel     17.0.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    NVCC      9.2.88 or higher")
-SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    HIPCC      3.8.0 or higher")
+SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    HIPCC      4.2.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    PGI         17.4 or higher\n")
 
 IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
@@ -158,13 +158,23 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
   ENDIF()
   SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE)
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
-  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0)
+  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.2.0)
     MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
   ENDIF()
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI)
   IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.4)
     MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
   ENDIF()
+  # Treat PGI internally as NVHPC to simplify handling both compilers.
+  # Before CMake 3.20 NVHPC was identified as PGI, nvc++ is
+  # backward-compatible to pgc++.
+  SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE)
+ENDIF()
+
+IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID)
+  SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID})
+ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI)
+  SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE)
 ENDIF()
 
 STRING(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION})
diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake
index d7f83ddbdf..7fd0794036 100644
--- a/lib/kokkos/cmake/kokkos_enable_devices.cmake
+++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake
@@ -62,7 +62,7 @@ IF(KOKKOS_ENABLE_OPENMP)
       COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
       Clang      -Xcompiler ${ClangOpenMPFlag}
       IntelLLVM  -Xcompiler -fiopenmp
-      PGI        -Xcompiler -mp
+      NVHPC      -Xcompiler -mp
       Cray       NO-VALUE-SPECIFIED
       XL         -Xcompiler -qsmp=omp
       DEFAULT    -Xcompiler -fopenmp
@@ -72,7 +72,7 @@ IF(KOKKOS_ENABLE_OPENMP)
       Clang      ${ClangOpenMPFlag}
       IntelLLVM  -fiopenmp
       AppleClang -Xpreprocessor -fopenmp
-      PGI        -mp
+      NVHPC      -mp
       Cray       NO-VALUE-SPECIFIED
       XL         -qsmp=omp
       DEFAULT    -fopenmp
@@ -94,7 +94,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
     Clang      ${ClangOpenMPFlag} -Wno-openmp-mapping
     IntelLLVM  -fiopenmp -Wno-openmp-mapping
     XL         -qsmp=omp -qoffload -qnoeh
-    PGI        -mp=gpu
+    NVHPC      -mp=gpu
     DEFAULT    -fopenmp
   )
   COMPILER_SPECIFIC_DEFS(
diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake
index 95bce66c7b..4cb8bd20f5 100644
--- a/lib/kokkos/cmake/kokkos_enable_options.cmake
+++ b/lib/kokkos/cmake/kokkos_enable_options.cmake
@@ -26,9 +26,16 @@ KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID)
 # Put a check in just in case people are using this option
 KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE)
 
+# Set the Default for Desul Atomics usage.
+set(_DESUL_ATOMICS_DEFAULT ON)
+
 KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for CUDA")
 KOKKOS_ENABLE_OPTION(CUDA_UVM             OFF "Whether to use unified memory (UM) for CUDA by default")
 KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC   OFF "Whether to use CUDA LDG intrinsics")
+# As of 08/12/2021 CudaMallocAsync causes issues if UCX is used as MPI communication layer.
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC      OFF  "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
+KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3    ON "Whether code deprecated in major release 3 is available" )
+KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
 KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for HIP")
 KOKKOS_ENABLE_OPTION(HPX_ASYNC_DISPATCH   OFF "Whether HPX supports asynchronous dispatch")
 KOKKOS_ENABLE_OPTION(TESTS         OFF  "Whether to build the unit tests")
@@ -50,6 +57,9 @@ KOKKOS_ENABLE_OPTION(TUNING               OFF "Whether to create bindings for tu
 KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops")
 KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER      ON  "Whether to potentially use the launch compiler")
 
+# This option will go away eventually, but allows fallback to old implementation when needed.
+KOKKOS_ENABLE_OPTION(IMPL_DESUL_ATOMICS   ON  "Whether to use desul based atomics - option only during beta")
+
 IF (KOKKOS_ENABLE_CUDA)
   SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}")
 ENDIF()
diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake
index e1a3e5f8bd..02c9a911b1 100644
--- a/lib/kokkos/cmake/kokkos_functions.cmake
+++ b/lib/kokkos/cmake/kokkos_functions.cmake
@@ -773,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET)
 ENDFUNCTION()
 
 FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER)
-  SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu)
+  SET(COMPILERS NVIDIA NVHPC XL XLClang DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu)
   CMAKE_PARSE_ARGUMENTS(
     PARSE
     "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES"
diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake
index 707fb000af..1eb0592c7f 100644
--- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake
+++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake
@@ -140,7 +140,7 @@ IF (NOT KOKKOS_CXX_STANDARD_FEATURE)
   IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray)
     INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake)
     kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
-  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI)
+  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
     INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake)
     kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
   ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel)
diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake
index d8d044c9d7..51bad521c4 100644
--- a/lib/kokkos/cmake/kokkos_tpls.cmake
+++ b/lib/kokkos/cmake/kokkos_tpls.cmake
@@ -67,6 +67,12 @@ SET(PTHREAD_DEFAULT OFF)
 ENDIF()
 KOKKOS_TPL_OPTION(PTHREAD ${PTHREAD_DEFAULT} TRIBITS Pthread)
 
+IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath)
+  SET(LIBQUADMATH_DEFAULT ON)
+ELSE()
+  SET(LIBQUADMATH_DEFAULT OFF)
+ENDIF()
+KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath)
 
 #Make sure we use our local FindKokkosCuda.cmake
 KOKKOS_IMPORT_TPL(HPX INTERFACE)
@@ -78,6 +84,7 @@ KOKKOS_IMPORT_TPL(LIBDL)
 KOKKOS_IMPORT_TPL(MEMKIND)
 KOKKOS_IMPORT_TPL(PTHREAD INTERFACE)
 KOKKOS_IMPORT_TPL(ROCM INTERFACE)
+KOKKOS_IMPORT_TPL(LIBQUADMATH)
 
 #Convert list to newlines (which CMake doesn't always like in cache variables)
 STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}")
diff --git a/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake
new file mode 100644
index 0000000000..1f7587da80
--- /dev/null
+++ b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake
@@ -0,0 +1,46 @@
+# @HEADER
+# ************************************************************************
+#
+#                        Kokkos v. 3.0
+#       Copyright (2020) National Technology & Engineering
+#               Solutions of Sandia, LLC (NTESS).
+#
+# Under the terms of Contract DE-NA0003525 with NTESS,
+# the U.S. Government retains certain rights in this software.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+#
+# ************************************************************************
+# @HEADER
+
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath
+  REQUIRED_HEADERS quadmath.h
+  REQUIRED_LIBS_NAMES quadmath
+)
diff --git a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp
index 8c507c7662..7ed9a0271a 100644
--- a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp
+++ b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp
@@ -48,7 +48,7 @@
 #include <Kokkos_DynRankView.hpp>
 #include <vector>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 // Compare performance of DynRankView to View, specific focus on the parenthesis
 // operators
diff --git a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
index 65de551b27..16b74a4997 100644
--- a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
+++ b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
@@ -48,7 +48,7 @@
 #include <vector>
 #include <algorithm>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 // This test will simulate global ids
 
diff --git a/lib/kokkos/containers/performance_tests/TestScatterView.hpp b/lib/kokkos/containers/performance_tests/TestScatterView.hpp
index 0f3ba103ef..8a23f59d32 100644
--- a/lib/kokkos/containers/performance_tests/TestScatterView.hpp
+++ b/lib/kokkos/containers/performance_tests/TestScatterView.hpp
@@ -46,7 +46,7 @@
 #define KOKKOS_TEST_SCATTER_VIEW_HPP
 
 #include <Kokkos_ScatterView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 namespace Perf {
 
diff --git a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
index c31412552a..4547d5c357 100644
--- a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
+++ b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
@@ -43,7 +43,7 @@
 #ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
 #define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 #include <iostream>
 #include <iomanip>
diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp
index ea1d6dde5d..c5b66f05a3 100644
--- a/lib/kokkos/containers/src/Kokkos_Bitset.hpp
+++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp
@@ -76,20 +76,25 @@ class Bitset {
   using execution_space = Device;
   using size_type       = unsigned int;
 
-  enum { BIT_SCAN_REVERSE = 1u };
-  enum { MOVE_HINT_BACKWARD = 2u };
+  static constexpr unsigned BIT_SCAN_REVERSE   = 1u;
+  static constexpr unsigned MOVE_HINT_BACKWARD = 2u;
 
-  enum {
-    BIT_SCAN_FORWARD_MOVE_HINT_FORWARD  = 0u,
-    BIT_SCAN_REVERSE_MOVE_HINT_FORWARD  = BIT_SCAN_REVERSE,
-    BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD,
-    BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD
-  };
+  static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u;
+  static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_FORWARD =
+      BIT_SCAN_REVERSE;
+  static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD =
+      MOVE_HINT_BACKWARD;
+  static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD =
+      BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD;
 
  private:
-  enum { block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT) };
-  enum { block_mask = block_size - 1u };
-  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
+  enum : unsigned {
+    block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT)
+  };
+  enum : unsigned { block_mask = block_size - 1u };
+  enum : unsigned {
+    block_shift = Kokkos::Impl::integral_power_of_two(block_size)
+  };
 
  public:
   /// constructor
@@ -317,14 +322,18 @@ class ConstBitset {
   enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
 
  public:
+  KOKKOS_FUNCTION
   ConstBitset() : m_size(0) {}
 
+  KOKKOS_FUNCTION
   ConstBitset(Bitset<Device> const& rhs)
       : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}
 
+  KOKKOS_FUNCTION
   ConstBitset(ConstBitset<Device> const& rhs)
       : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}
 
+  KOKKOS_FUNCTION
   ConstBitset<Device>& operator=(Bitset<Device> const& rhs) {
     this->m_size   = rhs.m_size;
     this->m_blocks = rhs.m_blocks;
@@ -332,6 +341,7 @@ class ConstBitset {
     return *this;
   }
 
+  KOKKOS_FUNCTION
   ConstBitset<Device>& operator=(ConstBitset<Device> const& rhs) {
     this->m_size   = rhs.m_size;
     this->m_blocks = rhs.m_blocks;
diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp
index 45710d1f73..f55d0f2b7f 100644
--- a/lib/kokkos/containers/src/Kokkos_DualView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp
@@ -597,8 +597,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     }
     if (std::is_same<typename t_host::memory_space,
                      typename t_dev::memory_space>::value) {
-      typename t_dev::execution_space().fence();
-      typename t_host::execution_space().fence();
+      typename t_dev::execution_space().fence(
+          "Kokkos::DualView<>::sync: fence after syncing DualView");
+      typename t_host::execution_space().fence(
+          "Kokkos::DualView<>::sync: fence after syncing DualView");
     }
   }
 
@@ -776,10 +778,11 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   /// If \c Device is the same as this DualView's device type, then
   /// mark the device's data as modified.  Otherwise, mark the host's
   /// data as modified.
-  template <class Device>
+  template <class Device, class Dummy = DualView,
+            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
+                nullptr>
   void modify() {
     if (modified_flags.data() == nullptr) return;
-    if (impl_dualview_is_single_device::value) return;
     int dev = get_device_side<Device>();
 
     if (dev == 1) {  // if Device is the same as DualView's device type
@@ -811,8 +814,17 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
 #endif
   }
 
+  template <
+      class Device, class Dummy = DualView,
+      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
+  void modify() {
+    return;
+  }
+
+  template <class Dummy = DualView,
+            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
+                nullptr>
   inline void modify_host() {
-    if (impl_dualview_is_single_device::value) return;
     if (modified_flags.data() != nullptr) {
       modified_flags(0) =
           (modified_flags(1) > modified_flags(0) ? modified_flags(1)
@@ -832,8 +844,17 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     }
   }
 
+  template <
+      class Dummy = DualView,
+      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
+  inline void modify_host() {
+    return;
+  }
+
+  template <class Dummy = DualView,
+            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
+                nullptr>
   inline void modify_device() {
-    if (impl_dualview_is_single_device::value) return;
     if (modified_flags.data() != nullptr) {
       modified_flags(1) =
           (modified_flags(1) > modified_flags(0) ? modified_flags(1)
@@ -853,6 +874,13 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     }
   }
 
+  template <
+      class Dummy = DualView,
+      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
+  inline void modify_device() {
+    return;
+  }
+
   inline void clear_sync_state() {
     if (modified_flags.data() != nullptr)
       modified_flags(1) = modified_flags(0) = 0;
@@ -875,8 +903,15 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-    ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-    h_view = create_mirror_view(d_view);
+    const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+    const bool sizeMismatch =
+        Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents);
+
+    if (sizeMismatch) {
+      ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
+      h_view = create_mirror_view(d_view);
+    } else
+      ::Kokkos::deep_copy(d_view, typename t_dev::value_type{});
 
     /* Reset dirty flags */
     if (modified_flags.data() == nullptr) {
@@ -897,41 +932,31 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
               const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+    const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+    const bool sizeMismatch =
+        Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents);
+
     if (modified_flags.data() == nullptr) {
       modified_flags = t_modified_flags("DualView::modified_flags");
     }
     if (modified_flags(1) >= modified_flags(0)) {
       /* Resize on Device */
-      ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-      h_view = create_mirror_view(d_view);
-
-      /* Mark Device copy as modified */
-      modified_flags(1) = modified_flags(1) + 1;
+      if (sizeMismatch) {
+        ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
+        h_view = create_mirror_view(d_view);
 
+        /* Mark Device copy as modified */
+        modified_flags(1) = modified_flags(1) + 1;
+      }
     } else {
       /* Realloc on Device */
-
-      ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-
-      const bool sizeMismatch =
-          (h_view.extent(0) != n0) || (h_view.extent(1) != n1) ||
-          (h_view.extent(2) != n2) || (h_view.extent(3) != n3) ||
-          (h_view.extent(4) != n4) || (h_view.extent(5) != n5) ||
-          (h_view.extent(6) != n6) || (h_view.extent(7) != n7);
-      if (sizeMismatch)
+      if (sizeMismatch) {
         ::Kokkos::resize(h_view, n0, n1, n2, n3, n4, n5, n6, n7);
+        d_view = create_mirror_view(typename t_dev::execution_space(), h_view);
 
-      t_host temp_view = create_mirror_view(d_view);
-
-      /* Remap on Host */
-      Kokkos::deep_copy(temp_view, h_view);
-
-      h_view = temp_view;
-
-      d_view = create_mirror_view(typename t_dev::execution_space(), h_view);
-
-      /* Mark Host copy as modified */
-      modified_flags(0) = modified_flags(0) + 1;
+        /* Mark Host copy as modified */
+        modified_flags(0) = modified_flags(0) + 1;
+      }
     }
   }
 
diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
index c6323fef93..b673c53a4e 100644
--- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -1140,7 +1140,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
     // to avoid incomplete type errors from usng Kokkos::Cuda directly.
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::DynRankView<>::DynRankView: fence before UVM allocation");
     }
 #endif
     //------------------------------------------------------------
@@ -1154,7 +1155,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
 #if defined(KOKKOS_ENABLE_CUDA)
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::DynRankView<>::DynRankView: fence after UVM allocation");
     }
 #endif
     //------------------------------------------------------------
@@ -1404,7 +1406,7 @@ class ViewMapping<
 
   template <class MemoryTraits>
   struct apply {
-    static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, "");
+    static_assert(Kokkos::is_memory_traits<MemoryTraits>::value, "");
 
     using traits_type =
         Kokkos::ViewTraits<data_type, array_layout,
@@ -1574,7 +1576,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(const DynRankView<LT, LP...>& lhs,
 namespace Kokkos {
 namespace Impl {
 
-template <class OutputView, typename Enable = void>
+template <class OutputView, class Enable = void>
 struct DynRankViewFill {
   using const_value_type = typename OutputView::traits::const_value_type;
 
@@ -1693,9 +1695,11 @@ inline void deep_copy(
                    typename ViewTraits<DT, DP...>::value_type>::value,
       "deep_copy requires non-const type");
 
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(DynRankView, value_type): fence before filling view");
   Kokkos::Impl::DynRankViewFill<DynRankView<DT, DP...> >(dst, value);
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(DynRankView, value_type): fence after filling view");
 }
 
 /** \brief  Deep copy into a value in Host memory from a view.  */
@@ -1711,10 +1715,13 @@ inline void deep_copy(
 
   using src_traits       = ViewTraits<ST, SP...>;
   using src_memory_space = typename src_traits::memory_space;
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(value_type, DynRankView): fence before copying "
+      "value");
   Kokkos::Impl::DeepCopy<HostSpace, src_memory_space>(&dst, src.data(),
                                                       sizeof(ST));
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(value_type, DynRankView): fence after copying value");
 }
 
 //----------------------------------------------------------------------------
@@ -1744,14 +1751,14 @@ inline void deep_copy(
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   if ((void*)dst.data() != (void*)src.data()) {
@@ -1762,10 +1769,14 @@ inline void deep_copy(
     // memory then can byte-wise copy
     if (rank(src) == 0 && rank(dst) == 0) {
       using value_type = typename dst_type::value_type;
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "copying rank-0 views");
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), sizeof(value_type));
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "copying rank-0 views");
     } else if (std::is_same<
                    typename DstType::traits::value_type,
                    typename SrcType::traits::non_const_value_type>::value &&
@@ -1787,10 +1798,14 @@ inline void deep_copy(
                dst.extent(6) == src.extent(6) &&
                dst.extent(7) == src.extent(7)) {
       const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "copying rank-1 views");
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), nbytes);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "copying rank-1 views");
     } else if (std::is_same<
                    typename DstType::traits::value_type,
                    typename SrcType::traits::non_const_value_type>::value &&
@@ -1817,29 +1832,43 @@ inline void deep_copy(
                dst.stride_6() == src.stride_6() &&
                dst.stride_7() == src.stride_7()) {
       const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "copying rank-1 views");
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), nbytes);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "copying rank-1 views");
     } else if (DstExecCanAccessSrc) {
       // Copying data between views in accessible memory spaces and either
       // non-contiguous or incompatible shape.
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "remapping views of incompatible shape");
       Kokkos::Impl::DynRankViewRemap<dst_type, src_type>(dst, src);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "remapping views of incompatible shape");
     } else if (SrcExecCanAccessDst) {
       // Copying data between views in accessible memory spaces and either
       // non-contiguous or incompatible shape.
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "remapping views of incompatible shape");
       Kokkos::Impl::DynRankViewRemap<dst_type, src_type, src_execution_space>(
           dst, src);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "remapping views of incompatible shape");
     } else {
       Kokkos::Impl::throw_runtime_exception(
           "deep_copy given views that would require a temporary allocation");
     }
   } else {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence due to same "
+        "src and dst");
   }
 }
 
diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
index cc949d4c55..4acae56970 100644
--- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -53,36 +53,203 @@
 namespace Kokkos {
 namespace Experimental {
 
-// Simple metafunction for choosing memory space
-// In the current implementation, if memory_space == CudaSpace,
-// use CudaUVMSpace for the chunk 'array' allocation, which
-// contains will contain pointers to chunks of memory allocated
-// in CudaSpace
 namespace Impl {
-template <class MemSpace>
-struct ChunkArraySpace {
-  using memory_space = MemSpace;
+
+/// Utility class to manage memory for chunked arrays on the host and
+/// device. Allocates/deallocates memory on both the host and device along with
+/// providing utilities for creating mirrors and deep copying between them.
+template <typename MemorySpace, typename ValueType>
+struct ChunkedArrayManager {
+  using value_type   = ValueType;
+  using pointer_type = ValueType*;
+  using track_type   = Kokkos::Impl::SharedAllocationTracker;
+
+  ChunkedArrayManager()                           = default;
+  ChunkedArrayManager(ChunkedArrayManager const&) = default;
+  ChunkedArrayManager(ChunkedArrayManager&&)      = default;
+  ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default;
+  ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default;
+
+  template <typename Space, typename Value>
+  friend struct ChunkedArrayManager;
+
+  template <typename Space, typename Value>
+  inline ChunkedArrayManager(const ChunkedArrayManager<Space, Value>& rhs)
+      : m_valid(rhs.m_valid),
+        m_chunk_max(rhs.m_chunk_max),
+        m_chunks((ValueType**)(rhs.m_chunks)),
+        m_track(rhs.m_track),
+        m_chunk_size(rhs.m_chunk_size) {
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<MemorySpace, Space>::assignable,
+        "Incompatible ChunkedArrayManager copy construction");
+  }
+
+  ChunkedArrayManager(const unsigned arg_chunk_max,
+                      const unsigned arg_chunk_size)
+      : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {}
+
+ private:
+  struct ACCESSIBLE_TAG {};
+  struct INACCESSIBLE_TAG {};
+
+  ChunkedArrayManager(ACCESSIBLE_TAG, pointer_type* arg_chunks,
+                      const unsigned arg_chunk_max)
+      : m_valid(true), m_chunk_max(arg_chunk_max), m_chunks(arg_chunks) {}
+
+  ChunkedArrayManager(INACCESSIBLE_TAG, const unsigned arg_chunk_max,
+                      const unsigned arg_chunk_size)
+      : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {}
+
+ public:
+  template <typename Space, typename Enable_ = void>
+  struct IsAccessibleFrom;
+
+  template <typename Space>
+  struct IsAccessibleFrom<
+      Space, typename std::enable_if_t<Kokkos::Impl::MemorySpaceAccess<
+                 MemorySpace, Space>::accessible>> : std::true_type {};
+
+  template <typename Space>
+  struct IsAccessibleFrom<
+      Space, typename std::enable_if_t<!Kokkos::Impl::MemorySpaceAccess<
+                 MemorySpace, Space>::accessible>> : std::false_type {};
+
+  template <typename Space>
+  static ChunkedArrayManager<Space, ValueType> create_mirror(
+      ChunkedArrayManager<MemorySpace, ValueType> const& other,
+      typename std::enable_if<IsAccessibleFrom<Space>::value>::type* =
+          nullptr) {
+    return ChunkedArrayManager<Space, ValueType>{
+        ACCESSIBLE_TAG{}, other.m_chunks, other.m_chunk_max};
+  }
+
+  template <typename Space>
+  static ChunkedArrayManager<Space, ValueType> create_mirror(
+      ChunkedArrayManager<MemorySpace, ValueType> const& other,
+      typename std::enable_if<!IsAccessibleFrom<Space>::value>::type* =
+          nullptr) {
+    using tag_type =
+        typename ChunkedArrayManager<Space, ValueType>::INACCESSIBLE_TAG;
+    return ChunkedArrayManager<Space, ValueType>{tag_type{}, other.m_chunk_max,
+                                                 other.m_chunk_size};
+  }
+
+ public:
+  void allocate_device(const std::string& label) {
+    if (m_chunks == nullptr) {
+      m_chunks = reinterpret_cast<pointer_type*>(MemorySpace().allocate(
+          label.c_str(), (sizeof(pointer_type) * (m_chunk_max + 2))));
+    }
+  }
+
+  void initialize() {
+    for (unsigned i = 0; i < m_chunk_max + 2; i++) {
+      m_chunks[i] = nullptr;
+    }
+    m_valid = true;
+  }
+
+ private:
+  /// Custom destroy functor for deallocating array chunks along with a linked
+  /// allocation
+  template <typename Space>
+  struct Destroy {
+    Destroy()               = default;
+    Destroy(Destroy&&)      = default;
+    Destroy(const Destroy&) = default;
+    Destroy& operator=(Destroy&&) = default;
+    Destroy& operator=(const Destroy&) = default;
+
+    Destroy(std::string label, value_type** arg_chunk,
+            const unsigned arg_chunk_max, const unsigned arg_chunk_size,
+            value_type** arg_linked)
+        : m_label(label),
+          m_chunks(arg_chunk),
+          m_linked(arg_linked),
+          m_chunk_max(arg_chunk_max),
+          m_chunk_size(arg_chunk_size) {}
+
+    void execute() {
+      // Destroy the array of chunk pointers.
+      // Two entries beyond the max chunks are allocation counters.
+      uintptr_t const len =
+          *reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
+      for (unsigned i = 0; i < len; i++) {
+        Space().deallocate(m_label.c_str(), m_chunks[i],
+                           sizeof(value_type) * m_chunk_size);
+      }
+      // Destroy the linked allocation if we have one.
+      if (m_linked != nullptr) {
+        Space().deallocate(m_label.c_str(), m_linked,
+                           (sizeof(value_type*) * (m_chunk_max + 2)));
+      }
+    }
+
+    void destroy_shared_allocation() { execute(); }
+
+    std::string m_label;
+    value_type** m_chunks = nullptr;
+    value_type** m_linked = nullptr;
+    unsigned m_chunk_max;
+    unsigned m_chunk_size;
+  };
+
+ public:
+  template <typename Space>
+  void allocate_with_destroy(const std::string& label,
+                             pointer_type* linked_allocation = nullptr) {
+    using destroy_type = Destroy<Space>;
+    using record_type =
+        Kokkos::Impl::SharedAllocationRecord<MemorySpace, destroy_type>;
+
+    // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] ==
+    // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent This must match in
+    // Destroy's execute(...) method
+    record_type* const record = record_type::allocate(
+        MemorySpace(), label, (sizeof(pointer_type) * (m_chunk_max + 2)));
+    m_chunks = static_cast<pointer_type*>(record->data());
+    m_track.assign_allocated_record_to_uninitialized(record);
+
+    record->m_destroy = destroy_type(label, m_chunks, m_chunk_max, m_chunk_size,
+                                     linked_allocation);
+  }
+
+  pointer_type* get_ptr() const { return m_chunks; }
+
+  template <typename Space>
+  typename std::enable_if<!IsAccessibleFrom<Space>::value>::type deep_copy_to(
+      ChunkedArrayManager<Space, ValueType> const& other) {
+    Kokkos::Impl::DeepCopy<Space, MemorySpace>(
+        other.m_chunks, m_chunks, sizeof(pointer_type) * (m_chunk_max + 2));
+  }
+
+  template <typename Space>
+  typename std::enable_if<IsAccessibleFrom<Space>::value>::type deep_copy_to(
+      ChunkedArrayManager<Space, ValueType> const&) {
+    // no-op
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  pointer_type* operator+(int i) const { return m_chunks + i; }
+
+  KOKKOS_INLINE_FUNCTION
+  pointer_type& operator[](int i) const { return m_chunks[i]; }
+
+  track_type const& track() const { return m_track; }
+
+  KOKKOS_INLINE_FUNCTION
+  bool valid() const { return m_valid; }
+
+ private:
+  bool m_valid           = false;
+  unsigned m_chunk_max   = 0;
+  pointer_type* m_chunks = nullptr;
+  track_type m_track;
+  unsigned m_chunk_size = 0;
 };
 
-#ifdef KOKKOS_ENABLE_CUDA
-template <>
-struct ChunkArraySpace<Kokkos::CudaSpace> {
-  using memory_space = typename Kokkos::CudaUVMSpace;
-};
-#endif
-#ifdef KOKKOS_ENABLE_HIP
-template <>
-struct ChunkArraySpace<Kokkos::Experimental::HIPSpace> {
-  using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace;
-};
-#endif
-#ifdef KOKKOS_ENABLE_SYCL
-template <>
-struct ChunkArraySpace<Kokkos::Experimental::SYCLDeviceUSMSpace> {
-  using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace;
-};
-#endif
-}  // end namespace Impl
+} /* end namespace Impl */
 
 /** \brief Dynamic views are restricted to rank-one and no layout.
  *         Resize only occurs on host outside of parallel_regions.
@@ -93,6 +260,13 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
  public:
   using traits = Kokkos::ViewTraits<DataType, P...>;
 
+  using value_type   = typename traits::value_type;
+  using device_space = typename traits::memory_space;
+  using host_space =
+      typename Kokkos::Impl::HostMirror<device_space>::Space::memory_space;
+  using device_accessor = Impl::ChunkedArrayManager<device_space, value_type>;
+  using host_accessor   = Impl::ChunkedArrayManager<host_space, value_type>;
+
  private:
   template <class, class...>
   friend class DynamicView;
@@ -108,7 +282,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
                 "DynamicView only implemented for non-specialized View type");
 
   template <class Space, bool = Kokkos::Impl::MemorySpaceAccess<
-                             Space, typename traits::memory_space>::accessible>
+                             Space, device_space>::accessible>
   struct verify_space {
     KOKKOS_FORCEINLINE_FUNCTION static void check() {}
   };
@@ -123,9 +297,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   };
 
  private:
-  track_type m_track;
-  typename traits::value_type** m_chunks =
-      nullptr;             // array of pointers to 'chunks' of memory
+  device_accessor m_chunks;
+  host_accessor m_chunks_host;
   unsigned m_chunk_shift;  // ceil(log2(m_chunk_size))
   unsigned m_chunk_mask;   // m_chunk_size - 1
   unsigned m_chunk_max;  // number of entries in the chunk array - each pointing
@@ -173,7 +346,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
 
   KOKKOS_INLINE_FUNCTION
   size_t allocation_extent() const noexcept {
-    uintptr_t n = *reinterpret_cast<const uintptr_t*>(m_chunks + m_chunk_max);
+    uintptr_t n =
+        *reinterpret_cast<const uintptr_t*>(m_chunks_host + m_chunk_max);
     return (n << m_chunk_shift);
   }
 
@@ -183,7 +357,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   KOKKOS_INLINE_FUNCTION
   size_t size() const noexcept {
     size_t extent_0 =
-        *reinterpret_cast<const size_t*>(m_chunks + m_chunk_max + 1);
+        *reinterpret_cast<const size_t*>(m_chunks_host + m_chunk_max + 1);
     return extent_0;
   }
 
@@ -215,10 +389,10 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   // Allocation tracking properties
 
   KOKKOS_INLINE_FUNCTION
-  int use_count() const { return m_track.use_count(); }
+  int use_count() const { return m_chunks_host.track().use_count(); }
 
   inline const std::string label() const {
-    return m_track.template get_label<typename traits::memory_space>();
+    return m_chunks_host.track().template get_label<host_space>();
   }
 
   //----------------------------------------------------------------------
@@ -285,13 +459,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
    *          up to the maximum number of chunks
    * */
   template <typename IntType>
-  inline typename std::enable_if<
-      std::is_integral<IntType>::value &&
-      Kokkos::Impl::MemorySpaceAccess<
-          Kokkos::HostSpace,
-          typename Impl::ChunkArraySpace<
-              typename traits::memory_space>::memory_space>::accessible>::type
-  resize_serial(IntType const& n) {
+  inline void resize_serial(IntType const& n) {
     using local_value_type   = typename traits::value_type;
     using value_pointer_type = local_value_type*;
 
@@ -304,37 +472,40 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
     }
 
     // *m_chunks[m_chunk_max] stores the current number of chunks being used
-    uintptr_t* const pc = reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
-    std::string _label =
-        m_track.template get_label<typename traits::memory_space>();
+    uintptr_t* const pc =
+        reinterpret_cast<uintptr_t*>(m_chunks_host + m_chunk_max);
+    std::string _label = m_chunks_host.track().template get_label<host_space>();
+
     if (*pc < NC) {
       while (*pc < NC) {
-        m_chunks[*pc] = reinterpret_cast<value_pointer_type>(
-            typename traits::memory_space().allocate(
+        m_chunks_host[*pc] =
+            reinterpret_cast<value_pointer_type>(device_space().allocate(
                 _label.c_str(), sizeof(local_value_type) << m_chunk_shift));
         ++*pc;
       }
     } else {
       while (NC + 1 <= *pc) {
         --*pc;
-        typename traits::memory_space().deallocate(
-            _label.c_str(), m_chunks[*pc],
-            sizeof(local_value_type) << m_chunk_shift);
-        m_chunks[*pc] = nullptr;
+        device_space().deallocate(_label.c_str(), m_chunks_host[*pc],
+                                  sizeof(local_value_type) << m_chunk_shift);
+        m_chunks_host[*pc] = nullptr;
       }
     }
-    // *m_chunks[m_chunk_max+1] stores the 'extent' requested by resize
+    // *m_chunks_host[m_chunk_max+1] stores the 'extent' requested by resize
     *(pc + 1) = n;
+
+    m_chunks_host.deep_copy_to(m_chunks);
   }
 
   KOKKOS_INLINE_FUNCTION bool is_allocated() const {
-    if (m_chunks == nullptr) {
-      return false;
-    } else {
-      // *m_chunks[m_chunk_max] stores the current number of chunks being used
+    if (m_chunks_host.valid()) {
+      // *m_chunks_host[m_chunk_max] stores the current number of chunks being
+      // used
       uintptr_t* const pc =
-          reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
+          reinterpret_cast<uintptr_t*>(m_chunks_host + m_chunk_max);
       return (*(pc + 1) > 0);
+    } else {
+      return false;
     }
   }
 
@@ -349,8 +520,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
 
   template <class RT, class... RP>
   DynamicView(const DynamicView<RT, RP...>& rhs)
-      : m_track(rhs.m_track),
-        m_chunks((typename traits::value_type**)rhs.m_chunks),
+      : m_chunks(rhs.m_chunks),
+        m_chunks_host(rhs.m_chunks_host),
         m_chunk_shift(rhs.m_chunk_shift),
         m_chunk_mask(rhs.m_chunk_mask),
         m_chunk_max(rhs.m_chunk_max),
@@ -361,63 +532,6 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
                   "Incompatible DynamicView copy construction");
   }
 
-  //----------------------------------------------------------------------
-
-  struct Destroy {
-    using local_value_type = typename traits::value_type;
-    std::string m_label;
-    local_value_type** m_chunks;
-    unsigned m_chunk_max;
-    bool m_destroy;
-    unsigned m_chunk_size;
-
-    // Initialize or destroy array of chunk pointers.
-    // Two entries beyond the max chunks are allocation counters.
-    inline void operator()(unsigned i) const {
-      if (m_destroy && i < m_chunk_max && nullptr != m_chunks[i]) {
-        typename traits::memory_space().deallocate(
-            m_label.c_str(), m_chunks[i],
-            sizeof(local_value_type) * m_chunk_size);
-      }
-      m_chunks[i] = nullptr;
-    }
-
-    void execute(bool arg_destroy) {
-      using Range = Kokkos::RangePolicy<typename HostSpace::execution_space>;
-
-      m_destroy = arg_destroy;
-
-      Kokkos::Impl::ParallelFor<Destroy, Range> closure(
-          *this,
-          Range(0, m_chunk_max + 2));  // Add 2 to 'destroy' extra slots storing
-                                       // num_chunks and extent; previously + 1
-
-      closure.execute();
-
-      typename traits::execution_space().fence();
-      // Impl::ChunkArraySpace< typename traits::memory_space
-      // >::memory_space::execution_space().fence();
-    }
-
-    void construct_shared_allocation() { execute(false); }
-
-    void destroy_shared_allocation() { execute(true); }
-
-    Destroy()               = default;
-    Destroy(Destroy&&)      = default;
-    Destroy(const Destroy&) = default;
-    Destroy& operator=(Destroy&&) = default;
-    Destroy& operator=(const Destroy&) = default;
-
-    Destroy(std::string label, typename traits::value_type** arg_chunk,
-            const unsigned arg_chunk_max, const unsigned arg_chunk_size)
-        : m_label(label),
-          m_chunks(arg_chunk),
-          m_chunk_max(arg_chunk_max),
-          m_destroy(false),
-          m_chunk_size(arg_chunk_size) {}
-  };
-
   /**\brief  Allocation constructor
    *
    *  Memory is allocated in chunks
@@ -427,10 +541,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   explicit inline DynamicView(const std::string& arg_label,
                               const unsigned min_chunk_size,
                               const unsigned max_extent)
-      : m_track(),
-        m_chunks(nullptr)
-        // The chunk size is guaranteed to be a power of two
-        ,
+      :  // The chunk size is guaranteed to be a power of two
         m_chunk_shift(Kokkos::Impl::integral_power_of_two_that_contains(
             min_chunk_size))  // div ceil(log2(min_chunk_size))
         ,
@@ -440,28 +551,22 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
                     m_chunk_shift)  // max num pointers-to-chunks in array
         ,
         m_chunk_size(2 << (m_chunk_shift - 1)) {
-    using chunk_array_memory_space = typename Impl::ChunkArraySpace<
-        typename traits::memory_space>::memory_space;
-    // A functor to deallocate all of the chunks upon final destruction
-    using record_type =
-        Kokkos::Impl::SharedAllocationRecord<chunk_array_memory_space, Destroy>;
+    m_chunks = device_accessor(m_chunk_max, m_chunk_size);
 
-    // Allocate chunk pointers and allocation counter
-    record_type* const record =
-        record_type::allocate(chunk_array_memory_space(), arg_label,
-                              (sizeof(pointer_type) * (m_chunk_max + 2)));
-    // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] ==
-    // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent This must match in
-    // Destroy's execute(...) method
-
-    m_chunks = reinterpret_cast<pointer_type*>(record->data());
-
-    record->m_destroy = Destroy(arg_label, m_chunks, m_chunk_max, m_chunk_size);
-
-    // Initialize to zero
-    record->m_destroy.construct_shared_allocation();
-
-    m_track.assign_allocated_record_to_uninitialized(record);
+    if (device_accessor::template IsAccessibleFrom<host_space>::value) {
+      m_chunks.template allocate_with_destroy<device_space>(arg_label);
+      m_chunks.initialize();
+      m_chunks_host =
+          device_accessor::template create_mirror<host_space>(m_chunks);
+    } else {
+      m_chunks.allocate_device(arg_label);
+      m_chunks_host =
+          device_accessor::template create_mirror<host_space>(m_chunks);
+      m_chunks_host.template allocate_with_destroy<device_space>(
+          arg_label, m_chunks.get_ptr());
+      m_chunks_host.initialize();
+      m_chunks_host.deep_copy_to(m_chunks);
+    }
   }
 };
 
@@ -487,8 +592,8 @@ inline void deep_copy(const View<T, DP...>& dst,
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   if (DstExecCanAccessSrc) {
@@ -512,8 +617,8 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst,
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   if (DstExecCanAccessSrc) {
diff --git a/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp b/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp
index fbfaed9b1b..18f026dc6f 100644
--- a/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp
+++ b/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp
@@ -187,7 +187,8 @@ template <typename ReportType, typename DeviceType>
 void ErrorReporter<ReportType, DeviceType>::resize(const size_t new_size) {
   m_reports.resize(new_size);
   m_reporters.resize(new_size);
-  typename DeviceType::execution_space().fence();
+  typename DeviceType::execution_space().fence(
+      "Kokkos::Experimental::ErrorReporter::resize: fence after resizing");
 }
 
 }  // namespace Experimental
diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp
index 0f21a08ba3..57bf745d40 100644
--- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp
@@ -116,8 +116,7 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds(
       This check should cover the case of Views that don't
       have the Unmanaged trait but were initialized by pointer. */
     if (tracker.has_record()) {
-      Kokkos::Impl::operator_bounds_error_on_device<MapType>(
-          map, Kokkos::Impl::has_printable_label_typedef<MapType>());
+      Kokkos::Impl::operator_bounds_error_on_device(map);
     } else {
       Kokkos::abort("OffsetView bounds error");
     }
@@ -1244,7 +1243,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
     // to avoid incomplete type errors from usng Kokkos::Cuda directly.
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::OffsetView::OffsetView(): fence before UVM allocation");
     }
 #endif
     //------------------------------------------------------------
@@ -1256,7 +1256,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 #if defined(KOKKOS_ENABLE_CUDA)
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::OffsetView::OffsetView(): fence after UVM allocation");
     }
 #endif
     //------------------------------------------------------------
diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp
index dcd4cf73e5..79bc43b739 100644
--- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp
@@ -834,7 +834,7 @@ class ScatterView<DataType, Layout, DeviceType, Op, ScatterNonDuplicated,
     static_assert(std::is_same<typename dest_type::array_layout, Layout>::value,
                   "ScatterView contribute destination has different layout");
     static_assert(
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView contribute destination memory space not accessible");
     if (dest.data() == internal_view.data()) return;
@@ -1061,7 +1061,7 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
                                Kokkos::LayoutRight>::value,
                   "ScatterView deep_copy destination has different layout");
     static_assert(
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView deep_copy destination memory space not accessible");
     bool is_equal = (dest.data() == internal_view.data());
@@ -1290,7 +1290,7 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op,
                                Kokkos::LayoutLeft>::value,
                   "ScatterView deep_copy destination has different layout");
     static_assert(
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView deep_copy destination memory space not accessible");
     auto extent   = internal_view.extent(internal_view_type::rank - 1);
diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
index 81be3ee2d3..cd633e4031 100644
--- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
+++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@@ -405,7 +405,9 @@ class StaticCrsGraph {
     Kokkos::parallel_for("Kokkos::StaticCrsGraph::create_block_partitioning",
                          Kokkos::RangePolicy<execution_space>(0, numRows()),
                          partitioner);
-    typename device_type::execution_space().fence();
+    typename device_type::execution_space().fence(
+        "Kokkos::StaticCrsGraph::create_block_partitioning:: fence after "
+        "partition");
 
     row_block_offsets = block_offsets;
   }
diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
index edb0e7261d..a1601eee35 100644
--- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
+++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -345,7 +345,8 @@ class UnorderedMap {
       const impl_value_type tmp = impl_value_type();
       Kokkos::deep_copy(m_values, tmp);
     }
-    { Kokkos::deep_copy(m_scalars, 0); }
+    Kokkos::deep_copy(m_scalars, 0);
+    m_size = 0;
   }
 
   KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
@@ -393,9 +394,9 @@ class UnorderedMap {
   ///
   /// This method has undefined behavior when erasable() is true.
   ///
-  /// Note that this is not a device function; it cannot be called in
+  /// Note that this is <i>not</i> a device function; it cannot be called in
   /// a parallel kernel.  The value is not stored as a variable; it
-  /// must be computed.
+  /// must be computed. m_size is a mutable cache of that value.
   size_type size() const {
     if (capacity() == 0u) return 0u;
     if (modified()) {
@@ -419,9 +420,13 @@ class UnorderedMap {
   bool begin_erase() {
     bool result = !erasable();
     if (is_insertable_map && result) {
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::begin_erase: fence before setting erasable "
+          "flag");
       set_flag(erasable_idx);
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::begin_erase: fence after setting erasable "
+          "flag");
     }
     return result;
   }
@@ -429,10 +434,12 @@ class UnorderedMap {
   bool end_erase() {
     bool result = erasable();
     if (is_insertable_map && result) {
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::end_erase: fence before erasing");
       Impl::UnorderedMapErase<declared_map_type> f(*this);
       f.apply();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::end_erase: fence after erasing");
       reset_flag(erasable_idx);
     }
     return result;
diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp
index a1fbba6b21..88721bd89e 100644
--- a/lib/kokkos/containers/src/Kokkos_Vector.hpp
+++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp
@@ -119,12 +119,14 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
     if (DV::template need_sync<typename DV::t_dev::device_type>()) {
       set_functor_host f(DV::h_view, val);
       parallel_for("Kokkos::vector::assign", n, f);
-      typename DV::t_host::execution_space().fence();
+      typename DV::t_host::execution_space().fence(
+          "Kokkos::vector::assign: fence after assigning values");
       DV::template modify<typename DV::t_host::device_type>();
     } else {
       set_functor f(DV::d_view, val);
       parallel_for("Kokkos::vector::assign", n, f);
-      typename DV::t_dev::execution_space().fence();
+      typename DV::t_dev::execution_space().fence(
+          "Kokkos::vector::assign: fence after assigning values");
       DV::template modify<typename DV::t_dev::device_type>();
     }
   }
diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
index 6047e60f3d..9512f2d4a2 100644
--- a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
+++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@@ -57,22 +57,10 @@
 namespace Kokkos {
 namespace Impl {
 
-KOKKOS_FORCEINLINE_FUNCTION
-unsigned rotate_left(unsigned i, int r) {
-  constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
-  return r ? ((i << r) | (i >> (size - r))) : i;
-}
-
 KOKKOS_FORCEINLINE_FUNCTION
 unsigned rotate_right(unsigned i, int r) {
   constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
-  // FIXME_SYCL llvm.fshr.i32 missing
-  // (https://github.com/intel/llvm/issues/3308)
-#ifdef __SYCL_DEVICE_ONLY__
-  return rotate_left(i, size - r);
-#else
   return r ? ((i >> r) | (i << (size - r))) : i;
-#endif
 }
 
 template <typename Bitset>
diff --git a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
index 367ab33857..fdd78e4e5f 100644
--- a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
+++ b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
@@ -75,7 +75,7 @@ uint32_t fmix32(uint32_t h) {
 
 KOKKOS_INLINE_FUNCTION
 uint32_t MurmurHash3_x86_32(const void* key, int len, uint32_t seed) {
-  const uint8_t* data = (const uint8_t*)key;
+  const uint8_t* data = static_cast<const uint8_t*>(key);
   const int nblocks   = len / 4;
 
   uint32_t h1 = seed;
diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp
index 3eee85ed10..e22564aa5c 100644
--- a/lib/kokkos/containers/unit_tests/TestDualView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp
@@ -49,7 +49,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <cstdio>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_DualView.hpp>
 
 namespace Test {
diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
index dd0199ed81..a8d62bd24c 100644
--- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
@@ -702,6 +702,11 @@ class TestDynViewAPI {
 
   using View0 = Kokkos::View<T, device>;
   using View1 = Kokkos::View<T*, device>;
+  using View2 = Kokkos::View<T**, device>;
+  using View3 = Kokkos::View<T***, device>;
+  using View4 = Kokkos::View<T****, device>;
+  using View5 = Kokkos::View<T*****, device>;
+  using View6 = Kokkos::View<T******, device>;
   using View7 = Kokkos::View<T*******, device>;
 
   using host_view_space = typename View0::host_mirror_space;
@@ -1065,7 +1070,7 @@ class TestDynViewAPI {
 
     dView0 d_uninitialized(
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "uninit"), 10, 20);
-    ASSERT_TRUE(d_uninitialized.data() != nullptr);
+    ASSERT_NE(d_uninitialized.data(), nullptr);
     ASSERT_EQ(d_uninitialized.rank(), 2);
     ASSERT_EQ(d_uninitialized.extent(0), 10);
     ASSERT_EQ(d_uninitialized.extent(1), 20);
@@ -1075,14 +1080,14 @@ class TestDynViewAPI {
     hView0 hx, hy, hz;
 
     ASSERT_TRUE(Kokkos::is_dyn_rank_view<dView0>::value);
-    ASSERT_FALSE(Kokkos::is_dyn_rank_view<Kokkos::View<double> >::value);
+    ASSERT_FALSE(Kokkos::is_dyn_rank_view<Kokkos::View<double>>::value);
 
-    ASSERT_TRUE(dx.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(dy.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(dz.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(hx.data() == nullptr);
-    ASSERT_TRUE(hy.data() == nullptr);
-    ASSERT_TRUE(hz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(dy.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(dz.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(hx.data(), nullptr);
+    ASSERT_EQ(hy.data(), nullptr);
+    ASSERT_EQ(hz.data(), nullptr);
     ASSERT_EQ(dx.extent(0), 0u);  // Okay with UVM
     ASSERT_EQ(dy.extent(0), 0u);  // Okay with UVM
     ASSERT_EQ(dz.extent(0), 0u);  // Okay with UVM
@@ -1153,11 +1158,11 @@ class TestDynViewAPI {
 
     ASSERT_EQ(dx.use_count(), size_t(2));
 
-    ASSERT_FALSE(dx.data() == nullptr);
-    ASSERT_FALSE(const_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
+    ASSERT_NE(dx.data(), nullptr);
+    ASSERT_NE(const_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_from_ptr_dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
     ASSERT_NE(dx, dy);
 
     ASSERT_EQ(dx.extent(0), unsigned(N0));
@@ -1317,17 +1322,17 @@ class TestDynViewAPI {
     ASSERT_NE(dx, dz);
 
     dx = dView0();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
     dy = dView0();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
     dz = dView0();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_TRUE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_EQ(dz.data(), nullptr);
 
     // View - DynRankView Interoperability tests
     // deep_copy from view to dynrankview
@@ -1367,7 +1372,7 @@ class TestDynViewAPI {
   static void check_auto_conversion_to_const(
       const Kokkos::DynRankView<const DataType, device>& arg_const,
       const Kokkos::DynRankView<DataType, device>& arg) {
-    ASSERT_TRUE(arg_const == arg);
+    ASSERT_EQ(arg_const, arg);
   }
 
   static void run_test_allocated() {
@@ -1396,8 +1401,8 @@ class TestDynViewAPI {
     const_typeX xc = x;
     const_typeR xr = x;
 
-    ASSERT_TRUE(xc == x);
-    ASSERT_TRUE(x == xc);
+    ASSERT_EQ(xc, x);
+    ASSERT_EQ(x, xc);
 
     // For CUDA the constant random access View does not return
     // an lvalue reference due to retrieving through texture cache
@@ -1406,7 +1411,7 @@ class TestDynViewAPI {
     if (!std::is_same<typename device::execution_space, Kokkos::Cuda>::value)
 #endif
     {
-      ASSERT_TRUE(x.data() == xr.data());
+      ASSERT_EQ(x.data(), xr.data());
     }
 
     // typeX xf = xc ; // setting non-const from const must not compile
@@ -1659,29 +1664,29 @@ class TestDynViewAPI {
     const_svector_right_type cvr3 =
         Kokkos::subdynrankview(mv, Kokkos::ALL(), 2);
 
-    ASSERT_TRUE(&v1[0] == &v1(0));
-    ASSERT_TRUE(&v1[0] == &mv(0, 0));
-    ASSERT_TRUE(&v2[0] == &mv(0, 1));
-    ASSERT_TRUE(&v3[0] == &mv(0, 2));
+    ASSERT_EQ(&v1[0], &v1(0));
+    ASSERT_EQ(&v1[0], &mv(0, 0));
+    ASSERT_EQ(&v2[0], &mv(0, 1));
+    ASSERT_EQ(&v3[0], &mv(0, 2));
 
-    ASSERT_TRUE(&cv1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cv2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cv3[0] == &mv(0, 2));
+    ASSERT_EQ(&cv1[0], &mv(0, 0));
+    ASSERT_EQ(&cv2[0], &mv(0, 1));
+    ASSERT_EQ(&cv3[0], &mv(0, 2));
 
-    ASSERT_TRUE(&vr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&vr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&vr3[0] == &mv(0, 2));
+    ASSERT_EQ(&vr1[0], &mv(0, 0));
+    ASSERT_EQ(&vr2[0], &mv(0, 1));
+    ASSERT_EQ(&vr3[0], &mv(0, 2));
 
-    ASSERT_TRUE(&cvr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cvr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cvr3[0] == &mv(0, 2));
+    ASSERT_EQ(&cvr1[0], &mv(0, 0));
+    ASSERT_EQ(&cvr2[0], &mv(0, 1));
+    ASSERT_EQ(&cvr3[0], &mv(0, 2));
 
-    ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2));
-    ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3));
-    ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4));
-    ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2));
-    ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3));
-    ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4));
+    ASSERT_EQ(&mv1(0, 0), &mv(1, 2));
+    ASSERT_EQ(&mv1(1, 1), &mv(2, 3));
+    ASSERT_EQ(&mv1(3, 2), &mv(4, 4));
+    ASSERT_EQ(&mvr1(0, 0), &mv_right(1, 2));
+    ASSERT_EQ(&mvr1(1, 1), &mv_right(2, 3));
+    ASSERT_EQ(&mvr1(3, 2), &mv_right(4, 4));
 
     const_svector_type c_cv1(v1);
     typename svector_type::const_type c_cv2(v2);
diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
index f018793dd6..023bf92f62 100644
--- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -52,7 +52,7 @@
 #include <Kokkos_Core.hpp>
 
 #include <Kokkos_DynamicView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 namespace Test {
 
diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp
index 9ddc226e29..24a43e1ebc 100644
--- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp
@@ -50,7 +50,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <cstdio>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_OffsetView.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
 
diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp
index fdbce2d492..342ce2af48 100644
--- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp
@@ -118,11 +118,51 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
       scatter_access(k, 3)++;
       scatter_access(k, 4)--;
       scatter_access(k, 5) -= 5;
+// Workaround Intel 17 compiler bug which sometimes add random
+// instruction alignment which makes the lock instruction
+// illegal. Seems to be mostly just for unsigned int atomics.
+// Looking at the assembly the compiler
+// appears to insert cache line alignment for the instruction.
+// Isn't restricted to specific archs. Seen it on SNB and SKX, but for
+// different code. Another occurrence was with Desul atomics in
+// a different unit test. This one here happens without desul atomics.
+// Inserting an assembly nop instruction changes the alignment and
+// works round this.
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access_atomic(k, 6) += 2;
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access_atomic(k, 7)++;
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access_atomic(k, 8)--;
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       --scatter_access_atomic(k, 9);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       ++scatter_access_atomic(k, 10);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access(k, 11) -= 3;
     }
   }
diff --git a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
index a9a178f95e..c9a3eed90c 100644
--- a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
+++ b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
@@ -180,8 +180,6 @@ void run_test_graph3(size_t B, size_t N) {
 
   std::vector<size_t> sizes(LENGTH);
 
-  size_t total_length = 0;
-
   for (size_t i = 0; i < LENGTH; ++i) {
     sizes[i] = rand() % 1000;
   }
@@ -189,10 +187,6 @@ void run_test_graph3(size_t B, size_t N) {
   sizes[1]    = N;
   sizes[1998] = N;
 
-  for (size_t i = 0; i < LENGTH; ++i) {
-    total_length += sizes[i];
-  }
-
   int C    = 0;
   dView dx = Kokkos::create_staticcrsgraph<dView>("test", sizes);
   dx.create_block_partitioning(B, C);
diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp
index 4413cfbc80..8009b99656 100644
--- a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp
+++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp
@@ -295,10 +295,8 @@ void test_deep_copy(uint32_t num_nodes) {
 }
 
 // FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs
-// FIXME_HIP
 // WORKAROUND MSVC
-#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \
-    !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL)
+#if !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL)
 TEST(TEST_CATEGORY, UnorderedMap_insert) {
   for (int i = 0; i < 500; ++i) {
     test_insert<TEST_EXECSPACE>(100000, 90000, 100, true);
@@ -329,6 +327,23 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) {
   ASSERT_TRUE(n.is_allocated());
 }
 
+TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) {
+  using Map =
+      Kokkos::UnorderedMap<int, void, Kokkos::DefaultHostExecutionSpace>;
+
+  Map m(11);
+  ASSERT_EQ(0u, m.size());
+
+  m.insert(2);
+  m.insert(3);
+  m.insert(5);
+  m.insert(7);
+  ASSERT_EQ(4u, m.size());
+
+  m.clear();
+  ASSERT_EQ(0u, m.size());
+}
+
 }  // namespace Test
 
 #endif  // KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in
deleted file mode 100644
index f0835772b8..0000000000
--- a/lib/kokkos/core/cmake/KokkosCore_config.h.in
+++ /dev/null
@@ -1,104 +0,0 @@
-/* The trivial 'src/build_common.sh' creates a config
- * that must stay in sync with this file.
- */
-#cmakedefine KOKKOS_FOR_SIERRA
-
-#if !defined(KOKKOS_FOR_SIERRA)
-
-#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
-#error \
-    "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
-#else
-#define KOKKOS_CORE_CONFIG_H
-#endif
-
-#cmakedefine KOKKOS_ENABLE_CUDA
-#cmakedefine KOKKOS_ENABLE_HIP
-#cmakedefine KOKKOS_ENABLE_OPENMP
-#cmakedefine KOKKOS_ENABLE_THREADS
-#cmakedefine KOKKOS_ENABLE_SERIAL
-#cmakedefine KOKKOS_ENABLE_Winthread
-
-#cmakedefine KOKKOS_ENABLE_HWLOC
-#cmakedefine KOKKOS_ENABLE_HBWSPACE
-#cmakedefine KOKKOS_ENABLE_LIBRT
-
-#cmakedefine KOKKOS_ENABLE_DEBUG
-#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
-#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
-#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
-#cmakedefine KOKKOS_ENABLE_TUNING
-
-#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
-
-#ifdef KOKKOS_ENABLE_CUDA
-
-#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
-
-// mfh 16 Sep 2014: If passed in on the command line, that overrides
-// any value of KOKKOS_USE_CUDA_UVM here.  Doing this should prevent build
-// warnings like this one:
-//
-// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning:
-// "KOKKOS_USE_CUDA_UVM" redefined
-//
-// At some point, we should edit the test-build scripts in
-// Trilinos/cmake/ctest/drivers/perseus/, and take
-// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there.  I
-// hesitate to do that now, because I'm not sure if all the files are
-// including KokkosCore_config.h (or a header file that includes it) like
-// they should.
-#ifndef KOKKOS_USE_CUDA_UVM
-#cmakedefine KOKKOS_USE_CUDA_UVM
-#endif
-
-#cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-
-#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA
-
-#endif
-
-#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
-
-#ifndef __CUDA_ARCH__
-#cmakedefine KOKKOS_ENABLE_ISA_X86_64
-#cmakedefine KOKKOS_ENABLE_ISA_KNC
-#cmakedefine KOKKOS_ENABLE_ISA_POWERPCLE
-#endif
-
-#ifdef KOKKOS_ENABLE_HIP
-#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
-#endif
-
-#cmakedefine KOKKOS_ARCH_ARMV80 1
-#cmakedefine KOKKOS_ARCH_ARMV81 1
-#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX 1
-#cmakedefine KOKKOS_ARCH_AVX 1
-#cmakedefine KOKKOS_ARCH_AVX2 1
-#cmakedefine KOKKOS_ARCH_AVX512MIC 1
-#cmakedefine KOKKOS_ARCH_AVX512XEON 1
-#cmakedefine KOKKOS_ARCH_KNC 1
-#cmakedefine KOKKOS_ARCH_POWER8 1
-#cmakedefine KOKKOS_ARCH_POWER9 1
-#cmakedefine KOKKOS_ARCH_KEPLER 1
-#cmakedefine KOKKOS_ARCH_KEPLER30 1
-#cmakedefine KOKKOS_ARCH_KEPLER32 1
-#cmakedefine KOKKOS_ARCH_KEPLER35 1
-#cmakedefine KOKKOS_ARCH_KEPLER37 1
-#cmakedefine KOKKOS_ARCH_MAXWELL 1
-#cmakedefine KOKKOS_ARCH_MAXWELL50 1
-#cmakedefine KOKKOS_ARCH_MAXWELL52 1
-#cmakedefine KOKKOS_ARCH_MAXWELL53 1
-#cmakedefine KOKKOS_ARCH_PASCAL 1
-#cmakedefine KOKKOS_ARCH_PASCAL60 1
-#cmakedefine KOKKOS_ARCH_PASCAL61 1
-#cmakedefine KOKKOS_ARCH_VOLTA70 1
-
-// TODO: These are currently not used in Kokkos.  Should they be removed?
-#cmakedefine KOKKOS_ENABLE_MPI
-#cmakedefine KOKKOS_ENABLE_CUSPARSE
-
-// TODO: No longer options in Kokkos.  Need to be removed.
-#cmakedefine KOKKOS_USING_DEPRECATED_VIEW
-
-#endif  // !defined(KOKKOS_FOR_SIERRA)
diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt
index 9ff4b6006d..a7c57a9434 100644
--- a/lib/kokkos/core/perf_test/CMakeLists.txt
+++ b/lib/kokkos/core/perf_test/CMakeLists.txt
@@ -10,9 +10,7 @@
 #INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
 
 # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests.
-IF (KOKKOS_ENABLE_OPENMPTARGET
-    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI
-         OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+IF (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   RETURN()
 ENDIF()
 
diff --git a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
index dee21fd7a5..b534c32c52 100644
--- a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
@@ -231,7 +231,7 @@ void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials,
 
     std::cout << label_gramschmidt << " , " << parallel_work_length << " , "
               << min_seconds << " , " << (min_seconds / parallel_work_length)
-              << std::endl;
+              << ", " << avg_seconds << std::endl;
   }
 }
 
diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp
index c431c2b0c8..24c1898e0a 100644
--- a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp
@@ -280,7 +280,7 @@ void run_test_hexgrad(int exp_beg, int exp_end, int num_trials,
 
     std::cout << label_hexgrad << " , " << parallel_work_length << " , "
               << min_seconds << " , " << (min_seconds / parallel_work_length)
-              << std::endl;
+              << avg_seconds << std::endl;
   }
 }
 
diff --git a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
index 50bbc78a6b..5b7c2a7a03 100644
--- a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
+++ b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
@@ -205,7 +205,7 @@ TEST(default_exec, overlap_range_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -238,7 +238,7 @@ TEST(default_exec, overlap_range_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
 
   timer.reset();
@@ -280,7 +280,7 @@ TEST(default_exec, overlap_range_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce, time_overlapped_reduce);
@@ -378,7 +378,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -413,7 +413,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
 
   timer.reset();
@@ -459,7 +459,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce, time_overlapped_reduce);
@@ -548,7 +548,7 @@ TEST(default_exec, overlap_team_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -581,7 +581,7 @@ TEST(default_exec, overlap_team_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
   timer.reset();
   Kokkos::parallel_reduce(
@@ -622,7 +622,7 @@ TEST(default_exec, overlap_team_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce, time_overlapped_reduce);
diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
index 550316bec9..555a05ea27 100644
--- a/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
+++ b/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
@@ -120,7 +120,8 @@ void run_allocateview_tests(int N, int R) {
   {
     Kokkos::Timer timer;
     for (int r = 0; r < R; r++) {
-      double* a_ptr = (double*)Kokkos::kokkos_malloc("A", sizeof(double) * N8);
+      double* a_ptr =
+          static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * N8));
       Kokkos::parallel_for(
           N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; });
       Kokkos::fence();
diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
index afeeb64356..b0562f2fd1 100644
--- a/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
+++ b/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
@@ -47,10 +47,18 @@
 namespace Test {
 
 TEST(default_exec, ViewResize_Rank8) {
+// FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI
+#ifdef KOKKOS_ENABLE_SYCL
+  printf("Resize View Performance for LayoutLeft:\n");
+  run_resizeview_tests8<Kokkos::LayoutLeft>(9, 1);
+  printf("Resize View Performance for LayoutRight:\n");
+  run_resizeview_tests8<Kokkos::LayoutRight>(9, 1);
+#else
   printf("Resize View Performance for LayoutLeft:\n");
   run_resizeview_tests8<Kokkos::LayoutLeft>(10, 1);
   printf("Resize View Performance for LayoutRight:\n");
   run_resizeview_tests8<Kokkos::LayoutRight>(10, 1);
+#endif
 }
 
 }  // namespace Test
diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp
index 59820f3bdd..54824e5b39 100644
--- a/lib/kokkos/core/perf_test/test_atomic.cpp
+++ b/lib/kokkos/core/perf_test/test_atomic.cpp
@@ -47,7 +47,7 @@
 #include <cstdlib>
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using exec_space = Kokkos::DefaultExecutionSpace;
 
@@ -401,7 +401,7 @@ template <class T>
 void Loop(int loop, int test, const char* type_name) {
   LoopVariant<T>(loop, test);
 
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   T res       = LoopVariant<T>(loop, test);
   double time = timer.seconds();
 
diff --git a/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
index eec1c8eacc..4086ef5816 100644
--- a/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
+++ b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
@@ -12,13 +12,13 @@
 #include <typeinfo>
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using exec_space = Kokkos::DefaultExecutionSpace;
 
 template <typename T>
 void test(const int length) {
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 
   using vector = Kokkos::View<T*, exec_space>;
 
diff --git a/lib/kokkos/core/perf_test/test_mempool.cpp b/lib/kokkos/core/perf_test/test_mempool.cpp
index 9aab119774..7887d4ba55 100644
--- a/lib/kokkos/core/perf_test/test_mempool.cpp
+++ b/lib/kokkos/core/perf_test/test_mempool.cpp
@@ -48,7 +48,7 @@
 #include <limits>
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using ExecSpace   = Kokkos::DefaultExecutionSpace;
 using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
@@ -100,7 +100,7 @@ struct TestFunctor {
 
       const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-      ptrs(j) = (uintptr_t)pool.allocate(size_alloc);
+      ptrs(j) = reinterpret_cast<uintptr_t>(pool.allocate(size_alloc));
 
       if (ptrs(j)) ++update;
     }
@@ -129,7 +129,7 @@ struct TestFunctor {
 
       const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-      pool.deallocate((void*)ptrs(j), size_alloc);
+      pool.deallocate(reinterpret_cast<void*>(ptrs(j)), size_alloc);
     }
   }
 
@@ -153,9 +153,9 @@ struct TestFunctor {
         for (unsigned k = 0; k < repeat_inner; ++k) {
           const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-          pool.deallocate((void*)ptrs(j), size_alloc);
+          pool.deallocate(reinterpret_cast<void*>(ptrs(j)), size_alloc);
 
-          ptrs(j) = (uintptr_t)pool.allocate(size_alloc);
+          ptrs(j) = reinterpret_cast<uintptr_t>(pool.allocate(size_alloc));
 
           if (0 == ptrs(j)) update++;
         }
@@ -266,7 +266,7 @@ int main(int argc, char* argv[]) {
     TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
                         fill_stride, chunk_span, repeat_inner);
 
-    Kokkos::Impl::Timer timer;
+    Kokkos::Timer timer;
 
     if (!functor.test_fill()) {
       Kokkos::abort("fill ");
diff --git a/lib/kokkos/core/perf_test/test_taskdag.cpp b/lib/kokkos/core/perf_test/test_taskdag.cpp
index b2f936a955..49957ae932 100644
--- a/lib/kokkos/core/perf_test/test_taskdag.cpp
+++ b/lib/kokkos/core/perf_test/test_taskdag.cpp
@@ -56,7 +56,7 @@ int main() { return 0; }
 #include <cstdlib>
 #include <limits>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using ExecSpace = Kokkos::DefaultExecutionSpace;
 
@@ -220,7 +220,7 @@ int main(int argc, char* argv[]) {
     double time_sum = 0;
 
     for (int i = 0; i < test_repeat_outer; ++i) {
-      Kokkos::Impl::Timer timer;
+      Kokkos::Timer timer;
 
       Functor::FutureType ftmp =
           Kokkos::host_spawn(Kokkos::TaskSingle(sched), Functor(fib_input));
diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt
index 2ab0989805..499736c60d 100644
--- a/lib/kokkos/core/src/CMakeLists.txt
+++ b/lib/kokkos/core/src/CMakeLists.txt
@@ -9,6 +9,8 @@ INSTALL (DIRECTORY
   "${CMAKE_CURRENT_SOURCE_DIR}/"
   DESTINATION ${KOKKOS_HEADER_DIR}
   FILES_MATCHING
+  PATTERN "*.inc"
+  PATTERN "*.inc_*"
   PATTERN "*.hpp"
   PATTERN "*.h"
 )
@@ -65,6 +67,15 @@ IF (KOKKOS_ENABLE_SYCL)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp)
 ENDIF()
 
+IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS)
+  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/desul/src/*.cpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.inc)
+ENDIF()
+
+
 KOKKOS_ADD_LIBRARY(
   kokkoscore
   SOURCES ${KOKKOS_CORE_SRCS}
@@ -86,3 +97,15 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM)
+
+# FIXME: We need a proper solution to figure out whether to enable
+#        libatomic
+# XL requires libatomic even for 64 bit CAS, most others only for 128
+# I (CT) had removed 128bit CAS from desul to not need libatomic.
+IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS AND
+    (KOKKOS_ENABLE_OPENMPTARGET OR (CMAKE_CXX_COMPILER_ID STREQUAL XLClang)))
+  target_link_libraries(kokkoscore PUBLIC atomic)
+ENDIF()
+
+
+KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH)
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 916f109758..f6b2762403 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -90,43 +90,25 @@ static std::atomic<int> num_uvm_allocations(0);
 
 }  // namespace
 
-DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
-                                               size_t n) {
-  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
+void DeepCopyCuda(void *dst, const void *src, size_t n) {
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
 }
 
-DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
-                                               size_t n) {
-  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
-}
-
-DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(void *dst, const void *src,
-                                               size_t n) {
-  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
-}
-
-DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
-                                               const void *src, size_t n) {
-  CUDA_SAFE_CALL(
-      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
-}
-
-DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
-                                               const void *src, size_t n) {
-  CUDA_SAFE_CALL(
-      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
-}
-
-DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
-                                               const void *src, size_t n) {
-  CUDA_SAFE_CALL(
+void DeepCopyAsyncCuda(const Cuda &instance, void *dst, const void *src,
+                       size_t n) {
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
       cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
 }
 
 void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
   cudaStream_t s = cuda_get_deep_copy_stream();
-  CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
-  cudaStreamSynchronize(s);
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
+      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
+  Impl::cuda_stream_synchronize(
+      s,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          DeepCopyResourceSynchronization,
+      "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync");
 }
 
 }  // namespace Impl
@@ -137,6 +119,7 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
 
 namespace Kokkos {
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 KOKKOS_DEPRECATED void CudaSpace::access_error() {
   const std::string msg(
       "Kokkos::CudaSpace::access_error attempt to execute Cuda function from "
@@ -150,6 +133,7 @@ KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) {
       "non-Cuda space");
   Kokkos::Impl::throw_runtime_exception(msg);
 }
+#endif
 
 /*--------------------------------------------------------------------------*/
 
@@ -164,9 +148,11 @@ bool CudaUVMSpace::available() {
 
 /*--------------------------------------------------------------------------*/
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 int CudaUVMSpace::number_of_allocations() {
   return Kokkos::Impl::num_uvm_allocations.load();
 }
+#endif
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
 // The purpose of the following variable is to allow a state-based choice
 // for pinning UVM allocations to the CPU. For now this is considered
@@ -204,6 +190,8 @@ CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {}
 
 CudaHostPinnedSpace::CudaHostPinnedSpace() {}
 
+int memory_threshold_g = 40000;  // 40 kB
+
 //==============================================================================
 // <editor-fold desc="allocate()"> {{{1
 
@@ -221,7 +209,19 @@ void *CudaSpace::impl_allocate(
     const Kokkos::Tools::SpaceHandle arg_handle) const {
   void *ptr = nullptr;
 
+#ifndef CUDART_VERSION
+#error CUDART_VERSION undefined!
+#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
+  cudaError_t error_code;
+  if (arg_alloc_size >= memory_threshold_g) {
+    error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0);
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  } else {
+    error_code = cudaMalloc(&ptr, arg_alloc_size);
+  }
+#else
   auto error_code = cudaMalloc(&ptr, arg_alloc_size);
+#endif
   if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
     cudaGetLastError();  // This is the only way to clear the last error, which
                          // we should do here since we're turning it into an
@@ -253,7 +253,8 @@ void *CudaUVMSpace::impl_allocate(
     const Kokkos::Tools::SpaceHandle arg_handle) const {
   void *ptr = nullptr;
 
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_allocate: Pre UVM Allocation");
   if (arg_alloc_size > 0) {
     Kokkos::Impl::num_uvm_allocations++;
 
@@ -276,7 +277,8 @@ void *CudaUVMSpace::impl_allocate(
               CudaMallocManaged);
     }
   }
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation");
   if (Kokkos::Profiling::profileLibraryLoaded()) {
     const size_t reported_size =
         (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
@@ -337,9 +339,20 @@ void CudaSpace::impl_deallocate(
     Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                       reported_size);
   }
-
   try {
-    CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+#ifndef CUDART_VERSION
+#error CUDART_VERSION undefined!
+#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
+    if (arg_alloc_size >= memory_threshold_g) {
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    } else {
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+    }
+#else
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+#endif
   } catch (...) {
   }
 }
@@ -362,7 +375,8 @@ void CudaUVMSpace::impl_deallocate(
     ,
     const size_t arg_logical_size,
     const Kokkos::Tools::SpaceHandle arg_handle) const {
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_deallocate: Pre UVM Deallocation");
   if (Kokkos::Profiling::profileLibraryLoaded()) {
     const size_t reported_size =
         (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
@@ -372,11 +386,12 @@ void CudaUVMSpace::impl_deallocate(
   try {
     if (arg_alloc_ptr != nullptr) {
       Kokkos::Impl::num_uvm_allocations--;
-      CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
     }
   } catch (...) {
   }
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_deallocate: Post UVM Deallocation");
 }
 
 void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr,
@@ -401,7 +416,7 @@ void CudaHostPinnedSpace::impl_deallocate(
                                       reported_size);
   }
   try {
-    CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
   } catch (...) {
   }
 }
@@ -462,7 +477,7 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object(
   resDesc.res.linear.sizeInBytes = alloc_size;
   resDesc.res.linear.devPtr      = alloc_ptr;
 
-  CUDA_SAFE_CALL(
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
       cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr));
 
   return tex_obj;
@@ -581,7 +596,7 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
                            bool to_device) {
   if ((ptr == nullptr) || (bytes == 0)) return;
   cudaPointerAttributes attr;
-  CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
   // I measured this and it turns out prefetching towards the host slows
   // DualView syncs down. Probably because the latency is not too bad in the
   // first place for the pull down. If we want to change that provde
@@ -593,8 +608,8 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
 #endif
   if (to_device && is_managed &&
       space.cuda_device_prop().concurrentManagedAccess) {
-    CUDA_SAFE_CALL(cudaMemPrefetchAsync(ptr, bytes, space.cuda_device(),
-                                        space.cuda_stream()));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemPrefetchAsync(
+        ptr, bytes, space.cuda_device(), space.cuda_stream()));
   }
 }
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 0f4259072d..993c8d1bba 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -134,7 +134,12 @@ inline int cuda_deduce_block_size(bool early_termination,
     }
 
     if (blocks_per_sm >= min_blocks_per_sm) {
-      if (threads_per_sm >= opt_threads_per_sm) {
+      // The logic prefers smaller block sizes over larger ones to
+      // give more flexibility to the scheduler.
+      // But don't go below 128 where performance suffers significantly
+      // for simple copy/set kernels.
+      if ((threads_per_sm > opt_threads_per_sm) ||
+          ((block_size >= 128) && (threads_per_sm == opt_threads_per_sm))) {
         opt_block_size     = block_size;
         opt_threads_per_sm = threads_per_sm;
       }
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
index 4759001d81..36df0d2564 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
@@ -49,13 +49,19 @@
 #ifdef KOKKOS_ENABLE_CUDA
 
 #include <impl/Kokkos_Error.hpp>
-
+#include <impl/Kokkos_Profiling.hpp>
 #include <iosfwd>
 
 namespace Kokkos {
 namespace Impl {
 
-void cuda_device_synchronize();
+void cuda_stream_synchronize(
+    const cudaStream_t stream,
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
+    const std::string& name);
+void cuda_device_synchronize(const std::string& name);
+void cuda_stream_synchronize(const cudaStream_t stream,
+                             const std::string& name);
 
 void cuda_internal_error_throw(cudaError e, const char* name,
                                const char* file = nullptr, const int line = 0);
@@ -68,9 +74,24 @@ inline void cuda_internal_safe_call(cudaError e, const char* name,
   }
 }
 
-#define CUDA_SAFE_CALL(call) \
+#define KOKKOS_IMPL_CUDA_SAFE_CALL(call) \
   Kokkos::Impl::cuda_internal_safe_call(call, #call, __FILE__, __LINE__)
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+
+KOKKOS_DEPRECATED
+inline void cuda_internal_safe_call_deprecated(cudaError e, const char* name,
+                                               const char* file = nullptr,
+                                               const int line   = 0) {
+  cuda_internal_safe_call(e, name, file, line);
+}
+
+#define CUDA_SAFE_CALL(call)                                              \
+  Kokkos::Impl::cuda_internal_safe_call_deprecated(call, #call, __FILE__, \
+                                                   __LINE__)
+
+#endif
+
 }  // namespace Impl
 
 namespace Experimental {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
index 3de7a69916..bd514f5e88 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
@@ -60,6 +60,7 @@
 
 #include <Kokkos_Cuda.hpp>
 #include <cuda_runtime_api.h>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
 
 namespace Kokkos {
 namespace Impl {
@@ -82,8 +83,8 @@ struct GraphImpl<Kokkos::Cuda> {
     constexpr size_t error_log_size = 256;
     cudaGraphNode_t error_node      = nullptr;
     char error_log[error_log_size];
-    CUDA_SAFE_CALL(cudaGraphInstantiate(&m_graph_exec, m_graph, &error_node,
-                                        error_log, error_log_size));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphInstantiate(
+        &m_graph_exec, m_graph, &error_node, error_log, error_log_size));
     // TODO @graphs print out errors
   }
 
@@ -107,26 +108,27 @@ struct GraphImpl<Kokkos::Cuda> {
     // TODO @graphs we need to somehow indicate the need for a fence in the
     //              destructor of the GraphImpl object (so that we don't have to
     //              just always do it)
-    m_execution_space.fence();
+    m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
     KOKKOS_EXPECTS(bool(m_graph))
     if (bool(m_graph_exec)) {
-      CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
     }
-    CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
   };
 
   explicit GraphImpl(Kokkos::Cuda arg_instance)
       : m_execution_space(std::move(arg_instance)) {
-    CUDA_SAFE_CALL(cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
   }
 
   void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
     // All of the predecessors are just added as normal, so all we need to
     // do here is add an empty node
-    CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node),
-                                         m_graph,
-                                         /* dependencies = */ nullptr,
-                                         /* numDependencies = */ 0));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph,
+                              /* dependencies = */ nullptr,
+                              /* numDependencies = */ 0));
   }
 
   template <class NodeImpl>
@@ -171,7 +173,7 @@ struct GraphImpl<Kokkos::Cuda> {
     auto /*const*/& cuda_node = arg_node_ptr->node_details_t::node;
     KOKKOS_EXPECTS(bool(cuda_node))
 
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1));
   }
 
@@ -179,7 +181,7 @@ struct GraphImpl<Kokkos::Cuda> {
     if (!bool(m_graph_exec)) {
       _instantiate_graph();
     }
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream()));
   }
 
@@ -192,9 +194,10 @@ struct GraphImpl<Kokkos::Cuda> {
     KOKKOS_EXPECTS(!bool(m_graph_exec))
     auto rv = std::make_shared<root_node_impl_t>(
         get_execution_space(), _graph_node_is_root_ctor_tag{});
-    CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
-                                         /* dependencies = */ nullptr,
-                                         /* numDependencies = */ 0));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
+                              /* dependencies = */ nullptr,
+                              /* numDependencies = */ 0));
     KOKKOS_ENSURES(bool(rv->node_details_t::node))
     return rv;
   }
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
index ec9c434fe6..c81286eb10 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
@@ -51,6 +51,9 @@
     !(defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL50) ||  \
       defined(KOKKOS_ARCH_MAXWELL52))
 #include <cuda_fp16.h>
+#include <iosfwd>  // istream & ostream for extraction and insertion ops
+#include <string>
+#include <Kokkos_NumericTraits.hpp>  // reduction_identity
 
 #ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED
 // Make sure no one else tries to define half_t
@@ -127,7 +130,7 @@ KOKKOS_INLINE_FUNCTION
     std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
         cast_from_half(half_t);
 
-class half_t {
+class alignas(2) half_t {
  public:
   using impl_type = Kokkos::Impl::half_impl_t::type;
 
@@ -138,6 +141,22 @@ class half_t {
   KOKKOS_FUNCTION
   half_t() : val(0.0F) {}
 
+  // Copy constructors
+  KOKKOS_DEFAULTED_FUNCTION
+  half_t(const half_t&) noexcept = default;
+
+  KOKKOS_INLINE_FUNCTION
+  half_t(const volatile half_t& rhs) {
+#ifdef __CUDA_ARCH__
+    val = rhs.val;
+#else
+    const volatile uint16_t* rv_ptr =
+        reinterpret_cast<const volatile uint16_t*>(&rhs.val);
+    const uint16_t rv_val = *rv_ptr;
+    val                   = reinterpret_cast<const impl_type&>(rv_val);
+#endif  // __CUDA_ARCH__
+  }
+
   // Don't support implicit conversion back to impl_type.
   // impl_type is a storage only type on host.
   KOKKOS_FUNCTION
@@ -219,7 +238,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     tmp.val = +tmp.val;
 #else
-    tmp.val   = __float2half(+__half2float(tmp.val));
+    tmp.val               = __float2half(+__half2float(tmp.val));
 #endif
     return tmp;
   }
@@ -230,7 +249,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     tmp.val = -tmp.val;
 #else
-    tmp.val   = __float2half(-__half2float(tmp.val));
+    tmp.val               = __float2half(-__half2float(tmp.val));
 #endif
     return tmp;
   }
@@ -241,7 +260,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     ++val;
 #else
-    float tmp = __half2float(val);
+    float tmp             = __half2float(val);
     ++tmp;
     val       = __float2half(tmp);
 #endif
@@ -255,7 +274,7 @@ class half_t {
 #else
     float tmp = __half2float(val);
     --tmp;
-    val = __float2half(tmp);
+    val     = __float2half(tmp);
 #endif
     return *this;
   }
@@ -290,7 +309,10 @@ class half_t {
 
   template <class T>
   KOKKOS_FUNCTION void operator=(T rhs) volatile {
-    val = cast_to_half(rhs).val;
+    impl_type new_val = cast_to_half(rhs).val;
+    volatile uint16_t* val_ptr =
+        reinterpret_cast<volatile uint16_t*>(const_cast<impl_type*>(&val));
+    *val_ptr = reinterpret_cast<uint16_t&>(new_val);
   }
 
   // Compound operators
@@ -299,30 +321,21 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val += rhs.val;
 #else
-    val = __float2half(__half2float(val) + __half2float(rhs.val));
+    val     = __float2half(__half2float(val) + __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator+=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) + rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) +
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator+=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs += tmp_rhs;
+    *this = tmp_lhs;
   }
 
-  // Compund operators: upcast overloads for +=
+  // Compound operators: upcast overloads for +=
   template <class T>
   KOKKOS_FUNCTION std::enable_if_t<
       std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
@@ -350,27 +363,18 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val -= rhs.val;
 #else
-    val          = __float2half(__half2float(val) - __half2float(rhs.val));
+    val     = __float2half(__half2float(val) - __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator-=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) - rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) -
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator-=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs -= tmp_rhs;
+    *this = tmp_lhs;
   }
 
   // Compund operators: upcast overloads for -=
@@ -401,27 +405,18 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val *= rhs.val;
 #else
-    val          = __float2half(__half2float(val) * __half2float(rhs.val));
+    val     = __float2half(__half2float(val) * __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator*=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) * rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) *
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator*=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs *= tmp_rhs;
+    *this = tmp_lhs;
   }
 
   // Compund operators: upcast overloads for *=
@@ -452,27 +447,18 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val /= rhs.val;
 #else
-    val          = __float2half(__half2float(val) / __half2float(rhs.val));
+    val     = __float2half(__half2float(val) / __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator/=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) / rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) /
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator/=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs /= tmp_rhs;
+    *this = tmp_lhs;
   }
 
   // Compund operators: upcast overloads for /=
@@ -504,7 +490,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val += rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) + __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -529,7 +515,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val -= rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) - __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -554,7 +540,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val *= rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) * __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -579,7 +565,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val /= rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) / __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -683,6 +669,62 @@ class half_t {
     return __half2float(val) >= __half2float(rhs.val);
 #endif
   }
+
+  KOKKOS_FUNCTION
+  friend bool operator==(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs == tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator!=(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs != tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator<(const volatile half_t& lhs,
+                        const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs < tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator>(const volatile half_t& lhs,
+                        const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs > tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator<=(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs <= tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator>=(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs >= tmp_rhs;
+  }
+
+  // Insertion and extraction operators
+  friend std::ostream& operator<<(std::ostream& os, const half_t& x) {
+    const std::string out = std::to_string(static_cast<double>(x));
+    os << out;
+    return os;
+  }
+
+  friend std::istream& operator>>(std::istream& is, half_t& x) {
+    std::string in;
+    is >> in;
+    x = std::stod(in);
+    return is;
+  }
 };
 
 // CUDA before 11.1 only has the half <-> float conversions marked host device
@@ -943,6 +985,25 @@ KOKKOS_INLINE_FUNCTION
 }
 #endif
 }  // namespace Experimental
+
+// use float as the return type for sum and prod since cuda_fp16.h
+// has no constexpr functions for casting to __half
+template <>
+struct reduction_identity<Kokkos::Experimental::half_t> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() noexcept {
+    return 0.0F;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() noexcept {
+    return 1.0F;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() noexcept {
+    return -65504.0F;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() noexcept {
+    return 65504.0F;
+  }
+};
+
 }  // namespace Kokkos
 #endif  // KOKKOS_IMPL_HALF_TYPE_DEFINED
 #endif  // KOKKOS_ENABLE_CUDA
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 016cb6cdcb..6964d5b41b 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -119,7 +119,7 @@ int cuda_kernel_arch() {
   int arch    = 0;
   int *d_arch = nullptr;
 
-  cudaMalloc((void **)&d_arch, sizeof(int));
+  cudaMalloc(reinterpret_cast<void **>(&d_arch), sizeof(int));
   cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault);
 
   query_cuda_kernel_arch<<<1, 1>>>(d_arch);
@@ -141,7 +141,36 @@ bool cuda_launch_blocking() {
 
 }  // namespace
 
-void cuda_device_synchronize() { CUDA_SAFE_CALL(cudaDeviceSynchronize()); }
+void cuda_device_synchronize(const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      []() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+      });
+}
+
+void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr,
+                             const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
+      name,
+      Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+          ptr->impl_get_instance_id()},
+      [&]() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      });
+}
+
+void cuda_stream_synchronize(
+    const cudaStream_t stream,
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
+    const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
+      name, reason, [&]() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      });
+}
 
 void cuda_internal_error_throw(cudaError e, const char *name, const char *file,
                                const int line) {
@@ -221,7 +250,7 @@ CudaInternalDevices::CudaInternalDevices() {
   // See 'cudaSetDeviceFlags' for host-device thread interaction
   // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
 
-  CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));
 
   if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
     Kokkos::abort(
@@ -229,7 +258,7 @@ CudaInternalDevices::CudaInternalDevices() {
         "have. Please report this to github.com/kokkos/kokkos.");
   }
   for (int i = 0; i < m_cudaDevCount; ++i) {
-    CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i));
   }
 }
 
@@ -277,25 +306,27 @@ CudaInternal::~CudaInternal() {
               << std::endl;
   }
 
-  m_cudaDev                   = -1;
-  m_cudaArch                  = -1;
-  m_multiProcCount            = 0;
-  m_maxWarpCount              = 0;
-  m_maxBlock                  = 0;
-  m_maxSharedWords            = 0;
-  m_maxConcurrency            = 0;
-  m_scratchSpaceCount         = 0;
-  m_scratchFlagsCount         = 0;
-  m_scratchUnifiedCount       = 0;
-  m_scratchUnifiedSupported   = 0;
-  m_streamCount               = 0;
-  m_scratchSpace              = nullptr;
-  m_scratchFlags              = nullptr;
-  m_scratchUnified            = nullptr;
-  m_scratchConcurrentBitset   = nullptr;
-  m_stream                    = nullptr;
-  m_team_scratch_current_size = 0;
-  m_team_scratch_ptr          = nullptr;
+  m_cudaDev                 = -1;
+  m_cudaArch                = -1;
+  m_multiProcCount          = 0;
+  m_maxWarpCount            = 0;
+  m_maxBlock                = 0;
+  m_maxSharedWords          = 0;
+  m_maxConcurrency          = 0;
+  m_scratchSpaceCount       = 0;
+  m_scratchFlagsCount       = 0;
+  m_scratchUnifiedCount     = 0;
+  m_scratchUnifiedSupported = 0;
+  m_streamCount             = 0;
+  m_scratchSpace            = nullptr;
+  m_scratchFlags            = nullptr;
+  m_scratchUnified          = nullptr;
+  m_scratchConcurrentBitset = nullptr;
+  m_stream                  = nullptr;
+  for (int i = 0; i < m_n_team_scratch; ++i) {
+    m_team_scratch_current_size[i] = 0;
+    m_team_scratch_ptr[i]          = nullptr;
+  }
 }
 
 int CudaInternal::verify_is_initialized(const char *const label) const {
@@ -305,16 +336,20 @@ int CudaInternal::verify_is_initialized(const char *const label) const {
   }
   return 0 <= m_cudaDev;
 }
-
+uint32_t CudaInternal::impl_get_instance_id() const { return m_instance_id; }
 CudaInternal &CudaInternal::singleton() {
   static CudaInternal self;
   return self;
 }
+void CudaInternal::fence(const std::string &name) const {
+  Impl::cuda_stream_synchronize(m_stream, this, name);
+}
 void CudaInternal::fence() const {
-  CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream));
+  fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence");
 }
 
-void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
+void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream,
+                              bool manage_stream) {
   if (was_finalized)
     Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
   was_initialized = true;
@@ -350,8 +385,9 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
     m_cudaDev    = cuda_device_id;
     m_deviceProp = cudaProp;
 
-    CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
-    Kokkos::Impl::cuda_device_synchronize();
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
+    Kokkos::Impl::cuda_device_synchronize(
+        "Kokkos::CudaInternal::initialize: Fence on space initialization");
 
     // Query what compute capability architecture a kernel executes:
     m_cudaArch = cuda_kernel_arch();
@@ -464,8 +500,8 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
 
       m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data());
 
-      CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0,
-                                sizeof(uint32_t) * buffer_bound));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0,
+                                            sizeof(uint32_t) * buffer_bound));
     }
     //----------------------------------
 
@@ -535,15 +571,19 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
   // Allocate a staging buffer for constant mem in pinned host memory
   // and an event to avoid overwriting driver for previous kernel launches
   if (stream == nullptr) {
-    CUDA_SAFE_CALL(cudaMallocHost((void **)&constantMemHostStaging,
-                                  CudaTraits::ConstantMemoryUsage));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMallocHost(reinterpret_cast<void **>(&constantMemHostStaging),
+                       CudaTraits::ConstantMemoryUsage));
 
-    CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
   }
 
-  m_stream                    = stream;
-  m_team_scratch_current_size = 0;
-  m_team_scratch_ptr          = nullptr;
+  m_stream        = stream;
+  m_manage_stream = manage_stream;
+  for (int i = 0; i < m_n_team_scratch; ++i) {
+    m_team_scratch_current_size[i] = 0;
+    m_team_scratch_ptr[i]          = nullptr;
+  }
 }
 
 //----------------------------------------------------------------------------
@@ -569,7 +609,7 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const {
 
     m_scratchFlags = reinterpret_cast<size_type *>(r->data());
 
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
   }
 
@@ -645,20 +685,37 @@ Cuda::size_type *CudaInternal::scratch_functor(
   return m_scratchFunctor;
 }
 
-void *CudaInternal::resize_team_scratch_space(std::int64_t bytes,
-                                              bool force_shrink) {
-  if (m_team_scratch_current_size == 0) {
-    m_team_scratch_current_size = bytes;
-    m_team_scratch_ptr          = Kokkos::kokkos_malloc<Kokkos::CudaSpace>(
-        "Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size);
+std::pair<void *, int> CudaInternal::resize_team_scratch_space(
+    std::int64_t bytes, bool force_shrink) {
+  // Multiple ParallelFor/Reduce Teams can call this function at the same time
+  // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race
+  // condition.
+
+  int current_team_scratch = 0;
+  int zero                 = 0;
+  int one                  = 1;
+  while (m_team_scratch_pool[current_team_scratch].compare_exchange_weak(
+      zero, one, std::memory_order_release, std::memory_order_relaxed)) {
+    current_team_scratch = (current_team_scratch + 1) % m_n_team_scratch;
   }
-  if ((bytes > m_team_scratch_current_size) ||
-      ((bytes < m_team_scratch_current_size) && (force_shrink))) {
-    m_team_scratch_current_size = bytes;
-    m_team_scratch_ptr          = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(
-        m_team_scratch_ptr, m_team_scratch_current_size);
+  if (m_team_scratch_current_size[current_team_scratch] == 0) {
+    m_team_scratch_current_size[current_team_scratch] = bytes;
+    m_team_scratch_ptr[current_team_scratch] =
+        Kokkos::kokkos_malloc<Kokkos::CudaSpace>(
+            "Kokkos::CudaSpace::TeamScratchMemory",
+            m_team_scratch_current_size[current_team_scratch]);
   }
-  return m_team_scratch_ptr;
+  if ((bytes > m_team_scratch_current_size[current_team_scratch]) ||
+      ((bytes < m_team_scratch_current_size[current_team_scratch]) &&
+       (force_shrink))) {
+    m_team_scratch_current_size[current_team_scratch] = bytes;
+    m_team_scratch_ptr[current_team_scratch] =
+        Kokkos::kokkos_realloc<Kokkos::CudaSpace>(
+            m_team_scratch_ptr[current_team_scratch],
+            m_team_scratch_current_size[current_team_scratch]);
+  }
+  return std::make_pair(m_team_scratch_ptr[current_team_scratch],
+                        current_team_scratch);
 }
 
 //----------------------------------------------------------------------------
@@ -685,36 +742,43 @@ void CudaInternal::finalize() {
     if (m_scratchFunctorSize > 0)
       RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor));
 
-    if (m_team_scratch_current_size > 0)
-      Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr);
+    for (int i = 0; i < m_n_team_scratch; ++i) {
+      if (m_team_scratch_current_size[i] > 0)
+        Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr[i]);
+    }
 
-    m_cudaDev                   = -1;
-    m_multiProcCount            = 0;
-    m_maxWarpCount              = 0;
-    m_maxBlock                  = 0;
-    m_maxSharedWords            = 0;
-    m_scratchSpaceCount         = 0;
-    m_scratchFlagsCount         = 0;
-    m_scratchUnifiedCount       = 0;
-    m_streamCount               = 0;
-    m_scratchSpace              = nullptr;
-    m_scratchFlags              = nullptr;
-    m_scratchUnified            = nullptr;
-    m_scratchConcurrentBitset   = nullptr;
-    m_stream                    = nullptr;
-    m_team_scratch_current_size = 0;
-    m_team_scratch_ptr          = nullptr;
+    if (m_manage_stream && m_stream != nullptr)
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(m_stream));
+
+    m_cudaDev                 = -1;
+    m_multiProcCount          = 0;
+    m_maxWarpCount            = 0;
+    m_maxBlock                = 0;
+    m_maxSharedWords          = 0;
+    m_scratchSpaceCount       = 0;
+    m_scratchFlagsCount       = 0;
+    m_scratchUnifiedCount     = 0;
+    m_streamCount             = 0;
+    m_scratchSpace            = nullptr;
+    m_scratchFlags            = nullptr;
+    m_scratchUnified          = nullptr;
+    m_scratchConcurrentBitset = nullptr;
+    m_stream                  = nullptr;
+    for (int i = 0; i < m_n_team_scratch; ++i) {
+      m_team_scratch_current_size[i] = 0;
+      m_team_scratch_ptr[i]          = nullptr;
+    }
   }
 
   // only destroy these if we're finalizing the singleton
   if (this == &singleton()) {
-    cudaFreeHost(constantMemHostStaging);
-    cudaEventDestroy(constantMemReusable);
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(constantMemHostStaging));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(constantMemReusable));
     auto &deep_copy_space =
         Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false);
     if (deep_copy_space)
       deep_copy_space->impl_internal_space_instance()->finalize();
-    cudaStreamDestroy(cuda_get_deep_copy_stream());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(cuda_get_deep_copy_stream()));
   }
 }
 
@@ -823,7 +887,7 @@ Cuda::Cuda()
       "Cuda instance constructor");
 }
 
-Cuda::Cuda(cudaStream_t stream)
+Cuda::Cuda(cudaStream_t stream, bool manage_stream)
     : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) {
         ptr->finalize();
         delete ptr;
@@ -831,18 +895,31 @@ Cuda::Cuda(cudaStream_t stream)
   Impl::CudaInternal::singleton().verify_is_initialized(
       "Cuda instance constructor");
   m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev,
-                               stream);
+                               stream, manage_stream);
 }
 
 void Cuda::print_configuration(std::ostream &s, const bool) {
   Impl::CudaInternal::singleton().print_configuration(s);
 }
 
-void Cuda::impl_static_fence() { Kokkos::Impl::cuda_device_synchronize(); }
+void Cuda::impl_static_fence(const std::string &name) {
+  Kokkos::Impl::cuda_device_synchronize(name);
+}
+void Cuda::impl_static_fence() {
+  impl_static_fence("Kokkos::Cuda::impl_static_fence(): Unnamed Static Fence");
+}
 
-void Cuda::fence() const { m_space_instance->fence(); }
+void Cuda::fence() const {
+  fence("Kokkos::Cuda::fence(): Unnamed Instance Fence");
+}
+void Cuda::fence(const std::string &name) const {
+  m_space_instance->fence(name);
+}
 
 const char *Cuda::name() { return "Cuda"; }
+uint32_t Cuda::impl_instance_id() const noexcept {
+  return m_space_instance->impl_get_instance_id();
+}
 
 cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream; }
 int Cuda::cuda_device() const { return m_space_instance->m_cudaDev; }
@@ -877,7 +954,15 @@ void CudaSpaceInitializer::finalize(bool all_spaces) {
   }
 }
 
-void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); }
+void CudaSpaceInitializer::fence() {
+  Kokkos::Cuda::impl_static_fence(
+      "Kokkos::CudaSpaceInitializer::fence: Initializer Fence");
+}
+void CudaSpaceInitializer::fence(const std::string &name) {
+  // Kokkos::Cuda::impl_static_fence("Kokkos::CudaSpaceInitializer::fence:
+  // "+name); //TODO: or this
+  Kokkos::Cuda::impl_static_fence(name);
+}
 
 void CudaSpaceInitializer::print_configuration(std::ostream &msg,
                                                const bool detail) {
@@ -916,12 +1001,6 @@ void CudaSpaceInitializer::print_configuration(std::ostream &msg,
   msg << "yes\n";
 #else
   msg << "no\n";
-#endif
-  msg << "  KOKKOS_ENABLE_CUSPARSE: ";
-#ifdef KOKKOS_ENABLE_CUSPARSE
-  msg << "yes\n";
-#else
-  msg << "no\n";
 #endif
   msg << "  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index aaec2c2926..7eb169838c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -3,6 +3,9 @@
 
 #include <vector>
 #include <impl/Kokkos_Tools.hpp>
+#include <atomic>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // These functions fulfill the purpose of allowing to work around
@@ -114,10 +117,14 @@ class CudaInternal {
   mutable size_type* m_scratchFunctor;
   uint32_t* m_scratchConcurrentBitset;
   cudaStream_t m_stream;
+  uint32_t m_instance_id;
+  bool m_manage_stream;
 
   // Team Scratch Level 1 Space
-  mutable int64_t m_team_scratch_current_size;
-  mutable void* m_team_scratch_ptr;
+  int m_n_team_scratch = 10;
+  mutable int64_t m_team_scratch_current_size[10];
+  mutable void* m_team_scratch_ptr[10];
+  mutable std::atomic_int m_team_scratch_pool[10];
 
   bool was_initialized = false;
   bool was_finalized   = false;
@@ -135,7 +142,8 @@ class CudaInternal {
     return nullptr != m_scratchSpace && nullptr != m_scratchFlags;
   }
 
-  void initialize(int cuda_device_id, cudaStream_t stream = nullptr);
+  void initialize(int cuda_device_id, cudaStream_t stream = nullptr,
+                  bool manage_stream = false);
   void finalize();
 
   void print_configuration(std::ostream&) const;
@@ -145,6 +153,7 @@ class CudaInternal {
   static void cuda_set_serial_execution(bool);
 #endif
 
+  void fence(const std::string&) const;
   void fence() const;
 
   ~CudaInternal();
@@ -175,20 +184,68 @@ class CudaInternal {
         m_scratchFunctor(nullptr),
         m_scratchConcurrentBitset(nullptr),
         m_stream(nullptr),
-        m_team_scratch_current_size(0),
-        m_team_scratch_ptr(nullptr) {}
+        m_instance_id(
+            Kokkos::Tools::Experimental::Impl::idForInstance<Kokkos::Cuda>(
+                reinterpret_cast<uintptr_t>(this))) {
+    for (int i = 0; i < m_n_team_scratch; ++i) {
+      m_team_scratch_current_size[i] = 0;
+      m_team_scratch_ptr[i]          = nullptr;
+      m_team_scratch_pool[i]         = 0;
+    }
+  }
 
   // Resizing of reduction related scratch spaces
   size_type* scratch_space(const size_type size) const;
   size_type* scratch_flags(const size_type size) const;
   size_type* scratch_unified(const size_type size) const;
   size_type* scratch_functor(const size_type size) const;
-
+  uint32_t impl_get_instance_id() const;
   // Resizing of team level 1 scratch
-  void* resize_team_scratch_space(std::int64_t bytes,
-                                  bool force_shrink = false);
+  std::pair<void*, int> resize_team_scratch_space(std::int64_t bytes,
+                                                  bool force_shrink = false);
 };
 
 }  // Namespace Impl
+
+namespace Experimental {
+// Partitioning an Execution Space: expects space and integer arguments for
+// relative weight
+//   Customization point for backends
+//   Default behavior is to return the passed in instance
+
+namespace Impl {
+inline void create_Cuda_instances(std::vector<Cuda>& instances) {
+  for (int s = 0; s < int(instances.size()); s++) {
+    cudaStream_t stream;
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream));
+    instances[s] = Cuda(stream, true);
+  }
+}
+}  // namespace Impl
+
+template <class... Args>
+std::vector<Cuda> partition_space(const Cuda&, Args...) {
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+  std::vector<Cuda> instances(sizeof...(Args));
+  Impl::create_Cuda_instances(instances);
+  return instances;
+}
+
+template <class T>
+std::vector<Cuda> partition_space(const Cuda&, std::vector<T>& weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  std::vector<Cuda> instances(weights.size());
+  Impl::create_Cuda_instances(instances);
+  return instances;
+}
+}  // namespace Experimental
+
 }  // Namespace Kokkos
 #endif
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index d892a893b3..4b01798f5e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -167,7 +167,7 @@ inline void configure_shmem_preference(KernelFuncPtr const& func,
 #ifndef KOKKOS_ARCH_KEPLER
   // On Kepler the L1 has no benefit since it doesn't cache reads
   auto set_cache_config = [&] {
-    CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
         func,
         (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
     return prefer_shmem;
@@ -372,14 +372,15 @@ struct CudaParallelLaunchKernelInvoker<
       params.kernelParams   = (void**)args;
       params.extra          = nullptr;
 
-      CUDA_SAFE_CALL(cudaGraphAddKernelNode(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
           &graph_node, graph, /* dependencies = */ nullptr,
           /* numDependencies = */ 0, &params));
     } else {
       // We still need an empty node for the dependency structure
-      CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
-                                           /* dependencies = */ nullptr,
-                                           /* numDependencies = */ 0));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaGraphAddEmptyNode(&graph_node, graph,
+                                /* dependencies = */ nullptr,
+                                /* numDependencies = */ 0));
     }
     KOKKOS_ENSURES(bool(graph_node))
   }
@@ -475,14 +476,15 @@ struct CudaParallelLaunchKernelInvoker<
       params.kernelParams   = (void**)args;
       params.extra          = nullptr;
 
-      CUDA_SAFE_CALL(cudaGraphAddKernelNode(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
           &graph_node, graph, /* dependencies = */ nullptr,
           /* numDependencies = */ 0, &params));
     } else {
       // We still need an empty node for the dependency structure
-      CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
-                                           /* dependencies = */ nullptr,
-                                           /* numDependencies = */ 0));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaGraphAddEmptyNode(&graph_node, graph,
+                                /* dependencies = */ nullptr,
+                                /* numDependencies = */ 0));
     }
     KOKKOS_ENSURES(bool(graph_node))
   }
@@ -538,7 +540,8 @@ struct CudaParallelLaunchKernelInvoker<
                             dim3 const& block, int shmem,
                             CudaInternal const* cuda_instance) {
     // Wait until the previous kernel that uses the constant buffer is done
-    CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaEventSynchronize(cuda_instance->constantMemReusable));
 
     // Copy functor (synchronously) to staging buffer in pinned host memory
     unsigned long* staging = cuda_instance->constantMemHostStaging;
@@ -554,8 +557,9 @@ struct CudaParallelLaunchKernelInvoker<
          get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>();
 
     // Record an event that says when the constant buffer can be reused
-    CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
-                                   cudaStream_t(cuda_instance->m_stream)));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaEventRecord(cuda_instance->constantMemReusable,
+                        cudaStream_t(cuda_instance->m_stream)));
   }
 
 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
@@ -637,8 +641,9 @@ struct CudaParallelLaunchImpl<
       base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-      CUDA_SAFE_CALL(cudaGetLastError());
-      cuda_instance->fence();
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+      cuda_instance->fence(
+          "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error");
 #endif
     }
   }
@@ -650,7 +655,7 @@ struct CudaParallelLaunchImpl<
     // the code and the result is visible.
     auto wrap_get_attributes = []() -> cudaFuncAttributes {
       cudaFuncAttributes attr_tmp;
-      CUDA_SAFE_CALL(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
           cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
       return attr_tmp;
     };
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
index ff31649544..1f3024f318 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
@@ -81,22 +81,34 @@ namespace Impl {
 CudaLockArrays g_host_cuda_lock_arrays = {nullptr, nullptr, 0};
 
 void initialize_host_cuda_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::init_lock_arrays();
+
+  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+#endif
   if (g_host_cuda_lock_arrays.atomic != nullptr) return;
-  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
-                            sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1)));
-  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
-                            sizeof(int) * (Cuda::concurrency())));
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
+      cudaMalloc(&g_host_cuda_lock_arrays.atomic,
+                 sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1)));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
+                                        sizeof(int) * (Cuda::concurrency())));
+  Impl::cuda_device_synchronize(
+      "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
   g_host_cuda_lock_arrays.n = Cuda::concurrency();
   KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
   init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
                                   256>>>();
   init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency() + 255) / 256,
                                     256>>>(Kokkos::Cuda::concurrency());
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  Impl::cuda_device_synchronize(
+      "Kokkos::Impl::initialize_host_cuda_lock_arrays: Post Init Lock Arrays");
 }
 
 void finalize_host_cuda_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::finalize_lock_arrays();
+#endif
+
   if (g_host_cuda_lock_arrays.atomic == nullptr) return;
   cudaFree(g_host_cuda_lock_arrays.atomic);
   g_host_cuda_lock_arrays.atomic = nullptr;
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
index 7640b8084d..04fb7cb345 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
@@ -53,6 +53,10 @@
 
 #include <Cuda/Kokkos_Cuda_Error.hpp>
 
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics/Lock_Array_Cuda.hpp>
+#endif
+
 namespace Kokkos {
 namespace Impl {
 
@@ -150,13 +154,14 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 }  // namespace
 }  // namespace Impl
 }  // namespace Kokkos
+
 /* Dan Ibanez: it is critical that this code be a macro, so that it will
    capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
    putting this in an inline function will NOT do the right thing! */
 #define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                      \
   {                                                                   \
     if (::Kokkos::Impl::lock_array_copied == 0) {                     \
-      CUDA_SAFE_CALL(                                                 \
+      KOKKOS_IMPL_CUDA_SAFE_CALL(                                     \
           cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
                              &Kokkos::Impl::g_host_cuda_lock_arrays,  \
                              sizeof(Kokkos::Impl::CudaLockArrays)));  \
@@ -164,6 +169,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
     lock_array_copied = 1;                                            \
   }
 
+#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
 #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #else
@@ -171,6 +178,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
   KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
 #endif
 
+#else
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#else
+// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()         \
+  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#endif
+
+#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
+
 #endif /* defined( KOKKOS_ENABLE_CUDA ) */
 
 #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
index 2834e6f3de..f83b43e608 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -62,7 +62,6 @@
 #include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include <Cuda/Kokkos_Cuda_Team.hpp>
 #include <Kokkos_Vectorization.hpp>
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
 
 #include <impl/Kokkos_Tools.hpp>
 #include <typeinfo>
@@ -240,9 +239,11 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
 
   //----------------------------------------
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED inline int vector_length() const {
     return impl_vector_length();
   }
+#endif
   inline int impl_vector_length() const { return m_vector_length; }
   inline int team_size() const { return m_team_size; }
   inline int league_size() const { return m_league_size; }
@@ -687,6 +688,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   int m_shmem_size;
   void* m_scratch_ptr[2];
   int m_scratch_size[2];
+  int m_scratch_pool_id = -1;
 
   template <class TagType>
   __device__ inline
@@ -797,15 +799,19 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
     m_scratch_ptr[0] = nullptr;
-    m_scratch_ptr[1] =
-        m_team_size <= 0
-            ? nullptr
-            : m_policy.space()
-                  .impl_internal_space_instance()
-                  ->resize_team_scratch_space(
-                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
-                      static_cast<ptrdiff_t>(Cuda::concurrency() /
-                                             (m_team_size * m_vector_size)));
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (static_cast<std::int64_t>(Cuda::concurrency() /
+                                             (m_team_size * m_vector_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
 
     const int shmem_size_total = m_shmem_begin + m_shmem_size;
     if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
@@ -829,6 +835,14 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
           "Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
     }
   }
+
+  ~ParallelFor() {
+    if (m_scratch_pool_id >= 0) {
+      m_policy.space()
+          .impl_internal_space_instance()
+          ->m_team_scratch_pool[m_scratch_pool_id] = 0;
+    }
+  }
 };
 
 }  // namespace Impl
@@ -870,9 +884,24 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   using value_type     = typename ValueTraits::value_type;
   using reference_type = typename ValueTraits::reference_type;
   using functor_type   = FunctorType;
-  using size_type      = Kokkos::Cuda::size_type;
-  using index_type     = typename Policy::index_type;
-  using reducer_type   = ReducerType;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the reduction is performed.
+  // Within the reduction, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the reduction, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  using word_size_type = typename std::conditional<
+      sizeof(value_type) < sizeof(Kokkos::Cuda::size_type),
+      typename std::conditional<sizeof(value_type) == 2, int16_t, int8_t>::type,
+      Kokkos::Cuda::size_type>::type;
+  using index_type   = typename Policy::index_type;
+  using reducer_type = ReducerType;
 
   // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
   // blockDim.z == 1
@@ -883,9 +912,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
   const bool m_result_ptr_host_accessible;
-  size_type* m_scratch_space;
-  size_type* m_scratch_flags;
-  size_type* m_unified_space;
+  word_size_type* m_scratch_space;
+  // m_scratch_flags must be of type Cuda::size_type due to use of atomics
+  // for tracking metadata in Kokkos_Cuda_ReduceScan.hpp
+  Cuda::size_type* m_scratch_flags;
+  word_size_type* m_unified_space;
 
   // Shall we use the shfl based reduction or not (only use it for static sized
   // types of more than 128bit)
@@ -924,16 +955,16 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       __device__ inline
       void run(const DummySHMEMReductionType& ) const
       {*/
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                   sizeof(size_type)>
+    const integral_nonzero_constant<
+        word_size_type, ValueTraits::StaticValueSize / sizeof(word_size_type)>
         word_count(ValueTraits::value_size(
                        ReducerConditional::select(m_functor, m_reducer)) /
-                   sizeof(size_type));
+                   sizeof(word_size_type));
 
     {
       reference_type value =
           ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                          kokkos_impl_cuda_shared_memory<size_type>() +
+                          kokkos_impl_cuda_shared_memory<word_size_type>() +
                               threadIdx.y * word_count.value);
 
       // Number of blocks is bounded so that the reduction can be limited to two
@@ -958,11 +989,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       // This is the final block with the final result at the final threads'
       // location
 
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global =
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
           m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
               : (m_unified_space ? m_unified_space : m_scratch_space);
 
       if (threadIdx.y == 0) {
@@ -985,17 +1017,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd,
                                                 WorkTagFwd>(
                 ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
-                gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
+                gridDim.x, kokkos_impl_cuda_shared_memory<word_size_type>(),
                 m_scratch_space, m_scratch_flags)) {
           // This is the final block with the final result at the final threads'
           // location
 
-          size_type* const shared =
-              kokkos_impl_cuda_shared_memory<size_type>() +
+          word_size_type* const shared =
+              kokkos_impl_cuda_shared_memory<word_size_type>() +
               (blockDim.y - 1) * word_count.value;
-          size_type* const global =
+          word_size_type* const global =
               m_result_ptr_device_accessible
-                  ? reinterpret_cast<size_type*>(m_result_ptr)
+                  ? reinterpret_cast<word_size_type*>(m_result_ptr)
                   : (m_unified_space ? m_unified_space : m_scratch_space);
 
           if (threadIdx.y == 0) {
@@ -1100,15 +1132,21 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
       KOKKOS_ASSERT(block_size > 0);
 
-      m_scratch_space = cuda_internal_scratch_space(
+      // TODO: down casting these uses more space than required?
+      m_scratch_space = (word_size_type*)cuda_internal_scratch_space(
           m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
                                 m_functor, m_reducer)) *
                                 block_size /* block_size == max block_count */);
-      m_scratch_flags =
-          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
-      m_unified_space = cuda_internal_scratch_unified(
-          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)));
+
+      // Intentionally do not downcast to word_size_type since we use Cuda
+      // atomics in Kokkos_Cuda_ReduceScan.hpp
+      m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(),
+                                                    sizeof(Cuda::size_type));
+      m_unified_space =
+          reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
+              m_policy.space(),
+              ValueTraits::value_size(
+                  ReducerConditional::select(m_functor, m_reducer))));
 
       // REQUIRED ( 1 , N , 1 )
       dim3 block(1, block_size, 1);
@@ -1139,7 +1177,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<Cuda, RangePolicy>::execute: Result "
+            "Not Device Accessible");
 
         if (m_result_ptr) {
           if (m_unified_space) {
@@ -1459,7 +1499,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<Cuda, MDRangePolicy>::execute: "
+            "Result Not Device Accessible");
 
         if (m_result_ptr) {
           if (m_unified_space) {
@@ -1580,6 +1622,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   size_type m_shmem_size;
   void* m_scratch_ptr[2];
   int m_scratch_size[2];
+  int m_scratch_pool_id = -1;
   const size_type m_league_size;
   int m_team_size;
   const size_type m_vector_size;
@@ -1821,7 +1864,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
           true);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<Cuda, TeamPolicy>::execute: Result "
+            "Not Device Accessible");
 
         if (m_result_ptr) {
           if (m_unified_space) {
@@ -1895,16 +1940,19 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
     m_scratch_size[0] = m_shmem_size;
     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_ptr[1] =
-        m_team_size <= 0
-            ? nullptr
-            : m_policy.space()
-                  .impl_internal_space_instance()
-                  ->resize_team_scratch_space(
-                      static_cast<std::int64_t>(m_scratch_size[1]) *
-                      (static_cast<std::int64_t>(
-                          Cuda::concurrency() /
-                          (m_team_size * m_vector_size))));
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (static_cast<std::int64_t>(Cuda::concurrency() /
+                                             (m_team_size * m_vector_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
 
     // The global parallel_reduce does not support vector_length other than 1 at
     // the moment
@@ -1973,6 +2021,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     cudaFuncAttributes attr =
         CudaParallelLaunch<ParallelReduce,
                            LaunchBounds>::get_cuda_func_attributes();
+
+    // Valid team size not provided, deduce team size
     m_team_size =
         m_team_size >= 0
             ? m_team_size
@@ -1994,15 +2044,19 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
     m_scratch_size[0] = m_shmem_size;
     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_ptr[1] =
-        m_team_size <= 0
-            ? nullptr
-            : m_policy.space()
-                  .impl_internal_space_instance()
-                  ->resize_team_scratch_space(
-                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
-                      static_cast<ptrdiff_t>(Cuda::concurrency() /
-                                             (m_team_size * m_vector_size)));
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (static_cast<std::int64_t>(Cuda::concurrency() /
+                                             (m_team_size * m_vector_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
 
     // The global parallel_reduce does not support vector_length other than 1 at
     // the moment
@@ -2030,13 +2084,28 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
     }
-    if (int(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
+
+    size_type team_size_max =
+        Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+            m_policy.space().impl_internal_space_instance(), attr, m_functor,
+            m_vector_size, m_policy.team_scratch_size(0),
+            m_policy.thread_scratch_size(0)) /
+        m_vector_size;
+
+    if ((int)m_team_size > (int)team_size_max) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
                       "large team size."));
     }
   }
+
+  ~ParallelReduce() {
+    if (m_scratch_pool_id >= 0) {
+      m_policy.space()
+          .impl_internal_space_instance()
+          ->m_team_scratch_pool[m_scratch_pool_id] = 0;
+    }
+  }
 };
 
 }  // namespace Impl
@@ -2167,9 +2236,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 
     for (typename Policy::member_type iwork_base = range.begin();
          iwork_base < range.end(); iwork_base += blockDim.y) {
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-#endif
+      unsigned MASK                            = __activemask();
       const typename Policy::member_type iwork = iwork_base + threadIdx.y;
 
       __syncthreads();  // Don't overwrite previous iteration values until they
@@ -2182,11 +2249,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
         shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
-#else
-      KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+      __syncwarp(MASK);
       if (CudaTraits::WarpSize < word_count.value) {
         __syncthreads();
       }  // Protect against large scan values.
@@ -2457,9 +2520,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
     for (typename Policy::member_type iwork_base = range.begin();
          iwork_base < range.end(); iwork_base += blockDim.y) {
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-#endif
+      unsigned MASK = __activemask();
 
       const typename Policy::member_type iwork = iwork_base + threadIdx.y;
 
@@ -2474,11 +2535,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
         shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
       }
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
-#else
-      KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+      __syncwarp(MASK);
       if (CudaTraits::WarpSize < word_count.value) {
         __syncthreads();
       }  // Protect against large scan values.
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index fc9fc3770b..e5b05bcc64 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -191,48 +191,28 @@ __device__ bool cuda_inter_block_reduction(
         value_type tmp = Kokkos::shfl_down(value, 1, 32);
         if (id + 1 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-      int active        = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      unsigned int mask = __activemask();
+      int active        = __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2, 32);
         if (id + 2 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4, 32);
         if (id + 4 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8, 32);
         if (id + 8 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16, 32);
         if (id + 16 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
     }
   }
   // The last block has in its thread=0 the global reduction value through
@@ -388,48 +368,28 @@ __device__ inline
         value_type tmp = Kokkos::shfl_down(value, 1, 32);
         if (id + 1 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-      int active        = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      unsigned int mask = __activemask();
+      int active        = __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2, 32);
         if (id + 2 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4, 32);
         if (id + 4 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8, 32);
         if (id + 8 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16, 32);
         if (id + 16 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
     }
   }
 
@@ -573,23 +533,17 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
                                // part of the reduction
       const int width)         // How much of the warp participates
   {
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
     unsigned mask =
         width == 32
             ? 0xffffffff
             : ((1 << width) - 1)
                   << ((threadIdx.y * blockDim.x + threadIdx.x) / width) * width;
-#endif
     const int lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % 32;
     for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
       if (lane_id + delta < 32) {
         ValueJoin::join(functor, value, value + delta);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask);
-#else
-      KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+      __syncwarp(mask);
     }
     *value = *(value - lane_id);
   }
@@ -612,17 +566,18 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
       const unsigned int delta = (threadIdx.y * blockDim.x + threadIdx.x) * 32;
       if (delta < blockDim.x * blockDim.y)
         *my_shared_team_buffer_element = shared_team_buffer_element[delta];
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
       scalar_intra_warp_reduction(functor, my_shared_team_buffer_element, false,
                                   blockDim.x * blockDim.y / 32);
       if (threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element;
     }
   }
 
+  template <class SizeType = Cuda::size_type>
   __device__ static inline bool scalar_inter_block_reduction(
       const FunctorType& functor, const Cuda::size_type /*block_id*/,
-      const Cuda::size_type block_count, Cuda::size_type* const shared_data,
-      Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
+      const Cuda::size_type block_count, SizeType* const shared_data,
+      SizeType* const global_data, Cuda::size_type* const global_flags) {
     Scalar* const global_team_buffer_element = ((Scalar*)global_data);
     Scalar* const my_global_team_buffer_element =
         global_team_buffer_element + blockIdx.x;
@@ -713,17 +668,17 @@ __device__ void cuda_intra_block_reduce_scan(
   const pointer_type tdata_intra = base_data + value_count * threadIdx.y;
 
   {  // Intra-warp reduction:
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 0)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 1)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 2)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 3)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 4)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
   }
 
   __syncthreads();  // Wait for all warps to reduce
@@ -732,57 +687,31 @@ __device__ void cuda_intra_block_reduce_scan(
     const unsigned rtid_inter = (threadIdx.y ^ BlockSizeMask)
                                 << CudaTraits::WarpIndexShift;
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    unsigned inner_mask =
-        KOKKOS_IMPL_CUDA_BALLOT_MASK(0xffffffff, (rtid_inter < blockDim.y));
-#endif
+    unsigned inner_mask = __ballot_sync(0xffffffff, (rtid_inter < blockDim.y));
     if (rtid_inter < blockDim.y) {
       const pointer_type tdata_inter =
           base_data + value_count * (rtid_inter ^ BlockSizeMask);
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
       if ((1 << 5) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
       }
       if ((1 << 6) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
       }
       if ((1 << 7) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
       }
       if ((1 << 8) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
       }
       if ((1 << 9) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
       }
-#else
-      if ((1 << 5) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
-      }
-      if ((1 << 6) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
-      }
-      if ((1 << 7) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
-      }
-      if ((1 << 8) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
-      }
-      if ((1 << 9) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
-      }
-#endif
 
       if (DoScan) {
         int n =
@@ -795,25 +724,14 @@ __device__ void cuda_intra_block_reduce_scan(
 
         if (!(rtid_inter + n < blockDim.y)) n = 0;
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 8)
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 7)
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 6)
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 5)
-#else
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_SCAN_STEP(tdata_inter, n, 8)
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_SCAN_STEP(tdata_inter, n, 7)
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_SCAN_STEP(tdata_inter, n, 6)
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_SCAN_STEP(tdata_inter, n, 5)
-#endif
       }
     }
   }
@@ -832,17 +750,17 @@ __device__ void cuda_intra_block_reduce_scan(
                                               : ((rtid_intra & 16) ? 16 : 0))));
 
     if (!(rtid_intra + n < blockDim.y)) n = 0;
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 4) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 3) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 2) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 1) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 0) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
   }
 
 #undef BLOCK_SCAN_STEP
@@ -858,12 +776,13 @@ __device__ void cuda_intra_block_reduce_scan(
  *  Global reduce result is in the last threads' 'shared_data' location.
  */
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class FunctorType, class ArgTag,
+          class SizeType = Cuda::size_type>
 __device__ bool cuda_single_inter_block_reduce_scan2(
     const FunctorType& functor, const Cuda::size_type block_id,
-    const Cuda::size_type block_count, Cuda::size_type* const shared_data,
-    Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
-  using size_type   = Cuda::size_type;
+    const Cuda::size_type block_count, SizeType* const shared_data,
+    SizeType* const global_data, Cuda::size_type* const global_flags) {
+  using size_type   = SizeType;
   using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
   using ValueJoin   = FunctorValueJoin<FunctorType, ArgTag>;
   using ValueInit   = FunctorValueInit<FunctorType, ArgTag>;
@@ -953,11 +872,12 @@ __device__ bool cuda_single_inter_block_reduce_scan2(
   return is_last_block;
 }
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class FunctorType, class ArgTag,
+          class SizeType = Cuda::size_type>
 __device__ bool cuda_single_inter_block_reduce_scan(
     const FunctorType& functor, const Cuda::size_type block_id,
-    const Cuda::size_type block_count, Cuda::size_type* const shared_data,
-    Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
+    const Cuda::size_type block_count, SizeType* const shared_data,
+    SizeType* const global_data, Cuda::size_type* const global_flags) {
   using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
   if (!DoScan && ValueTraits::StaticValueSize > 0)
     return Kokkos::Impl::CudaReductionsFunctor<
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
index 2004edbeac..88ac0d1878 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -54,11 +54,27 @@
 #include <Kokkos_Core_fwd.hpp>
 
 #include <impl/Kokkos_TaskBase.hpp>
-#include <Cuda/Kokkos_Cuda_Error.hpp>  // CUDA_SAFE_CALL
+#include <Cuda/Kokkos_Cuda_Error.hpp>  // KOKKOS_IMPL_CUDA_SAFE_CALL
 #include <impl/Kokkos_TaskTeamMember.hpp>
 
 //----------------------------------------------------------------------------
 
+#if defined(__CUDA_ARCH__)
+#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)                           \
+  {                                                                        \
+    __syncwarp();                                                          \
+    const unsigned b = __activemask();                                     \
+    if (b != 0xffffffff) {                                                 \
+      printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG,     \
+             blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \
+             threadIdx.z, b);                                              \
+      return;                                                              \
+    }                                                                      \
+  }
+#else
+#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)
+#endif
+
 namespace Kokkos {
 namespace Impl {
 namespace {
@@ -138,13 +154,13 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
       // Broadcast task pointer:
 
       // Sync before the broadcast
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
 
       // pretend it's an int* for shuffle purposes
       ((int*)&current_task)[0] =
-          KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[0], 0, 32);
+          __shfl_sync(0xffffffff, ((int*)&current_task)[0], 0, 32);
       ((int*)&current_task)[1] =
-          KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[1], 0, 32);
+          __shfl_sync(0xffffffff, ((int*)&current_task)[1], 0, 32);
 
       if (current_task) {
         KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag());
@@ -168,7 +184,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 
         // Synchronize threads of the warp and insure memory
         // writes are visible to all threads in the warp.
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         if (shared_memory_task_copy->is_team_runnable()) {
           // Thread Team Task
@@ -182,7 +198,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
         // Synchronize threads of the warp and insure memory
         // writes are visible to all threads in the warp.
 
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         // if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize;
         // b -= b % CudaTraits::WarpSize;
@@ -196,7 +212,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
         // writes are visible to root thread of the warp for
         // respawn or completion.
 
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         if (warp_lane == 0) {
           // If respawn requested copy respawn data back to main memory
@@ -249,12 +265,14 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 
     auto& queue = scheduler.queue();
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Pre Task Execution");
 
     // Query the stack size, in bytes:
 
     size_t previous_stack_size = 0;
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));
 
     // If not large enough then set the stack size, in bytes:
@@ -262,18 +280,21 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
     const size_t larger_stack_size = 1 << 11;
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
     }
 
     cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
         scheduler, shared_per_warp);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Post Task Execution");
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
           cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
     }
   }
@@ -295,13 +316,17 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
     destroy_type* dtor_ptr =
         (destroy_type*)((char*)storage + sizeof(function_type));
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Pre Get Function Pointer for Tasks");
 
     set_cuda_task_base_apply_function_pointer<TaskType>
         <<<1, 1>>>(ptr_ptr, dtor_ptr);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Post Get Function Pointer for Tasks");
 
     ptr  = *ptr_ptr;
     dtor = *dtor_ptr;
@@ -372,23 +397,20 @@ class TaskQueueSpecializationConstrained<
           // count of 0 also. Otherwise, returns a task from another queue
           // or `end` if one couldn't be popped
           task_ptr = team_queue.attempt_to_steal_task();
-#if 0
-          if(task != no_more_tasks_sentinel && task != end) {
-            std::printf("task stolen on rank %d\n", team_exec.league_rank());
-          }
-#endif
         }
       }
 
       // Synchronize warp with memory fence before broadcasting task pointer:
 
       // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
 
       // Broadcast task pointer:
 
-      ((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32);
-      ((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32);
+      ((int*)&task_ptr)[0] =
+          __shfl_sync(0xffffffff, ((int*)&task_ptr)[0], 0, 32);
+      ((int*)&task_ptr)[1] =
+          __shfl_sync(0xffffffff, ((int*)&task_ptr)[1], 0, 32);
 
 #if defined(KOKKOS_ENABLE_DEBUG)
       KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr");
@@ -418,7 +440,7 @@ class TaskQueueSpecializationConstrained<
         // writes are visible to all threads in the warp.
 
         // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         if (task_root_type::TaskTeam == task_shmem->m_task_type) {
           // Thread Team Task
@@ -432,7 +454,7 @@ class TaskQueueSpecializationConstrained<
         // writes are visible to all threads in the warp.
 
         // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         // copy task closure from shared to global memory:
 
@@ -445,7 +467,7 @@ class TaskQueueSpecializationConstrained<
         // respawn or completion.
 
         // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         // If respawn requested copy respawn data back to main memory
 
@@ -475,12 +497,14 @@ class TaskQueueSpecializationConstrained<
     auto& queue = scheduler.queue();
     queue.initialize_team_queues(warps_per_block * grid.x);
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::execute: Pre Execute Task");
 
     // Query the stack size, in bytes:
 
     size_t previous_stack_size = 0;
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));
 
     // If not large enough then set the stack size, in bytes:
@@ -488,18 +512,21 @@ class TaskQueueSpecializationConstrained<
     const size_t larger_stack_size = 2048;
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
     }
 
     cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
         scheduler, shared_per_warp);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::execute: Post Execute Task");
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
           cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
     }
   }
@@ -516,13 +543,17 @@ class TaskQueueSpecializationConstrained<
     destroy_type* dtor_ptr =
         (destroy_type*)((char*)storage + sizeof(function_type));
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::get_function_pointer: Pre Get Function Pointer");
 
     set_cuda_task_base_apply_function_pointer<TaskType>
         <<<1, 1>>>(ptr_ptr, dtor_ptr);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::get_function_pointer: Post Get Function Pointer");
 
     ptr  = *ptr_ptr;
     dtor = *dtor_ptr;
@@ -609,7 +640,7 @@ class TaskExec<Kokkos::Cuda, Scheduler> {
 
   __device__ void team_barrier() const {
     if (1 < m_team_size) {
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
     }
   }
 
@@ -1205,5 +1236,7 @@ KOKKOS_INLINE_FUNCTION void single(
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN
+
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
 #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
index e780639015..922b980a25 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -340,191 +340,6 @@ class CudaTeamMember {
 #endif
   }
 
-  //--------------------------------------------------------------------------
-  /**\brief  Global reduction across all blocks
-   *
-   *  Return !0 if reducer contains the final value
-   */
-  template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
-      global_reduce(ReducerType const& reducer, int* const global_scratch_flags,
-                    void* const global_scratch_space, void* const shmem,
-                    int const shmem_size) {
-#ifdef __CUDA_ARCH__
-
-    using value_type   = typename ReducerType::value_type;
-    using pointer_type = value_type volatile*;
-
-    // Number of shared memory entries for the reduction:
-    const int nsh = shmem_size / sizeof(value_type);
-
-    // Number of CUDA threads in the block, rank within the block
-    const int nid = blockDim.x * blockDim.y * blockDim.z;
-    const int tid =
-        threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
-
-    // Reduces within block using all available shared memory
-    // Contributes if it is the root "vector lane"
-
-    // wn == number of warps in the block
-    // wx == which lane within the warp
-    // wy == which warp within the block
-
-    const int wn =
-        (nid + CudaTraits::WarpIndexMask) >> CudaTraits::WarpIndexShift;
-    const int wx = tid & CudaTraits::WarpIndexMask;
-    const int wy = tid >> CudaTraits::WarpIndexShift;
-
-    //------------------------
-    {  // Intra warp shuffle reduction from contributing CUDA threads
-
-      value_type tmp(reducer.reference());
-
-      for (int i = CudaTraits::WarpSize; (int)blockDim.x <= (i >>= 1);) {
-        Impl::in_place_shfl_down(reducer.reference(), tmp, i,
-                                 CudaTraits::WarpSize);
-
-        // Root of each vector lane reduces "thread" contribution
-        if (0 == threadIdx.x && wx < i) {
-          reducer.join(&tmp, reducer.data());
-        }
-      }
-
-      // Reduce across warps using shared memory.
-      // Number of warps may not be power of two.
-
-      __syncthreads();  // Wait before shared data write
-
-      // Number of shared memory entries for the reduction
-      // is at most one per warp
-      const int nentry = wn < nsh ? wn : nsh;
-
-      if (0 == wx && wy < nentry) {
-        // Root thread of warp 'wy' has warp's value to contribute
-        ((value_type*)shmem)[wy] = tmp;
-      }
-
-      __syncthreads();  // Wait for write to be visible to block
-
-      // When more warps than shared entries
-      // then warps must take turns joining their contribution
-      // to the designated shared memory entry.
-      for (int i = nentry; i < wn; i += nentry) {
-        const int k = wy - i;
-
-        if (0 == wx && i <= wy && k < nentry) {
-          // Root thread of warp 'wy' has warp's value to contribute
-          reducer.join(((value_type*)shmem) + k, &tmp);
-        }
-
-        __syncthreads();  // Wait for write to be visible to block
-      }
-
-      // One warp performs the inter-warp reduction:
-
-      if (0 == wy) {
-        // Start fan-in at power of two covering nentry
-
-        for (int i = (1 << (32 - __clz(nentry - 1))); (i >>= 1);) {
-          const int k = wx + i;
-          if (wx < i && k < nentry) {
-            reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k);
-            __threadfence_block();  // Wait for write to be visible to warp
-          }
-        }
-      }
-    }
-    //------------------------
-    {  // Write block's value to global_scratch_memory
-
-      int last_block = 0;
-
-      if (0 == wx) {
-        reducer.copy(((pointer_type)global_scratch_space) +
-                         blockIdx.x * reducer.length(),
-                     reducer.data());
-
-        __threadfence();  // Wait until global write is visible.
-
-        last_block = (int)gridDim.x ==
-                     1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1);
-
-        // If last block then reset count
-        if (last_block) *global_scratch_flags = 0;
-      }
-
-      last_block = __syncthreads_or(last_block);
-
-      if (!last_block) return 0;
-    }
-    //------------------------
-    // Last block reads global_scratch_memory into shared memory.
-
-    const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh)
-                                       : (gridDim.x < nsh ? gridDim.x : nsh);
-
-    // nentry = min( nid , nsh , gridDim.x )
-
-    // whole block reads global memory into shared memory:
-
-    if (tid < nentry) {
-      const int offset = tid * reducer.length();
-
-      reducer.copy(((pointer_type)shmem) + offset,
-                   ((pointer_type)global_scratch_space) + offset);
-
-      for (int i = nentry + tid; i < (int)gridDim.x; i += nentry) {
-        reducer.join(
-            ((pointer_type)shmem) + offset,
-            ((pointer_type)global_scratch_space) + i * reducer.length());
-      }
-    }
-
-    __syncthreads();  // Wait for writes to be visible to block
-
-    if (0 == wy) {
-      // Iterate to reduce shared memory to single warp fan-in size
-
-      const int nreduce =
-          CudaTraits::WarpSize < nentry ? CudaTraits::WarpSize : nentry;
-
-      // nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x )
-
-      if (wx < nreduce && nreduce < nentry) {
-        for (int i = nreduce + wx; i < nentry; i += nreduce) {
-          reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i);
-        }
-        __threadfence_block();  // Wait for writes to be visible to warp
-      }
-
-      // Start fan-in at power of two covering nentry
-
-      for (int i = (1 << (32 - __clz(nreduce - 1))); (i >>= 1);) {
-        const int k = wx + i;
-        if (wx < i && k < nreduce) {
-          reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k);
-          __threadfence_block();  // Wait for writes to be visible to warp
-        }
-      }
-
-      if (0 == wx) {
-        reducer.copy(reducer.data(), (pointer_type)shmem);
-        return 1;
-      }
-    }
-    return 0;
-
-#else
-    (void)reducer;
-    (void)global_scratch_flags;
-    (void)global_scratch_space;
-    (void)shmem;
-    (void)shmem_size;
-    return 0;
-#endif
-  }
-
   //----------------------------------------
   // Private for the driver
 
@@ -533,7 +348,7 @@ class CudaTeamMember {
                  void* scratch_level_1_ptr, const int scratch_level_1_size,
                  const int arg_league_rank, const int arg_league_size)
       : m_team_reduce(shared),
-        m_team_shared(((char*)shared) + shared_begin, shared_size,
+        m_team_shared(static_cast<char*>(shared) + shared_begin, shared_size,
                       scratch_level_1_ptr, scratch_level_1_size),
         m_team_reduce_size(shared_begin),
         m_league_rank(arg_league_rank),
@@ -854,14 +669,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
        i += blockDim.x) {
     closure(i);
   }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
-      blockDim.x == 32 ? 0xffffffff
-                       : ((1 << blockDim.x) - 1)
-                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#else
-  KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+  __syncwarp(blockDim.x == 32
+                 ? 0xffffffff
+                 : ((1 << blockDim.x) - 1)
+                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
 #endif
 }
 
@@ -1100,14 +911,10 @@ KOKKOS_INLINE_FUNCTION void single(
   (void)lambda;
 #ifdef __CUDA_ARCH__
   if (threadIdx.x == 0) lambda();
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
-      blockDim.x == 32 ? 0xffffffff
-                       : ((1 << blockDim.x) - 1)
-                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#else
-  KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+  __syncwarp(blockDim.x == 32
+                 ? 0xffffffff
+                 : ((1 << blockDim.x) - 1)
+                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
 #endif
 }
 
@@ -1118,14 +925,10 @@ KOKKOS_INLINE_FUNCTION void single(
   (void)lambda;
 #ifdef __CUDA_ARCH__
   if (threadIdx.x == 0 && threadIdx.y == 0) lambda();
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
-      blockDim.x == 32 ? 0xffffffff
-                       : ((1 << blockDim.x) - 1)
-                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#else
-  KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+  __syncwarp(blockDim.x == 32
+                 ? 0xffffffff
+                 : ((1 << blockDim.x) - 1)
+                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
 #endif
 }
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
index 7f7b7b6e78..31d3c47e1c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@@ -48,7 +48,12 @@
 #ifdef KOKKOS_ENABLE_CUDA
 
 #include <type_traits>
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+
+#if !defined(KOKKOS_COMPILER_CLANG)
+#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long)
+#else
+#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int)
+#endif
 
 namespace Kokkos {
 
@@ -61,7 +66,7 @@ constexpr unsigned shfl_all_mask = 0xffffffffu;
 // Shuffle operations require input to be a register (stack) variable
 
 // Derived implements do_shfl_op(unsigned mask, T& in, int lane, int width),
-// which turns in to one of KOKKOS_IMPL_CUDA_SHFL(_UP_|_DOWN_|_)MASK
+// which turns in to one of __shfl_sync(_up|_down)
 // Since the logic with respect to value sizes, etc., is the same everywhere,
 // put it all in one place.
 template <class Derived>
@@ -157,7 +162,7 @@ struct in_place_shfl_fn : in_place_shfl_op<in_place_shfl_fn> {
     (void)val;
     (void)lane;
     (void)width;
-    return KOKKOS_IMPL_CUDA_SHFL_MASK(mask, val, lane, width);
+    return __shfl_sync(mask, val, lane, width);
   }
 };
 template <class... Args>
@@ -170,7 +175,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op<in_place_shfl_up_fn> {
   __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val,
                                                   int lane, int width) const
       noexcept {
-    return KOKKOS_IMPL_CUDA_SHFL_UP_MASK(mask, val, lane, width);
+    return __shfl_up_sync(mask, val, lane, width);
   }
 };
 template <class... Args>
@@ -188,7 +193,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op<in_place_shfl_down_fn> {
     (void)val;
     (void)lane;
     (void)width;
-    return KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(mask, val, lane, width);
+    return __shfl_down_sync(mask, val, lane, width);
   }
 };
 template <class... Args>
@@ -228,5 +233,7 @@ __device__ inline T shfl_up(const T& val, int delta, int width,
 
 }  // end namespace Kokkos
 
+#undef KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF
+
 #endif  // defined( KOKKOS_ENABLE_CUDA )
 #endif  // !defined( KOKKOS_CUDA_VECTORIZATION_HPP )
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp
deleted file mode 100644
index 0cdd84ce27..0000000000
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <Kokkos_Macros.hpp>
-
-#if defined(__CUDA_ARCH__)
-#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask()
-#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff)
-#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m)
-#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(), x)
-#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) __ballot_sync(m, x)
-#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) __shfl_sync(0xffffffff, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) __shfl_sync(m, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) __shfl_up_sync(0xffffffff, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_UP_MASK(m, x, y, z) __shfl_up_sync(m, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) \
-  __shfl_down_sync(0xffffffff, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) __shfl_down_sync(m, x, y, z)
-#else
-#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
-#define KOKKOS_IMPL_CUDA_SYNCWARP
-#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) (void)m
-#define KOKKOS_IMPL_CUDA_BALLOT(x) 0
-#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) 0
-#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) 0
-#endif
-
-#if !defined(KOKKOS_COMPILER_CLANG)
-#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long)
-#else
-#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int)
-#endif
-
-#if defined(__CUDA_ARCH__)
-#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)                           \
-  {                                                                        \
-    __syncwarp();                                                          \
-    const unsigned b = __activemask();                                     \
-    if (b != 0xffffffff) {                                                 \
-      printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG,     \
-             blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \
-             threadIdx.z, b);                                              \
-      return;                                                              \
-    }                                                                      \
-  }
-#else
-#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)
-#endif
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
index 9278d1bdc9..7eb3e1e9f7 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
@@ -45,6 +45,7 @@
 #ifndef KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP
 #define KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP
 
+#include <functional>
 #include <Kokkos_Macros.hpp>
 
 #if defined(__HIPCC__)
@@ -56,118 +57,239 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <typename DriverType, bool, int MaxThreadsPerBlock, int MinBlocksPerSM>
-void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) {
-  // FIXME_HIP - currently the "constant" path is unimplemented.
-  //             we should look at whether it's functional, and
-  //             perform some simple scaling studies to see when /
-  //             if the constant launcher outperforms the current
-  //             pass by pointer shared launcher
-  HIP_SAFE_CALL(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-      numBlocks,
-      hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
-                                       MinBlocksPerSM>,
-      blockSize, sharedmem));
-}
+enum class BlockType { Max, Preferred };
 
-template <typename DriverType, bool constant>
-void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) {
-  hipOccupancy<DriverType, constant, HIPTraits::MaxThreadsPerBlock, 1>(
-      numBlocks, blockSize, sharedmem);
-}
-
-template <class FunctorType, class LaunchBounds, typename F>
-int hip_internal_get_block_size(const F &condition_check,
-                                const HIPInternal *hip_instance,
-                                const hipFuncAttributes &attr,
-                                const FunctorType &f,
-                                const size_t vector_length,
-                                const size_t shmem_block,
-                                const size_t shmem_thread) {
-  const int min_blocks_per_sm =
-      LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
-  const int max_threads_per_block = LaunchBounds::maxTperB == 0
-                                        ? HIPTraits::MaxThreadsPerBlock
-                                        : LaunchBounds::maxTperB;
-
-  const int regs_per_wavefront  = std::max(attr.numRegs, 1);
-  const int regs_per_sm         = hip_instance->m_regsPerSM;
-  const int shmem_per_sm        = hip_instance->m_shmemPerSM;
-  const int max_shmem_per_block = hip_instance->m_maxShmemPerBlock;
-  const int max_blocks_per_sm   = hip_instance->m_maxBlocksPerSM;
-  const int max_threads_per_sm  = hip_instance->m_maxThreadsPerSM;
-
-  int block_size = max_threads_per_block;
-  KOKKOS_ASSERT(block_size > 0);
-  const int blocks_per_warp =
-      (block_size + HIPTraits::WarpSize - 1) / HIPTraits::WarpSize;
-
-  int functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
-      f, block_size / vector_length);
-  int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
-                    functor_shmem + attr.sharedSizeBytes;
-  int max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp);
-  int max_blocks_shmem =
-      (total_shmem < max_shmem_per_block)
-          ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
-          : 0;
-  int blocks_per_sm  = std::min(max_blocks_regs, max_blocks_shmem);
-  int threads_per_sm = blocks_per_sm * block_size;
-  if (threads_per_sm > max_threads_per_sm) {
-    blocks_per_sm  = max_threads_per_sm / block_size;
-    threads_per_sm = blocks_per_sm * block_size;
-  }
-  int opt_block_size =
-      (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm;
-  int opt_threads_per_sm = threads_per_sm;
-  block_size -= HIPTraits::WarpSize;
-  while (condition_check(blocks_per_sm) &&
-         (block_size >= HIPTraits::WarpSize)) {
-    functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
-        f, block_size / vector_length);
-    total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
-                  functor_shmem + attr.sharedSizeBytes;
-    max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp);
-    max_blocks_shmem =
-        (total_shmem < max_shmem_per_block)
-            ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
-            : 0;
-    blocks_per_sm  = std::min(max_blocks_regs, max_blocks_shmem);
-    threads_per_sm = blocks_per_sm * block_size;
-    if (threads_per_sm > max_threads_per_sm) {
-      blocks_per_sm  = max_threads_per_sm / block_size;
-      threads_per_sm = blocks_per_sm * block_size;
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+unsigned get_preferred_blocksize_impl() {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // use the user specified value
+    return LaunchBounds::maxTperB;
+  } else {
+    if (HIPParallelLaunch<DriverType, LaunchBounds,
+                          LaunchMechanism>::get_scratch_size() > 0) {
+      return HIPTraits::ConservativeThreadsPerBlock;
     }
-    if ((blocks_per_sm >= min_blocks_per_sm) &&
-        (blocks_per_sm <= max_blocks_per_sm)) {
-      if (threads_per_sm >= opt_threads_per_sm) {
-        opt_block_size     = block_size;
-        opt_threads_per_sm = threads_per_sm;
+    return HIPTraits::MaxThreadsPerBlock;
+  }
+}
+
+// FIXME_HIP - entire function could be constexpr for c++17
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+unsigned get_max_blocksize_impl() {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // use the user specified value
+    return LaunchBounds::maxTperB;
+  } else {
+    // we can always fit 1024 threads blocks if we only care about registers
+    // ... and don't mind spilling
+    return HIPTraits::MaxThreadsPerBlock;
+  }
+}
+
+// convenience method to select and return the proper function attributes
+// for a kernel, given the launch bounds et al.
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          BlockType BlockSize = BlockType::Max,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+hipFuncAttributes get_hip_func_attributes_impl() {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // for user defined, we *always* honor the request
+    return HIPParallelLaunch<DriverType, LaunchBounds,
+                             LaunchMechanism>::get_hip_func_attributes();
+  } else {
+    // FIXME_HIP - could be if constexpr for c++17
+    if (BlockSize == BlockType::Max) {
+      return HIPParallelLaunch<
+          DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+          LaunchMechanism>::get_hip_func_attributes();
+    } else {
+      const int blocksize =
+          get_preferred_blocksize_impl<DriverType, LaunchBounds,
+                                       LaunchMechanism>();
+      if (blocksize == HIPTraits::MaxThreadsPerBlock) {
+        return HIPParallelLaunch<
+            DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+            LaunchMechanism>::get_hip_func_attributes();
+      } else {
+        return HIPParallelLaunch<
+            DriverType,
+            Kokkos::LaunchBounds<HIPTraits::ConservativeThreadsPerBlock, 1>,
+            LaunchMechanism>::get_hip_func_attributes();
       }
     }
-    block_size -= HIPTraits::WarpSize;
   }
-  return opt_block_size;
 }
 
-template <class FunctorType, class LaunchBounds>
-int hip_get_max_block_size(const HIPInternal *hip_instance,
-                           const hipFuncAttributes &attr, const FunctorType &f,
-                           const size_t vector_length, const size_t shmem_block,
-                           const size_t shmem_thread) {
-  return hip_internal_get_block_size<FunctorType, LaunchBounds>(
-      [](int x) { return x == 0; }, hip_instance, attr, f, vector_length,
-      shmem_block, shmem_thread);
+// Given an initial block-size limitation based on register usage
+// determine the block size to select based on LDS limitation
+template <BlockType BlockSize, class DriverType, class LaunchBounds,
+          typename ShmemFunctor>
+unsigned hip_internal_get_block_size(const HIPInternal *hip_instance,
+                                     const ShmemFunctor &f,
+                                     const unsigned tperb_reg) {
+  // translate LB from CUDA to HIP
+  const unsigned min_waves_per_eu =
+      LaunchBounds::minBperSM ? LaunchBounds::minBperSM : 1;
+  const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize;
+  const unsigned shmem_per_sm       = hip_instance->m_shmemPerSM;
+  unsigned block_size               = tperb_reg;
+  do {
+    unsigned total_shmem = f(block_size);
+    // find how many threads we can fit with this blocksize based on LDS usage
+    unsigned tperb_shmem = total_shmem > shmem_per_sm ? 0 : block_size;
+
+    // FIXME_HIP - could be if constexpr for c++17
+    if (BlockSize == BlockType::Max) {
+      // we want the maximum blocksize possible
+      // just wait until we get a case where we can fit the LDS per SM
+      if (tperb_shmem) return block_size;
+    } else {
+      if (block_size == tperb_reg && tperb_shmem >= tperb_reg) {
+        // fast path for exit on first iteration if registers are more limiting
+        // than LDS usage, just use the register limited size
+        return tperb_reg;
+      }
+      // otherwise we need to apply a heuristic to choose the blocksize
+      // the current launchbound selection scheme is:
+      //      1. If no spills, choose 1024 [MaxThreadsPerBlock]
+      //      2. Otherwise, choose 256 [ConservativeThreadsPerBlock]
+      //
+      // For blocksizes between 256 and 1024, we'll be forced to use the 1024 LB
+      // and we'll already have pretty decent occupancy, thus dropping to 256
+      // *probably* isn't a concern
+      const unsigned blocks_per_cu_shmem = shmem_per_sm / total_shmem;
+      const unsigned tperb = tperb_shmem < tperb_reg ? tperb_shmem : tperb_reg;
+
+      // for anything with > 4 WF's & can fit multiple blocks
+      // we're probably not occupancy limited so just return that
+      if (blocks_per_cu_shmem > 1 &&
+          tperb > HIPTraits::ConservativeThreadsPerBlock) {
+        return block_size;
+      }
+
+      // otherwise, it's probably better to drop to the first valid size that
+      // fits in the ConservativeThreadsPerBlock
+      if (tperb >= min_threads_per_sm) return block_size;
+    }
+    block_size >>= 1;
+  } while (block_size >= HIPTraits::WarpSize);
+  // TODO: return a negative, add an error to kernel launch
+  return 0;
 }
 
-template <typename FunctorType, typename LaunchBounds>
-int hip_get_opt_block_size(HIPInternal const *hip_instance,
-                           hipFuncAttributes const &attr, FunctorType const &f,
-                           size_t const vector_length, size_t const shmem_block,
-                           size_t const shmem_thread) {
-  return hip_internal_get_block_size<FunctorType, LaunchBounds>(
-      [](int) { return true; }, hip_instance, attr, f, vector_length,
-      shmem_block, shmem_thread);
+// Standardized blocksize deduction for parallel constructs with no LDS usage
+// Returns the preferred blocksize as dictated by register usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds>
+unsigned hip_get_preferred_blocksize() {
+  return get_preferred_blocksize_impl<DriverType, LaunchBounds>();
+}
+
+// Standardized blocksize deduction for parallel constructs with no LDS usage
+// Returns the max blocksize as dictated by register usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds>
+unsigned hip_get_max_blocksize() {
+  return get_max_blocksize_impl<DriverType, LaunchBounds>();
+}
+
+// Standardized blocksize deduction for non-teams parallel constructs with LDS
+// usage Returns the 'preferred' blocksize, as determined by the heuristics in
+// hip_internal_get_block_size
+//
+// The ShmemFunctor takes a single argument of the current blocksize under
+// consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds, typename ShmemFunctor>
+unsigned hip_get_preferred_blocksize(HIPInternal const *hip_instance,
+                                     ShmemFunctor const &f) {
+  // get preferred blocksize limited by register usage
+  const unsigned tperb_reg =
+      hip_get_preferred_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Preferred, DriverType,
+                                     LaunchBounds>(hip_instance, f, tperb_reg);
+}
+
+// Standardized blocksize deduction for teams-based parallel constructs with LDS
+// usage Returns the 'preferred' blocksize, as determined by the heuristics in
+// hip_internal_get_block_size
+//
+// The ShmemTeamsFunctor takes two arguments: the hipFunctionAttributes and
+//  the current blocksize under consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds,
+          typename ShmemTeamsFunctor>
+unsigned hip_get_preferred_team_blocksize(HIPInternal const *hip_instance,
+                                          ShmemTeamsFunctor const &f) {
+  hipFuncAttributes attr =
+      get_hip_func_attributes_impl<DriverType, LaunchBounds,
+                                   BlockType::Preferred>();
+  // get preferred blocksize limited by register usage
+  using namespace std::placeholders;
+  const unsigned tperb_reg =
+      hip_get_preferred_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Preferred, DriverType,
+                                     LaunchBounds>(
+      hip_instance, std::bind(f, attr, _1), tperb_reg);
+}
+
+// Standardized blocksize deduction for non-teams parallel constructs with LDS
+// usage Returns the maximum possible blocksize, as determined by the heuristics
+// in hip_internal_get_block_size
+//
+// The ShmemFunctor takes a single argument of the current blocksize under
+// consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds, typename ShmemFunctor>
+unsigned hip_get_max_blocksize(HIPInternal const *hip_instance,
+                               ShmemFunctor const &f) {
+  // get max blocksize limited by register usage
+  const unsigned tperb_reg = hip_get_max_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Max, DriverType, LaunchBounds>(
+      hip_instance, f, tperb_reg);
+}
+
+// Standardized blocksize deduction for teams-based parallel constructs with LDS
+// usage Returns the maximum possible blocksize, as determined by the heuristics
+// in hip_internal_get_block_size
+//
+// The ShmemTeamsFunctor takes two arguments: the hipFunctionAttributes and
+//  the current blocksize under consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds,
+          typename ShmemTeamsFunctor>
+unsigned hip_get_max_team_blocksize(HIPInternal const *hip_instance,
+                                    ShmemTeamsFunctor const &f) {
+  hipFuncAttributes attr =
+      get_hip_func_attributes_impl<DriverType, LaunchBounds, BlockType::Max>();
+  // get max blocksize
+  using namespace std::placeholders;
+  const unsigned tperb_reg = hip_get_max_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Max, DriverType, LaunchBounds>(
+      hip_instance, std::bind(f, attr, _1), tperb_reg);
 }
 
 }  // namespace Impl
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
index b3480bcad0..a75e7a4a6c 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
@@ -66,12 +66,30 @@ inline void hip_internal_safe_call(hipError_t e, const char* name,
   }
 }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+
+KOKKOS_DEPRECATED
+inline void hip_internal_safe_call_deprecated(hipError_t e, const char* name,
+                                              const char* file = nullptr,
+                                              const int line   = 0) {
+  hip_internal_safe_call(e, name, file, line);
+}
+
+#endif
+
 }  // namespace Impl
 }  // namespace Kokkos
 
-#define HIP_SAFE_CALL(call) \
+#define KOKKOS_IMPL_HIP_SAFE_CALL(call) \
   Kokkos::Impl::hip_internal_safe_call(call, #call, __FILE__, __LINE__)
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+#define HIP_SAFE_CALL(call)                                              \
+  Kokkos::Impl::hip_internal_safe_call_deprecated(call, #call, __FILE__, \
+                                                  __LINE__)
+
+#endif
+
 namespace Kokkos {
 namespace Experimental {
 
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
index 18ef10e22c..336ac8c698 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -77,7 +77,7 @@ class HIPInternalDevices {
 };
 
 HIPInternalDevices::HIPInternalDevices() {
-  HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount));
 
   if (m_hipDevCount > MAXIMUM_DEVICE_COUNT) {
     Kokkos::abort(
@@ -85,7 +85,7 @@ HIPInternalDevices::HIPInternalDevices() {
         "have. Please report this to github.com/kokkos/kokkos.");
   }
   for (int i = 0; i < m_hipDevCount; ++i) {
-    HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i));
   }
 }
 
@@ -95,6 +95,9 @@ const HIPInternalDevices &HIPInternalDevices::singleton() {
 }
 }  // namespace
 
+unsigned long *Impl::HIPInternal::constantMemHostStaging = nullptr;
+hipEvent_t Impl::HIPInternal::constantMemReusable        = nullptr;
+
 namespace Impl {
 
 //----------------------------------------------------------------------------
@@ -154,6 +157,9 @@ int HIPInternal::verify_is_initialized(const char *const label) const {
   return 0 <= m_hipDev;
 }
 
+uint32_t HIPInternal::impl_get_instance_id() const noexcept {
+  return m_instance_id;
+}
 HIPInternal &HIPInternal::singleton() {
   static HIPInternal *self = nullptr;
   if (!self) {
@@ -163,12 +169,23 @@ HIPInternal &HIPInternal::singleton() {
 }
 
 void HIPInternal::fence() const {
-  HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
-  // can reset our cycle id now as well
-  m_cycleId = 0;
+  fence("Kokkos::HIPInternal::fence: Unnamed Internal Fence");
+}
+void HIPInternal::fence(const std::string &name) const {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      name,
+      Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+          impl_get_instance_id()},
+      [&]() {
+        KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
+        // can reset our cycle id now as well
+        m_cycleId = 0;
+      });
 }
 
-void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
+void HIPInternal::initialize(int hip_device_id, hipStream_t stream,
+                             bool manage_stream) {
   if (was_finalized)
     Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n");
 
@@ -197,9 +214,10 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
     m_hipDev     = hip_device_id;
     m_deviceProp = hipProp;
 
-    HIP_SAFE_CALL(hipSetDevice(m_hipDev));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(m_hipDev));
 
     m_stream                    = stream;
+    m_manage_stream             = manage_stream;
     m_team_scratch_current_size = 0;
     m_team_scratch_ptr          = nullptr;
 
@@ -222,7 +240,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
     // theoretically, we can get 40 WF's / CU, but only can sustain 32
     // see
     // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742
-    m_maxBlocksPerSM = 32;
+    m_maxWavesPerCU = 32;
     // FIXME_HIP - Nick to implement this upstream
     //             Register count comes from Sec. 2.2. "Data Sharing" of the
     //             Vega 7nm ISA document (see the diagram)
@@ -232,7 +250,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
     m_regsPerSM        = 65536;
     m_shmemPerSM       = hipProp.maxSharedMemoryPerMultiProcessor;
     m_maxShmemPerBlock = hipProp.sharedMemPerBlock;
-    m_maxThreadsPerSM  = m_maxBlocksPerSM * HIPTraits::WarpSize;
+    m_maxThreadsPerSM  = m_maxWavesPerCU * HIPTraits::WarpSize;
     //----------------------------------
     // Multiblock reduction uses scratch flags for counters
     // and scratch space for partial reduction values.
@@ -265,8 +283,8 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
 
       m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data());
 
-      HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0,
-                              sizeof(uint32_t) * buffer_bound));
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0,
+                                          sizeof(uint32_t) * buffer_bound));
     }
     //----------------------------------
 
@@ -287,6 +305,15 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
 
   // Init the array for used for arbitrarily sized atomics
   if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays();
+
+  // Allocate a staging buffer for constant mem in pinned host memory
+  // and an event to avoid overwriting driver for previous kernel launches
+  if (m_stream == nullptr) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostMalloc((void **)&constantMemHostStaging,
+                                            HIPTraits::ConstantMemoryUsage));
+
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventCreate(&constantMemReusable));
+  }
 }
 
 //----------------------------------------------------------------------------
@@ -339,7 +366,7 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags(
 
     m_scratchFlags = reinterpret_cast<size_type *>(r->data());
 
-    HIP_SAFE_CALL(
+    KOKKOS_IMPL_HIP_SAFE_CALL(
         hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
   }
 
@@ -365,7 +392,7 @@ void *HIPInternal::resize_team_scratch_space(std::int64_t bytes,
 //----------------------------------------------------------------------------
 
 void HIPInternal::finalize() {
-  this->fence();
+  this->fence("Kokkos::HIPInternal::finalize: fence on finalization");
   was_finalized = true;
   if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
     using RecordHIP =
@@ -378,6 +405,9 @@ void HIPInternal::finalize() {
     if (m_team_scratch_current_size > 0)
       Kokkos::kokkos_free<Kokkos::Experimental::HIPSpace>(m_team_scratch_ptr);
 
+    if (m_manage_stream && m_stream != nullptr)
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream));
+
     m_hipDev                    = -1;
     m_hipArch                   = -1;
     m_multiProcCount            = 0;
@@ -395,28 +425,36 @@ void HIPInternal::finalize() {
     m_team_scratch_ptr          = nullptr;
   }
   if (nullptr != d_driverWorkArray) {
-    HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
     d_driverWorkArray = nullptr;
   }
+
+  // only destroy these if we're finalizing the singleton
+  if (this == &singleton()) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable));
+  }
 }
 
 char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
   std::lock_guard<std::mutex> const lock(m_mutexWorkArray);
   if (d_driverWorkArray == nullptr) {
-    HIP_SAFE_CALL(
+    KOKKOS_IMPL_HIP_SAFE_CALL(
         hipHostMalloc(&d_driverWorkArray,
                       m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
                       hipHostMallocNonCoherent));
   }
   if (driverTypeSize > m_maxDriverTypeSize) {
     // fence handles the cycle id reset for us
-    fence();
-    HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
+    fence(
+        "Kokkos::HIPInternal::get_next_driver: fence before reallocating "
+        "resources");
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
     m_maxDriverTypeSize = driverTypeSize;
     if (m_maxDriverTypeSize % 128 != 0)
       m_maxDriverTypeSize =
           m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128;
-    HIP_SAFE_CALL(
+    KOKKOS_IMPL_HIP_SAFE_CALL(
         hipHostMalloc(&d_driverWorkArray,
                       m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
                       hipHostMallocNonCoherent));
@@ -424,7 +462,9 @@ char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
     m_cycleId = (m_cycleId + 1) % m_maxDriverCycles;
     if (m_cycleId == 0) {
       // ensure any outstanding kernels are completed before we wrap around
-      fence();
+      fence(
+          "Kokkos::HIPInternal::get_next_driver: fence before reusing first "
+          "driver");
     }
   }
   return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId];
@@ -462,7 +502,14 @@ Kokkos::Experimental::HIP::size_type *hip_internal_scratch_flags(
 
 namespace Kokkos {
 namespace Impl {
-void hip_device_synchronize() { HIP_SAFE_CALL(hipDeviceSynchronize()); }
+void hip_device_synchronize(const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); });
+}
 
 void hip_internal_error_throw(hipError_t e, const char *name, const char *file,
                               const int line) {
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
index f4f88628e3..967c6fdd4b 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
@@ -48,6 +48,7 @@
 #define KOKKOS_HIP_INSTANCE_HPP
 
 #include <Kokkos_HIP_Space.hpp>
+#include <HIP/Kokkos_HIP_Error.hpp>
 
 #include <mutex>
 
@@ -59,10 +60,12 @@ struct HIPTraits {
   static int constexpr WarpSize       = 64;
   static int constexpr WarpIndexMask  = 0x003f; /* hexadecimal for 63 */
   static int constexpr WarpIndexShift = 6;      /* WarpSize == 1 << WarpShift*/
+  static int constexpr ConservativeThreadsPerBlock =
+      256;  // conservative fallback blocksize in case of spills
   static int constexpr MaxThreadsPerBlock =
-      1024;  // FIXME_HIP -- assumed constant for now
-
+      1024;  // the maximum we can fit in a block
   static int constexpr ConstantMemoryUsage        = 0x008000; /* 32k bytes */
+  static int constexpr KernelArgumentLimit        = 0x001000; /*  4k bytes */
   static int constexpr ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */
 };
 
@@ -90,7 +93,7 @@ class HIPInternal {
   unsigned m_multiProcCount = 0;
   unsigned m_maxWarpCount   = 0;
   unsigned m_maxBlock       = 0;
-  unsigned m_maxBlocksPerSM = 0;
+  unsigned m_maxWavesPerCU  = 0;
   unsigned m_maxSharedWords = 0;
   int m_regsPerSM;
   int m_shmemPerSM       = 0;
@@ -108,6 +111,8 @@ class HIPInternal {
   mutable int m_cycleId = 0;
   // mutex to access d_driverWorkArray
   mutable std::mutex m_mutexWorkArray;
+  // mutex to access shared memory
+  mutable std::mutex m_mutexSharedMemory;
 
   // Scratch Spaces for Reductions
   size_type m_scratchSpaceCount = 0;
@@ -119,7 +124,10 @@ class HIPInternal {
 
   hipDeviceProp_t m_deviceProp;
 
-  hipStream_t m_stream = nullptr;
+  hipStream_t m_stream   = nullptr;
+  uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
+      Kokkos::Experimental::HIP>(reinterpret_cast<uintptr_t>(this));
+  bool m_manage_stream = false;
 
   // Team Scratch Level 1 Space
   mutable int64_t m_team_scratch_current_size = 0;
@@ -128,18 +136,25 @@ class HIPInternal {
 
   bool was_finalized = false;
 
+  // FIXME_HIP: these want to be per-device, not per-stream...  use of 'static'
+  // here will break once there are multiple devices though
+  static unsigned long *constantMemHostStaging;
+  static hipEvent_t constantMemReusable;
+
   static HIPInternal &singleton();
 
   int verify_is_initialized(const char *const label) const;
 
   int is_initialized() const { return m_hipDev >= 0; }
 
-  void initialize(int hip_device_id, hipStream_t stream = nullptr);
+  void initialize(int hip_device_id, hipStream_t stream = nullptr,
+                  bool manage_stream = false);
   void finalize();
 
   void print_configuration(std::ostream &) const;
 
   void fence() const;
+  void fence(const std::string &) const;
 
   // returns the next driver type pointer in our work array
   char *get_next_driver(size_t driverTypeSize) const;
@@ -151,13 +166,52 @@ class HIPInternal {
   // Resizing of reduction related scratch spaces
   size_type *scratch_space(const size_type size);
   size_type *scratch_flags(const size_type size);
-
+  uint32_t impl_get_instance_id() const noexcept;
   // Resizing of team level 1 scratch
   void *resize_team_scratch_space(std::int64_t bytes,
                                   bool force_shrink = false);
 };
 
 }  // namespace Impl
+
+// Partitioning an Execution Space: expects space and integer arguments for
+// relative weight
+//   Customization point for backends
+//   Default behavior is to return the passed in instance
+
+namespace Impl {
+inline void create_HIP_instances(std::vector<HIP> &instances) {
+  for (int s = 0; s < int(instances.size()); s++) {
+    hipStream_t stream;
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
+    instances[s] = HIP(stream, true);
+  }
+}
+}  // namespace Impl
+
+template <class... Args>
+std::vector<HIP> partition_space(const HIP &, Args...) {
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+
+  std::vector<HIP> instances(sizeof...(Args));
+  Impl::create_HIP_instances(instances);
+  return instances;
+}
+
+template <class T>
+std::vector<HIP> partition_space(const HIP &, std::vector<T> &weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  std::vector<HIP> instances(weights.size());
+  Impl::create_HIP_instances(instances);
+  return instances;
+}
 }  // namespace Experimental
 }  // namespace Kokkos
 
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
index f774423b37..f209edf7c0 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
@@ -52,6 +52,7 @@
 #include <HIP/Kokkos_HIP_Error.hpp>
 #include <HIP/Kokkos_HIP_Instance.hpp>
 #include <Kokkos_HIP_Space.hpp>
+#include <HIP/Kokkos_HIP_Locks.hpp>
 
 // Must use global variable on the device with HIP-Clang
 #ifdef __HIP__
@@ -64,7 +65,7 @@ namespace Kokkos {
 namespace Experimental {
 template <typename T>
 inline __device__ T *kokkos_impl_hip_shared_memory() {
-  HIP_DYNAMIC_SHARED(HIPSpace::size_type, sh);
+  extern __shared__ Kokkos::Experimental::HIPSpace::size_type sh[];
   return (T *)sh;
 }
 }  // namespace Experimental
@@ -74,10 +75,12 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
+// The hip_parallel_launch_*_memory code is identical to the cuda code
 template <typename DriverType>
 __global__ static void hip_parallel_launch_constant_memory() {
   const DriverType &driver = *(reinterpret_cast<const DriverType *>(
       kokkos_impl_hip_constant_memory_buffer));
+
   driver();
 }
 
@@ -87,12 +90,13 @@ __global__ __launch_bounds__(
   const DriverType &driver = *(reinterpret_cast<const DriverType *>(
       kokkos_impl_hip_constant_memory_buffer));
 
-  driver->operator()();
+  driver();
 }
 
 template <class DriverType>
 __global__ static void hip_parallel_launch_local_memory(
     const DriverType *driver) {
+  // FIXME_HIP driver() pass by copy
   driver->operator()();
 }
 
@@ -101,6 +105,21 @@ __global__ __launch_bounds__(
     maxTperB,
     minBperSM) static void hip_parallel_launch_local_memory(const DriverType
                                                                 *driver) {
+  // FIXME_HIP driver() pass by copy
+  driver->operator()();
+}
+
+template <typename DriverType>
+__global__ static void hip_parallel_launch_global_memory(
+    const DriverType *driver) {
+  driver->operator()();
+}
+
+template <typename DriverType, unsigned int maxTperB, unsigned int minBperSM>
+__global__ __launch_bounds__(
+    maxTperB,
+    minBperSM) static void hip_parallel_launch_global_memory(const DriverType
+                                                                 *driver) {
   driver->operator()();
 }
 
@@ -127,33 +146,238 @@ struct HIPDispatchProperties {
   HIPLaunchMechanism launch_mechanism = l;
 };
 
+// Use local memory up to ConstantMemoryUseThreshold
+// Use global memory above ConstantMemoryUsage
+// In between use ConstantMemory
+// The following code is identical to the cuda code
+template <typename DriverType>
+struct DeduceHIPLaunchMechanism {
+  static constexpr Kokkos::Experimental::WorkItemProperty::HintLightWeight_t
+      light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
+  static constexpr Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t
+      heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight;
+  static constexpr typename DriverType::Policy::work_item_property property =
+      typename DriverType::Policy::work_item_property();
+
+  static constexpr HIPLaunchMechanism valid_launch_mechanism =
+      // BuildValidMask
+      (sizeof(DriverType) < HIPTraits::KernelArgumentLimit
+           ? HIPLaunchMechanism::LocalMemory
+           : HIPLaunchMechanism::Default) |
+      (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage
+           ? HIPLaunchMechanism::ConstantMemory
+           : HIPLaunchMechanism::Default) |
+      HIPLaunchMechanism::GlobalMemory;
+
+  static constexpr HIPLaunchMechanism requested_launch_mechanism =
+      (((property & light_weight) == light_weight)
+           ? HIPLaunchMechanism::LocalMemory
+           : HIPLaunchMechanism::ConstantMemory) |
+      HIPLaunchMechanism::GlobalMemory;
+
+  static constexpr HIPLaunchMechanism default_launch_mechanism =
+      // BuildValidMask
+      (sizeof(DriverType) < HIPTraits::ConstantMemoryUseThreshold)
+          ? HIPLaunchMechanism::LocalMemory
+          : ((sizeof(DriverType) < HIPTraits::ConstantMemoryUsage)
+                 ? HIPLaunchMechanism::ConstantMemory
+                 : HIPLaunchMechanism::GlobalMemory);
+
+  //              None                LightWeight    HeavyWeight
+  // F<UseT       LCG  LCG L  L       LCG  LG L  L   LCG  CG L  C
+  // UseT<F<KAL   LCG  LCG C  C       LCG  LG C  L   LCG  CG C  C
+  // Kal<F<CMU     CG  LCG C  C        CG  LG C  G    CG  CG C  C
+  // CMU<F          G  LCG G  G         G  LG G  G     G  CG G  G
+  static constexpr HIPLaunchMechanism launch_mechanism =
+      ((property & light_weight) == light_weight)
+          ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit
+                 ? HIPLaunchMechanism::LocalMemory
+                 : HIPLaunchMechanism::GlobalMemory)
+          : (((property & heavy_weight) == heavy_weight)
+                 ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage
+                        ? HIPLaunchMechanism::ConstantMemory
+                        : HIPLaunchMechanism::GlobalMemory)
+                 : (default_launch_mechanism));
+};
+
+template <typename DriverType, typename LaunchBounds,
+          HIPLaunchMechanism LaunchMechanism>
+struct HIPParallelLaunchKernelFuncData {
+  static unsigned int get_scratch_size(
+      hipFuncAttributes const &hip_func_attributes) {
+    return hip_func_attributes.localSizeBytes;
+  }
+
+  static hipFuncAttributes get_hip_func_attributes(void const *kernel_func) {
+    static hipFuncAttributes attr = [=]() {
+      hipFuncAttributes attr;
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipFuncGetAttributes(&attr, kernel_func));
+      return attr;
+    }();
+    return attr;
+  }
+};
+
+//---------------------------------------------------------------//
+// HIPParallelLaunchKernelFunc structure and its specializations //
+//---------------------------------------------------------------//
 template <typename DriverType, typename LaunchBounds,
           HIPLaunchMechanism LaunchMechanism>
 struct HIPParallelLaunchKernelFunc;
 
+// HIPLaunchMechanism::LocalMemory specializations
 template <typename DriverType, unsigned int MaxThreadsPerBlock,
           unsigned int MinBlocksPerSM>
 struct HIPParallelLaunchKernelFunc<
     DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
     HIPLaunchMechanism::LocalMemory> {
+  using funcdata_t = HIPParallelLaunchKernelFuncData<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      HIPLaunchMechanism::LocalMemory>;
   static auto get_kernel_func() {
     return hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
                                             MinBlocksPerSM>;
   }
+
+  static constexpr auto default_launchbounds() { return false; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
 };
 
 template <typename DriverType>
 struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
                                    HIPLaunchMechanism::LocalMemory> {
+  using funcdata_t =
+      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                      HIPLaunchMechanism::LocalMemory>;
   static auto get_kernel_func() {
-    return hip_parallel_launch_local_memory<DriverType, 1024, 1>;
+    return HIPParallelLaunchKernelFunc<
+        DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+        HIPLaunchMechanism::LocalMemory>::get_kernel_func();
+  }
+
+  static constexpr auto default_launchbounds() { return true; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
   }
 };
 
+// HIPLaunchMechanism::GlobalMemory specializations
+template <typename DriverType, unsigned int MaxThreadsPerBlock,
+          unsigned int MinBlocksPerSM>
+struct HIPParallelLaunchKernelFunc<
+    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+    HIPLaunchMechanism::GlobalMemory> {
+  using funcdata_t = HIPParallelLaunchKernelFuncData<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      HIPLaunchMechanism::GlobalMemory>;
+  static auto get_kernel_func() {
+    return hip_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
+                                             MinBlocksPerSM>;
+  }
+
+  static constexpr auto default_launchbounds() { return false; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+template <typename DriverType>
+struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                   HIPLaunchMechanism::GlobalMemory> {
+  using funcdata_t =
+      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                      HIPLaunchMechanism::GlobalMemory>;
+  static auto get_kernel_func() {
+    return hip_parallel_launch_global_memory<DriverType>;
+  }
+
+  static constexpr auto default_launchbounds() { return true; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+// HIPLaunchMechanism::ConstantMemory specializations
+template <typename DriverType, unsigned int MaxThreadsPerBlock,
+          unsigned int MinBlocksPerSM>
+struct HIPParallelLaunchKernelFunc<
+    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+    HIPLaunchMechanism::ConstantMemory> {
+  using funcdata_t = HIPParallelLaunchKernelFuncData<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      HIPLaunchMechanism::ConstantMemory>;
+  static auto get_kernel_func() {
+    return hip_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
+                                               MinBlocksPerSM>;
+  }
+
+  static constexpr auto default_launchbounds() { return false; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+template <typename DriverType>
+struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                   HIPLaunchMechanism::ConstantMemory> {
+  using funcdata_t =
+      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                      HIPLaunchMechanism::ConstantMemory>;
+  static auto get_kernel_func() {
+    return hip_parallel_launch_constant_memory<DriverType>;
+  }
+  static constexpr auto default_launchbounds() { return true; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+//------------------------------------------------------------------//
+// HIPParallelLaunchKernelInvoker structure and its specializations //
+//------------------------------------------------------------------//
 template <typename DriverType, typename LaunchBounds,
           HIPLaunchMechanism LaunchMechanism>
 struct HIPParallelLaunchKernelInvoker;
 
+// HIPLaunchMechanism::LocalMemory specialization
 template <typename DriverType, typename LaunchBounds>
 struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
                                       HIPLaunchMechanism::LocalMemory>
@@ -170,21 +394,83 @@ struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
   }
 };
 
+// HIPLaunchMechanism::GlobalMemory specialization
+template <typename DriverType, typename LaunchBounds>
+struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
+                                      HIPLaunchMechanism::GlobalMemory>
+    : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                  HIPLaunchMechanism::GlobalMemory> {
+  using base_t = HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                             HIPLaunchMechanism::GlobalMemory>;
+
+  // FIXME_HIP the code is different than cuda because driver cannot be passed
+  // by copy
+  static void invoke_kernel(DriverType const *driver, dim3 const &grid,
+                            dim3 const &block, int shmem,
+                            HIPInternal const *hip_instance) {
+    (base_t::get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>(
+        driver);
+  }
+};
+
+// HIPLaunchMechanism::ConstantMemory specializations
+template <typename DriverType, typename LaunchBounds>
+struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
+                                      HIPLaunchMechanism::ConstantMemory>
+    : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                  HIPLaunchMechanism::ConstantMemory> {
+  using base_t =
+      HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                  HIPLaunchMechanism::ConstantMemory>;
+  static_assert(sizeof(DriverType) < HIPTraits::ConstantMemoryUsage,
+                "Kokkos Error: Requested HIPLaunchConstantMemory with a "
+                "Functor larger than 32kB.");
+
+  static void invoke_kernel(DriverType const *driver, dim3 const &grid,
+                            dim3 const &block, int shmem,
+                            HIPInternal const *hip_instance) {
+    // Wait until the previous kernel that uses the constant buffer is done
+    KOKKOS_IMPL_HIP_SAFE_CALL(
+        hipEventSynchronize(hip_instance->constantMemReusable));
+
+    // Copy functor (synchronously) to staging buffer in pinned host memory
+    unsigned long *staging = hip_instance->constantMemHostStaging;
+    std::memcpy((void *)staging, (void *)driver, sizeof(DriverType));
+
+    // Copy functor asynchronously from there to constant memory on the device
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(
+        HIP_SYMBOL(kokkos_impl_hip_constant_memory_buffer), staging,
+        sizeof(DriverType), 0, hipMemcpyHostToDevice, hip_instance->m_stream));
+
+    // Invoke the driver function on the device
+    (base_t::
+         get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>();
+
+    // Record an event that says when the constant buffer can be reused
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventRecord(hip_instance->constantMemReusable,
+                                             hip_instance->m_stream));
+  }
+};
+
+//-----------------------------//
+// HIPParallelLaunch structure //
+//-----------------------------//
 template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
-          HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory>
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
 struct HIPParallelLaunch;
 
 template <typename DriverType, unsigned int MaxThreadsPerBlock,
-          unsigned int MinBlocksPerSM>
+          unsigned int MinBlocksPerSM, HIPLaunchMechanism LaunchMechanism>
 struct HIPParallelLaunch<
     DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-    HIPLaunchMechanism::LocalMemory>
+    LaunchMechanism>
     : HIPParallelLaunchKernelInvoker<
           DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-          HIPLaunchMechanism::LocalMemory> {
+          LaunchMechanism> {
   using base_t = HIPParallelLaunchKernelInvoker<
       DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-      HIPLaunchMechanism::LocalMemory>;
+      LaunchMechanism>;
 
   HIPParallelLaunch(const DriverType &driver, const dim3 &grid,
                     const dim3 &block, const int shmem,
@@ -205,22 +491,48 @@ struct HIPParallelLaunch<
       base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-      HIP_SAFE_CALL(hipGetLastError());
-      hip_instance->fence();
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipGetLastError());
+      hip_instance->fence(
+          "Kokkos::Experimental::Impl::HIParallelLaunch: Debug Only Check for "
+          "Execution Error");
 #endif
     }
   }
-
-  static hipFuncAttributes get_hip_func_attributes() {
-    static hipFuncAttributes attr = []() {
-      hipFuncAttributes attr;
-      HIP_SAFE_CALL(hipFuncGetAttributes(
-          &attr, reinterpret_cast<void const *>(base_t::get_kernel_func())));
-      return attr;
-    }();
-    return attr;
-  }
 };
+
+// convenience method to launch the correct kernel given the launch bounds et
+// al.
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+void hip_parallel_launch(const DriverType &driver, const dim3 &grid,
+                         const dim3 &block, const int shmem,
+                         const HIPInternal *hip_instance,
+                         const bool prefer_shmem) {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // for user defined, we *always* honor the request
+    HIPParallelLaunch<DriverType, LaunchBounds, LaunchMechanism>(
+        driver, grid, block, shmem, hip_instance, prefer_shmem);
+  } else {
+    // we can do what we like
+    const unsigned flat_block_size = block.x * block.y * block.z;
+    if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) {
+      // we have to use the large blocksize
+      HIPParallelLaunch<
+          DriverType,
+          Kokkos::LaunchBounds<HIPTraits::ConservativeThreadsPerBlock, 1>,
+          LaunchMechanism>(driver, grid, block, shmem, hip_instance,
+                           prefer_shmem);
+    } else {
+      HIPParallelLaunch<DriverType,
+                        Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+                        LaunchMechanism>(driver, grid, block, shmem,
+                                         hip_instance, prefer_shmem);
+    }
+  }
+}
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
index 4f5271b6f6..c4292d35ec 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
@@ -84,11 +84,17 @@ namespace Impl {
 HIPLockArrays g_host_hip_lock_arrays = {nullptr, nullptr, 0};
 
 void initialize_host_hip_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::init_lock_arrays();
+
+  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
+#endif
+
   if (g_host_hip_lock_arrays.atomic != nullptr) return;
-  HIP_SAFE_CALL(hipMalloc(
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(
       &g_host_hip_lock_arrays.atomic,
       sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1)));
-  HIP_SAFE_CALL(hipMalloc(
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(
       &g_host_hip_lock_arrays.scratch,
       sizeof(std::int32_t) * (::Kokkos::Experimental::HIP::concurrency())));
 
@@ -103,10 +109,14 @@ void initialize_host_hip_lock_arrays() {
 }
 
 void finalize_host_hip_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::finalize_lock_arrays();
+#endif
+
   if (g_host_hip_lock_arrays.atomic == nullptr) return;
-  HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic));
   g_host_hip_lock_arrays.atomic = nullptr;
-  HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch));
   g_host_hip_lock_arrays.scratch = nullptr;
   g_host_hip_lock_arrays.n       = 0;
 #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
index f34f85f43b..71b104c2e4 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
@@ -51,6 +51,10 @@
 
 #include <HIP/Kokkos_HIP_Error.hpp>
 
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics/Lock_Array_HIP.hpp>
+#endif
+
 namespace Kokkos {
 namespace Impl {
 
@@ -147,7 +151,7 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 #define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                 \
   {                                                             \
     if (::Kokkos::Impl::lock_array_copied == 0) {               \
-      HIP_SAFE_CALL(hipMemcpyToSymbol(                          \
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbol(              \
           HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays), \
           &::Kokkos::Impl::g_host_hip_lock_arrays,              \
           sizeof(::Kokkos::Impl::HIPLockArrays)));              \
@@ -155,6 +159,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
     ::Kokkos::Impl::lock_array_copied = 1;                      \
   }
 
+#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+
 #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
 #else
@@ -162,6 +168,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
   KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()
 #endif
 
+#else
+
+#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+#else
+// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
+#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()         \
+  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+#endif
+
+#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
+
 #endif /* defined( __HIPCC__ ) */
 
 #endif /* #ifndef KOKKOS_HIP_LOCKS_HPP */
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
index ce1aff9586..acb538e1cb 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
@@ -28,7 +28,8 @@ inline TileSizeProperties get_tile_size_properties<Kokkos::Experimental::HIP>(
       space.impl_internal_space_instance()->m_maxThreadsPerSM;
   properties.default_largest_tile_size = 16;
   properties.default_tile_size         = 4;
-  properties.max_total_tile_size       = 1024;
+  properties.max_total_tile_size =
+      Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
   return properties;
 }
 
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
index 35e7d6fb85..eae323dd91 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -81,6 +81,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   inline void execute() const {
+    using ClosureType =
+        ParallelFor<FunctorType, Policy, Kokkos::Experimental::HIP>;
     if (m_policy.m_num_tiles == 0) return;
     array_index_type const maxblocks = static_cast<array_index_type>(
         m_policy.space().impl_internal_space_instance()->m_maxBlock);
@@ -94,7 +96,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                        block.y,
                    maxblocks),
           1);
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 3) {
@@ -110,7 +113,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           std::min((m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) /
                        block.z,
                    maxblocks));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 4) {
@@ -128,7 +132,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           std::min((m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) /
                        block.z,
                    maxblocks));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 5) {
@@ -147,7 +152,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           std::min((m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) /
                        block.z,
                    maxblocks));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 6) {
@@ -165,7 +171,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                       std::min(static_cast<index_type>(m_policy.m_tile_end[4] *
                                                        m_policy.m_tile_end[5]),
                                static_cast<index_type>(maxblocks)));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else {
@@ -178,22 +185,18 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
       : m_functor(arg_functor), m_policy(arg_policy) {}
 
   template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy& pol, const Functor&) {
+  static int max_tile_size_product(const Policy&, const Functor&) {
     using closure_type =
         ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                     Kokkos::Experimental::HIP>;
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    auto const& prop = pol.space().hip_device_prop();
-    // Limits due to registers/SM, MDRange doesn't have
-    // shared memory constraints
-    int const regs_per_sm        = prop.regsPerMultiprocessor;
-    int const regs_per_thread    = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(
-            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
+    unsigned block_size =
+        Kokkos::Experimental::Impl::hip_get_max_blocksize<closure_type,
+                                                          LaunchBounds>();
+    if (block_size == 0)
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid "
+                      "tile size."));
+    return block_size;
   }
 };
 
@@ -242,6 +245,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   const bool m_result_ptr_device_accessible;
   size_type* m_scratch_space;
   size_type* m_scratch_flags;
+  // Only let one Parallel/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_shared_memory_lock;
 
   using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
       Policy::rank, Policy, FunctorType, WorkTag, reference_type>;
@@ -307,32 +313,30 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   // Determine block size constrained by shared memory:
   // This is copy/paste from Kokkos_HIP_Parallel_Range
   inline unsigned local_block_size(const FunctorType& f) {
-    unsigned int n =
-        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
-    int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
-        false, FunctorType, WorkTag>(f, n);
-    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    while (
-        (n &&
-         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-          shmem_size)) ||
-        (n >
-         static_cast<unsigned>(
-             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
-                                                                  LaunchBounds>(
-                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
-                 shmem_size, 0)))) {
-      n >>= 1;
-      shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
-          false, FunctorType, WorkTag>(f, n);
+    const auto& instance = m_policy.space().impl_internal_space_instance();
+    auto shmem_functor   = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      WorkTag>(f, n);
+    };
+    using closure_type = ParallelReduce<FunctorType, Policy, ReducerType,
+                                        Kokkos::Experimental::HIP>;
+
+    unsigned block_size =
+        Kokkos::Experimental::Impl::hip_get_preferred_blocksize<closure_type,
+                                                                LaunchBounds>(
+            instance, shmem_functor);
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                      "valid tile size."));
     }
-    return n;
+    return block_size;
   }
 
   inline void execute() {
-    const int nwork = m_policy.m_num_tiles;
+    using ClosureType = ParallelReduce<FunctorType, Policy, ReducerType,
+                                       Kokkos::Experimental::HIP>;
+    const int nwork   = m_policy.m_num_tiles;
     if (nwork) {
       int block_size = m_policy.m_prod_tile_dims;
       // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
@@ -366,14 +370,16 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
               false, FunctorType, WorkTag>(m_functor, block.y);
 
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
-                                                    LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<MDRangePolicy,HIP>: fence because "
+            "reduction can't access result storage location");
 
         if (m_result_ptr) {
           const int size = ValueTraits::value_size(
@@ -403,7 +409,10 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
             MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
                               typename ViewType::memory_space>::accessible),
         m_scratch_space(nullptr),
-        m_scratch_flags(nullptr) {}
+        m_scratch_flags(nullptr),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 
   ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                  const ReducerType& reducer)
@@ -416,23 +425,25 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                               typename ReducerType::result_view_type::
                                   memory_space>::accessible),
         m_scratch_space(nullptr),
-        m_scratch_flags(nullptr) {}
+        m_scratch_flags(nullptr),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
+
   template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy& pol, const Functor&) {
+  static int max_tile_size_product(const Policy&, const Functor&) {
     using closure_type =
         ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                        ReducerType, Kokkos::Experimental::HIP>;
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    auto const& prop = pol.space().hip_device_prop();
-    // Limits due do registers/SM
-    int const regs_per_sm        = prop.regsPerMultiprocessor;
-    int const regs_per_thread    = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(
-            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
+    unsigned block_size =
+        Kokkos::Experimental::Impl::hip_get_max_blocksize<closure_type,
+                                                          LaunchBounds>();
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                      "valid tile size."));
+    }
+    return block_size;
   }
 };
 }  // namespace Impl
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
index 7d2825eeb4..e02ead1e99 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
@@ -108,16 +108,21 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
   inline void execute() const {
     const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();
 
+    using DriverType =
+        ParallelFor<FunctorType, Policy, Kokkos::Experimental::HIP>;
     const int block_size =
-        LaunchBounds::maxTperB
-            ? LaunchBounds::maxTperB
-            : ::Kokkos::Experimental::Impl::HIPTraits::
-                  MaxThreadsPerBlock;  // FIXME_HIP Choose block_size better
+        Kokkos::Experimental::Impl::hip_get_preferred_blocksize<DriverType,
+                                                                LaunchBounds>();
     const dim3 block(1, block_size, 1);
     const dim3 grid(
         typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1);
 
-    Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a "
+                      "valid execution configuration."));
+    }
+    Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
         *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
         false);
   }
@@ -173,15 +178,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   const bool m_result_ptr_host_accessible;
   size_type* m_scratch_space = nullptr;
   size_type* m_scratch_flags = nullptr;
+  // Only let one ParallelReduce/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_shared_memory_lock;
 
-#if HIP_VERSION < 401
-  static bool constexpr UseShflReduction =
-      ((sizeof(value_type) > 2 * sizeof(double)) &&
-       static_cast<bool>(ValueTraits::StaticValueSize));
-#else
   static bool constexpr UseShflReduction =
       static_cast<bool>(ValueTraits::StaticValueSize);
-#endif
 
  private:
   struct ShflReductionTag {};
@@ -328,30 +330,15 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
   // Determine block size constrained by shared memory:
   inline unsigned local_block_size(const FunctorType& f) {
-    unsigned int n =
-        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
-    int shmem_size =
-        hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
-            f, n);
-    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    while (
-        (n &&
-         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-          shmem_size)) ||
-        (n >
-         static_cast<unsigned int>(
-             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
-                                                                  LaunchBounds>(
-                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
-                 shmem_size, 0)))) {
-      n >>= 1;
-      shmem_size =
-          hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
-              f, n);
-    }
-    return n;
+    const auto& instance = m_policy.space().impl_internal_space_instance();
+    auto shmem_functor   = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      WorkTag>(f, n);
+    };
+    using DriverType = ParallelReduce<FunctorType, Policy, ReducerType,
+                                      Kokkos::Experimental::HIP>;
+    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
+        DriverType, LaunchBounds>(instance, shmem_functor);
   }
 
   inline void execute() {
@@ -362,7 +349,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                                  !std::is_same<ReducerType, InvalidType>::value;
     if ((nwork > 0) || need_device_set) {
       const int block_size = local_block_size(m_functor);
-      KOKKOS_ASSERT(block_size > 0);
+      if (block_size == 0) {
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                        "valid execution configuration."));
+      }
 
       m_scratch_space =
           ::Kokkos::Experimental::Impl::hip_internal_scratch_space(
@@ -391,14 +382,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                                                          WorkTag>(m_functor,
                                                                   block.y);
 
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
-                                                    LaunchBounds>(
+      using DriverType = ParallelReduce<FunctorType, Policy, ReducerType,
+                                        Kokkos::Experimental::HIP>;
+      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().impl_internal_space_instance()->fence();
+        m_policy.space().impl_internal_space_instance()->fence(
+            "Kokkos::Impl::ParallelReduce<RangePolicy,HIP>: fence because "
+            "reduction can't access result storage location");
 
         if (m_result_ptr) {
           const int size = ValueTraits::value_size(
@@ -429,7 +423,10 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                               typename ViewType::memory_space>::accessible),
         m_result_ptr_host_accessible(
             MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ViewType::memory_space>::accessible) {}
+                              typename ViewType::memory_space>::accessible),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 
   ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                  const ReducerType& reducer)
@@ -444,7 +441,10 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         m_result_ptr_host_accessible(
             MemorySpaceAccess<Kokkos::HostSpace,
                               typename ReducerType::result_view_type::
-                                  memory_space>::accessible) {}
+                                  memory_space>::accessible),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 };
 
 template <class FunctorType, class... Traits>
@@ -482,6 +482,9 @@ class ParallelScanHIPBase {
   size_type* m_scratch_flags = nullptr;
   size_type m_final          = false;
   int m_grid_x               = 0;
+  // Only let one ParallelReduce/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_shared_memory_lock;
 
  private:
   template <class TagType>
@@ -624,22 +627,7 @@ class ParallelScanHIPBase {
   }
 
   // Determine block size constrained by shared memory:
-  inline unsigned local_block_size(const FunctorType& f) {
-    // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or
-    // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y
-    //
-    // TODO check best option
-
-    unsigned n = Experimental::Impl::HIPTraits::WarpSize * 4;
-    while (n && static_cast<unsigned>(m_policy.space()
-                                          .impl_internal_space_instance()
-                                          ->m_maxShmemPerBlock) <
-                    hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                             WorkTag>(f, n)) {
-      n >>= 1;
-    }
-    return n;
-  }
+  virtual inline unsigned local_block_size(const FunctorType& f) = 0;
 
   inline void impl_execute() {
     const index_type nwork = m_policy.end() - m_policy.begin();
@@ -649,7 +637,11 @@ class ParallelScanHIPBase {
       const int gridMaxComputeCapability_2x = 0x01fff;
 
       const int block_size = static_cast<int>(local_block_size(m_functor));
-      KOKKOS_ASSERT(block_size > 0);
+      if (block_size == 0) {
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Kokkos::Impl::ParallelScan< HIP > could not find a "
+                        "valid execution configuration."));
+      }
 
       const int grid_max =
           std::min(block_size * block_size, gridMaxComputeCapability_2x);
@@ -674,15 +666,16 @@ class ParallelScanHIPBase {
       const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2);
 
       m_final = false;
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelScanHIPBase,
-                                                    LaunchBounds>(
+      // these ones are OK to be just the base because the specializations
+      // do not modify the kernel at all
+      using DriverType = ParallelScanHIPBase<FunctorType, Traits...>;
+      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       m_final = true;
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelScanHIPBase,
-                                                    LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
@@ -690,13 +683,17 @@ class ParallelScanHIPBase {
   }
 
   ParallelScanHIPBase(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 };
 
 template <class FunctorType, class... Traits>
 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                    Kokkos::Experimental::HIP>
-    : private ParallelScanHIPBase<FunctorType, Traits...> {
+    : public ParallelScanHIPBase<FunctorType, Traits...> {
  public:
   using Base = ParallelScanHIPBase<FunctorType, Traits...>;
   using Base::operator();
@@ -706,6 +703,23 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   ParallelScan(const FunctorType& arg_functor,
                const typename Base::Policy& arg_policy)
       : Base(arg_functor, arg_policy) {}
+
+  inline unsigned local_block_size(const FunctorType& f) {
+    // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or
+    // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y
+
+    const auto& instance =
+        Base::m_policy.space().impl_internal_space_instance();
+    auto shmem_functor = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      typename Base::WorkTag>(
+          f, n);
+    };
+    using DriverType = ParallelScan<FunctorType, typename Base::Policy,
+                                    Kokkos::Experimental::HIP>;
+    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
+        DriverType, typename Base::LaunchBounds>(instance, shmem_functor);
+  }
 };
 
 //----------------------------------------------------------------------------
@@ -713,7 +727,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 template <class FunctorType, class ReturnType, class... Traits>
 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                             ReturnType, Kokkos::Experimental::HIP>
-    : private ParallelScanHIPBase<FunctorType, Traits...> {
+    : public ParallelScanHIPBase<FunctorType, Traits...> {
  public:
   using Base = ParallelScanHIPBase<FunctorType, Traits...>;
   using Base::operator();
@@ -737,6 +751,24 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                         const typename Base::Policy& arg_policy,
                         ReturnType& arg_returnvalue)
       : Base(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {}
+
+  inline unsigned local_block_size(const FunctorType& f) {
+    // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or
+    // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y
+
+    const auto& instance =
+        Base::m_policy.space().impl_internal_space_instance();
+    auto shmem_functor = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      typename Base::WorkTag>(
+          f, n);
+    };
+    using DriverType =
+        ParallelScanWithTotal<FunctorType, typename Base::Policy, ReturnType,
+                              Kokkos::Experimental::HIP>;
+    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
+        DriverType, typename Base::LaunchBounds>(instance, shmem_functor);
+  }
 };
 
 }  // namespace Impl
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
index 96c3ff2a75..b794f5bc03 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
@@ -56,20 +56,20 @@
 
 namespace Kokkos {
 namespace Impl {
+
 template <typename... Properties>
 class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
     : public PolicyTraits<Properties...> {
  public:
   using execution_policy = TeamPolicyInternal;
 
-  using traits = PolicyTraits<Properties...>;
+  using traits    = PolicyTraits<Properties...>;
+  using BlockType = Kokkos::Experimental::Impl::BlockType;
 
   template <typename ExecSpace, typename... OtherProperties>
   friend class TeamPolicyInternal;
 
  private:
-  static int constexpr MAX_WARP = 8;
-
   typename traits::execution_space m_space;
   int m_league_size;
   int m_team_size;
@@ -101,17 +101,9 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   template <typename FunctorType>
   int team_size_max(FunctorType const& f, ParallelForTag const&) const {
     using closure_type =
-        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type,
-        typename traits::launch_bounds>::get_hip_func_attributes();
-    int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size<
-        FunctorType, typename traits::launch_bounds>(
-        space().impl_internal_space_instance(), attr, f,
-        static_cast<size_t>(impl_vector_length()),
-        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
-        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double));
-    return block_size / impl_vector_length();
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
+
+    return internal_team_size_common<BlockType::Max, closure_type>(f);
   }
 
   template <class FunctorType>
@@ -129,8 +121,8 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
     return internal_team_size_max<closure_type>(f);
   }
 
-  template <class FunctorType, class ReducerType>
-  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
+  template <typename FunctorType, typename ReducerType>
+  inline int team_size_max(const FunctorType& f, const ReducerType&,
                            const ParallelReduceTag&) const {
     using closure_type =
         Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
@@ -141,17 +133,9 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   template <typename FunctorType>
   int team_size_recommended(FunctorType const& f, ParallelForTag const&) const {
     using closure_type =
-        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type,
-        typename traits::launch_bounds>::get_hip_func_attributes();
-    int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
-        FunctorType, typename traits::launch_bounds>(
-        space().impl_internal_space_instance(), attr, f,
-        static_cast<size_t>(impl_vector_length()),
-        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
-        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double));
-    return block_size / impl_vector_length();
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
+
+    return internal_team_size_common<BlockType::Preferred, closure_type>(f);
   }
 
   template <typename FunctorType>
@@ -169,7 +153,7 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
     return internal_team_size_recommended<closure_type>(f);
   }
 
-  template <class FunctorType, class ReducerType>
+  template <typename FunctorType, typename ReducerType>
   int team_size_recommended(FunctorType const& f, ReducerType const&,
                             ParallelReduceTag const&) const {
     using closure_type =
@@ -177,6 +161,7 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
                              ReducerType>;
     return internal_team_size_recommended<closure_type>(f);
   }
+
   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
   inline bool impl_auto_team_size() const { return m_tune_team_size; }
   static int vector_length_max() {
@@ -211,7 +196,10 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
   inline void impl_set_team_size(size_t size) { m_team_size = size; }
   int impl_vector_length() const { return m_vector_length; }
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }
+#endif
 
   int team_size() const { return m_team_size; }
 
@@ -266,7 +254,8 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
           "space.");
 
     // Make sure total block size is permissible
-    if (m_team_size * m_vector_length > 1024) {
+    if (m_team_size * m_vector_length >
+        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock) {
       Impl::throw_runtime_exception(
           std::string("Kokkos::TeamPolicy< HIP > the team size is too large. "
                       "Team size x vector length must be smaller than 1024."));
@@ -363,26 +352,84 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   using member_type = Kokkos::Impl::HIPTeamMember;
 
  protected:
-  template <class ClosureType, class FunctorType, class BlockSizeCallable>
-  int internal_team_size_common(const FunctorType& f,
-                                BlockSizeCallable&& block_size_callable) const {
-    using closure_type = ClosureType;
+  template <BlockType BlockSize, class ClosureType, class FunctorType>
+  int internal_team_size_common(const FunctorType& f) const {
+    // FIXME_HIP: this could be unified with the
+    // internal_team_size_common_reduce
+    //            once we can turn c++17 constexpr on by default.
+    //            The problem right now is that we can't turn off the evaluation
+    //            of the functor_value_traits's valuesize / StaticValueSize
+
+    const unsigned shmem_block  = team_scratch_size(0) + 2 * sizeof(double);
+    const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double);
+    const int vector_length     = impl_vector_length();
+
+    const auto functor = [&f, shmem_block, shmem_thread, vector_length](
+                             const hipFuncAttributes& attr, int block_size) {
+      int functor_shmem =
+          ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
+              f, block_size / vector_length);
+      return shmem_block + shmem_thread * (block_size / vector_length) +
+             functor_shmem + attr.sharedSizeBytes;
+    };
+    int block_size;
+    // FIXME_HIP - could be if constexpr for c++17
+    if (BlockSize == BlockType::Max) {
+      block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize<
+          ClosureType, typename traits::launch_bounds>(
+          space().impl_internal_space_instance(), functor);
+    } else {
+      block_size =
+          ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize<
+              ClosureType, typename traits::launch_bounds>(
+              space().impl_internal_space_instance(), functor);
+    }
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid "
+                      "team size."));
+    }
+    return block_size / impl_vector_length();
+  }
+
+  template <BlockType BlockSize, class ClosureType, class FunctorType>
+  int internal_team_size_common_reduce(const FunctorType& f) const {
     using functor_value_traits =
         Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>;
 
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type,
-        typename traits::launch_bounds>::get_hip_func_attributes();
-    const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
-        space().impl_internal_space_instance(), attr, f,
-        static_cast<size_t>(impl_vector_length()),
-        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
-        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double) +
-            ((functor_value_traits::StaticValueSize != 0)
-                 ? 0
-                 : functor_value_traits::value_size(f)));
-    KOKKOS_ASSERT(block_size > 0);
+    const unsigned shmem_block  = team_scratch_size(0) + 2 * sizeof(double);
+    const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double) +
+                                  ((functor_value_traits::StaticValueSize != 0)
+                                       ? 0
+                                       : functor_value_traits::value_size(f));
+    const int vector_length = impl_vector_length();
 
+    const auto functor = [&f, shmem_block, shmem_thread, vector_length](
+                             const hipFuncAttributes& attr, int block_size) {
+      int functor_shmem =
+          ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
+              f, block_size / vector_length);
+      return shmem_block + shmem_thread * (block_size / vector_length) +
+             functor_shmem + attr.sharedSizeBytes;
+    };
+    int block_size;
+    // FIXME_HIP - could be if constexpr for c++17
+    if (BlockSize == BlockType::Max) {
+      block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize<
+          ClosureType, typename traits::launch_bounds>(
+          space().impl_internal_space_instance(), functor);
+    } else {
+      block_size =
+          ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize<
+              ClosureType, typename traits::launch_bounds>(
+              space().impl_internal_space_instance(), functor);
+    }
+
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                      "valid team size."));
+    }
     // Currently we require Power-of-2 team size for reductions.
     int p2 = 1;
     while (p2 <= block_size) p2 *= 2;
@@ -392,16 +439,13 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
 
   template <class ClosureType, class FunctorType>
   int internal_team_size_max(const FunctorType& f) const {
-    return internal_team_size_common<ClosureType>(
-        f, ::Kokkos::Experimental::Impl::hip_get_max_block_size<
-               FunctorType, typename traits::launch_bounds>);
+    return internal_team_size_common_reduce<BlockType::Max, ClosureType>(f);
   }
 
   template <class ClosureType, class FunctorType>
   int internal_team_size_recommended(const FunctorType& f) const {
-    return internal_team_size_common<ClosureType>(
-        f, ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
-               FunctorType, typename traits::launch_bounds>);
+    return internal_team_size_common_reduce<BlockType::Preferred, ClosureType>(
+        f);
   }
 };
 
@@ -505,7 +549,11 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     dim3 const block(static_cast<int>(m_vector_size),
                      static_cast<int>(m_team_size), 1);
 
-    ::Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, launch_bounds>(
+    using closure_type =
+        ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                    Kokkos::Experimental::HIP>;
+    ::Kokkos::Experimental::Impl::hip_parallel_launch<closure_type,
+                                                      launch_bounds>(
         *this, grid, block, shmem_size_total,
         m_policy.space().impl_internal_space_instance(),
         true);  // copy to device and execute
@@ -520,17 +568,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_lock(m_policy.space()
                            .impl_internal_space_instance()
                            ->m_team_scratch_mutex) {
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        ParallelFor, launch_bounds>::get_hip_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
-                  FunctorType, launch_bounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
+    m_team_size = m_team_size >= 0 ? m_team_size
+                                   : arg_policy.team_size_recommended(
+                                         arg_functor, ParallelForTag());
 
     m_shmem_begin = (sizeof(double) * (m_team_size + 2));
     m_shmem_size =
@@ -556,23 +596,12 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     int const shmem_size_total = m_shmem_begin + m_shmem_size;
     if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
         shmem_size_total) {
-      printf(
-          "%i %i\n",
-          m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
-          shmem_size_total);
       Kokkos::Impl::throw_runtime_exception(std::string(
           "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory"));
     }
 
-    if (static_cast<int>(m_team_size) >
-        static_cast<int>(
-            ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
-                                                                 launch_bounds>(
-                m_policy.space().impl_internal_space_instance(), attr,
-                arg_functor, arg_policy.impl_vector_length(),
-                arg_policy.team_scratch_size(0),
-                arg_policy.thread_scratch_size(0)) /
-            arg_policy.impl_vector_length())) {
+    size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag());
+    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
       Kokkos::Impl::throw_runtime_exception(std::string(
           "Kokkos::Impl::ParallelFor< HIP > requested too large team size."));
     }
@@ -839,8 +868,11 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       }
       const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
 
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
-                                                    launch_bounds>(
+      using closure_type =
+          ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                         ReducerType, Kokkos::Experimental::HIP>;
+      Kokkos::Experimental::Impl::hip_parallel_launch<closure_type,
+                                                      launch_bounds>(
           *this, grid, block, shmem_size_total,
           m_policy.space().impl_internal_space_instance(),
           true);  // copy to device and execute
@@ -890,17 +922,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_lock(m_policy.space()
                            .impl_internal_space_instance()
                            ->m_team_scratch_mutex) {
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        ParallelReduce, launch_bounds>::get_hip_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
-                                                                 launch_bounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
+    m_team_size = m_team_size >= 0 ? m_team_size
+                                   : arg_policy.team_size_recommended(
+                                         arg_functor, ParallelReduceTag());
 
     m_team_begin =
         UseShflReduction
@@ -958,8 +982,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                       "L0 scratch memory"));
     }
 
-    if (static_cast<int>(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
+    size_t max_size =
+        arg_policy.team_size_max(arg_functor, ParallelReduceTag());
+    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
                       "large team size."));
@@ -992,18 +1017,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_lock(m_policy.space()
                            .impl_internal_space_instance()
                            ->m_team_scratch_mutex) {
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        ParallelReduce, launch_bounds>::get_hip_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
-                                                                 launch_bounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
-
+    m_team_size = m_team_size >= 0
+                      ? m_team_size
+                      : arg_policy.team_size_recommended(arg_functor, reducer,
+                                                         ParallelReduceTag());
     m_team_begin =
         UseShflReduction
             ? 0
@@ -1046,7 +1063,6 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // upon team size.
 
     const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
-
     if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
          !UseShflReduction) ||
         m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
@@ -1054,8 +1070,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size"));
     }
-    if (static_cast<int>(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
+
+    size_t max_size =
+        arg_policy.team_size_max(arg_functor, reducer, ParallelReduceTag());
+    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
                       "large team size."));
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
index 15ca089d14..e25ebe2ab3 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
@@ -67,102 +67,32 @@ namespace {
 hipStream_t get_deep_copy_stream() {
   static hipStream_t s = nullptr;
   if (s == nullptr) {
-    HIP_SAFE_CALL(hipStreamCreate(&s));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&s));
   }
   return s;
 }
 }  // namespace
 
-DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
+void DeepCopyHIP(void* dst, void const* src, size_t n) {
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
 }
 
-DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
-                                                  instance,
-                                              void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIP>::
-    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
-             const void* src, size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, Kokkos::Experimental::HIP>::
-    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
-             const void* src, size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIPHostPinnedSpace, Kokkos::Experimental::HIP>::
-    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
-             const void* src, size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
-                                                  instance,
-                                              void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
-                                                  instance,
-                                              void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(
+void DeepCopyAsyncHIP(const Kokkos::Experimental::HIP& instance, void* dst,
+                      void const* src, size_t n) {
+  KOKKOS_IMPL_HIP_SAFE_CALL(
       hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
 }
 
 void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) {
   hipStream_t s = get_deep_copy_stream();
-  HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s));
-  HIP_SAFE_CALL(hipStreamSynchronize(s));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s));
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      "Kokkos::Impl::DeepCopyAsyncHIP: Post Deep Copy Fence on Deep-Copy "
+      "stream",
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          DeepCopyResourceSynchronization,
+      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(s)); });
 }
 
 }  // namespace Impl
@@ -171,6 +101,7 @@ void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) {
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 namespace Kokkos {
 
 KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() {
@@ -188,6 +119,7 @@ KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) {
 }
 
 }  // namespace Kokkos
+#endif
 
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
@@ -283,7 +215,7 @@ void HIPSpace::impl_deallocate(
     Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                       reported_size);
   }
-  HIP_SAFE_CALL(hipFree(arg_alloc_ptr));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr));
 }
 
 void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr,
@@ -307,7 +239,7 @@ void HIPHostPinnedSpace::impl_deallocate(
     Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                       reported_size);
   }
-  HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr));
 }
 
 }  // namespace Experimental
@@ -427,23 +359,42 @@ HIP::HIP()
       "HIP instance constructor");
 }
 
-HIP::HIP(hipStream_t const stream)
+HIP::HIP(hipStream_t const stream, bool manage_stream)
     : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) {
         ptr->finalize();
         delete ptr;
       }) {
   Impl::HIPInternal::singleton().verify_is_initialized(
       "HIP instance constructor");
-  m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream);
+  m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream,
+                               manage_stream);
 }
 
 void HIP::print_configuration(std::ostream& s, const bool) {
   Impl::HIPInternal::singleton().print_configuration(s);
 }
 
-void HIP::impl_static_fence() { HIP_SAFE_CALL(hipDeviceSynchronize()); }
+uint32_t HIP::impl_instance_id() const noexcept {
+  return m_space_instance->impl_get_instance_id();
+}
+void HIP::impl_static_fence(const std::string& name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); });
+}
+void HIP::impl_static_fence() {
+  impl_static_fence("Kokkos::HIP::impl_static_fence: Unnamed Static Fence");
+}
 
-void HIP::fence() const { m_space_instance->fence(); }
+void HIP::fence(const std::string& name) const {
+  m_space_instance->fence(name);
+}
+void HIP::fence() const {
+  fence("Kokkos::HIP::fence(): Unnamed Instance Fence");
+}
 
 hipStream_t HIP::hip_stream() const { return m_space_instance->m_stream; }
 
@@ -489,6 +440,9 @@ void HIPSpaceInitializer::finalize(const bool all_spaces) {
 void HIPSpaceInitializer::fence() {
   Kokkos::Experimental::HIP::impl_static_fence();
 }
+void HIPSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Experimental::HIP::impl_static_fence(name);
+}
 
 void HIPSpaceInitializer::print_configuration(std::ostream& msg,
                                               const bool detail) {
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
index fe52886ced..fb67a25c5e 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
@@ -316,198 +316,6 @@ class HIPTeamMember {
 #endif
   }
 
-  //--------------------------------------------------------------------------
-  /**\brief  Global reduction across all blocks
-   *
-   *  Return !0 if reducer contains the final value
-   */
-  template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
-      global_reduce(ReducerType const& reducer, int* const global_scratch_flags,
-                    void* const global_scratch_space, void* const shmem,
-                    int const shmem_size) {
-#ifdef __HIP_DEVICE_COMPILE__
-    using value_type   = typename ReducerType::value_type;
-    using pointer_type = value_type volatile*;
-
-    // Number of shared memory entries for the reduction:
-    const int nsh = shmem_size / sizeof(value_type);
-
-    // Number of HIP threads in the block, rank within the block
-    const int nid = blockDim.x * blockDim.y * blockDim.z;
-    const int tid =
-        threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
-
-    // Reduces within block using all available shared memory
-    // Contributes if it is the root "vector lane"
-
-    // wn == number of warps in the block
-    // wx == which lane within the warp
-    // wy == which warp within the block
-
-    const int wn = (nid + Experimental::Impl::HIPTraits::WarpIndexMask) >>
-                   Experimental::Impl::HIPTraits::WarpIndexShift;
-    const int wx = tid & Experimental::Impl::HIPTraits::WarpIndexMask;
-    const int wy = tid >> Experimental::Impl::HIPTraits::WarpIndexShift;
-
-    //------------------------
-    {  // Intra warp shuffle reduction from contributing HIP threads
-
-      value_type tmp(reducer.reference());
-
-      int constexpr warp_size =
-          ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
-      for (int i = warp_size; static_cast<int>(blockDim.x) <= (i >>= 1);) {
-        Experimental::Impl::in_place_shfl_down(reducer.reference(), tmp, i,
-                                               warp_size);
-
-        // Root of each vector lane reduces "thread" contribution
-        if (0 == threadIdx.x && wx < i) {
-          reducer.join(&tmp, reducer.data());
-        }
-      }
-
-      // Reduce across warps using shared memory.
-      // Number of warps may not be power of two.
-
-      __syncthreads();  // Wait before shared data write
-
-      // Number of shared memory entries for the reduction
-      // is at most one per warp
-      const int nentry = wn < nsh ? wn : nsh;
-
-      if (0 == wx && wy < nentry) {
-        // Root thread of warp 'wy' has warp's value to contribute
-        (reinterpret_cast<value_type*>(shmem))[wy] = tmp;
-      }
-
-      __syncthreads();  // Wait for write to be visible to block
-
-      // When more warps than shared entries
-      // then warps must take turns joining their contribution
-      // to the designated shared memory entry.
-      for (int i = nentry; i < wn; i += nentry) {
-        const int k = wy - i;
-
-        if (0 == wx && i <= wy && k < nentry) {
-          // Root thread of warp 'wy' has warp's value to contribute
-          reducer.join((reinterpret_cast<value_type*>(shmem)) + k, &tmp);
-        }
-
-        __syncthreads();  // Wait for write to be visible to block
-      }
-
-      // One warp performs the inter-warp reduction:
-
-      if (0 == wy) {
-        // Start fan-in at power of two covering nentry
-
-        for (int i = (1 << (warp_size - __clz(nentry - 1))); (i >>= 1);) {
-          const int k = wx + i;
-          if (wx < i && k < nentry) {
-            reducer.join((reinterpret_cast<pointer_type>(shmem)) + wx,
-                         (reinterpret_cast<pointer_type>(shmem)) + k);
-            __threadfence_block();  // Wait for write to be visible to warp
-          }
-        }
-      }
-    }
-    //------------------------
-    {  // Write block's value to global_scratch_memory
-
-      int last_block = 0;
-
-      if (0 == wx) {
-        reducer.copy((reinterpret_cast<pointer_type>(global_scratch_space)) +
-                         blockIdx.x * reducer.length(),
-                     reducer.data());
-
-        __threadfence();  // Wait until global write is visible.
-
-        last_block = static_cast<int>(gridDim.x) ==
-                     1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1);
-
-        // If last block then reset count
-        if (last_block) *global_scratch_flags = 0;
-      }
-
-      // FIXME hip does not support __syncthreads_or so we need to do it by hand
-      // last_block = __syncthreads_or(last_block);
-
-      __shared__ int last_block_shared;
-      if (last_block) last_block_shared = last_block;
-      __threadfence_block();
-
-      if (!last_block_shared) return 0;
-    }
-    //------------------------
-    // Last block reads global_scratch_memory into shared memory.
-
-    const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh)
-                                       : (gridDim.x < nsh ? gridDim.x : nsh);
-
-    // nentry = min( nid , nsh , gridDim.x )
-
-    // whole block reads global memory into shared memory:
-
-    if (tid < nentry) {
-      const int offset = tid * reducer.length();
-
-      reducer.copy(
-          (reinterpret_cast<pointer_type>(shmem)) + offset,
-          (reinterpret_cast<pointer_type>(global_scratch_space)) + offset);
-
-      for (int i = nentry + tid; i < static_cast<int>(gridDim.x); i += nentry) {
-        reducer.join((reinterpret_cast<pointer_type>(shmem)) + offset,
-                     (reinterpret_cast<pointer_type>(global_scratch_space)) +
-                         i * reducer.length());
-      }
-    }
-
-    __syncthreads();  // Wait for writes to be visible to block
-
-    if (0 == wy) {
-      // Iterate to reduce shared memory to single warp fan-in size
-
-      int constexpr warp_size =
-          ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
-      const int nreduce = warp_size < nentry ? warp_size : nentry;
-
-      if (wx < nreduce && nreduce < nentry) {
-        for (int i = nreduce + wx; i < nentry; i += nreduce) {
-          reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i);
-        }
-        __threadfence_block();  // Wait for writes to be visible to warp
-      }
-
-      // Start fan-in at power of two covering nentry
-
-      for (int i = (1 << (warp_size - __clz(nreduce - 1))); (i >>= 1);) {
-        const int k = wx + i;
-        if (wx < i && k < nreduce) {
-          reducer.join((reinterpret_cast<pointer_type>(shmem)) + wx,
-                       (reinterpret_cast<pointer_type>(shmem)) + k);
-          __threadfence_block();  // Wait for writes to be visible to warp
-        }
-      }
-
-      if (0 == wx) {
-        reducer.copy(reducer.data(), reinterpret_cast<pointer_type>(shmem));
-        return 1;
-      }
-    }
-    return 0;
-#else
-    (void)reducer;
-    (void)global_scratch_flags;
-    (void)global_scratch_space;
-    (void)shmem;
-    (void)shmem_size;
-    return 0;
-#endif
-  }
-
   //----------------------------------------
   // Private for the driver
 
diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp
index 910d5e52e6..d9cb66e11f 100644
--- a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp
+++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp
@@ -191,6 +191,9 @@ void HPXSpaceInitializer::finalize(const bool all_spaces) {
 }
 
 void HPXSpaceInitializer::fence() { Kokkos::Experimental::HPX().fence(); }
+void HPXSpaceInitializer::fence(const std::string &name) {
+  Kokkos::Experimental::HPX().fence(name);
+}
 
 void HPXSpaceInitializer::print_configuration(std::ostream &msg,
                                               const bool detail) {
diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
index df09e026fd..7bb3ca5d00 100644
--- a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
+++ b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
@@ -82,7 +82,9 @@ class TaskQueueSpecialization<
     task_queue.scheduler = &scheduler;
     Kokkos::Impl::dispatch_execute_task(&task_queue,
                                         Kokkos::Experimental::HPX());
-    Kokkos::Experimental::HPX().fence();
+    Kokkos::Experimental::HPX().fence(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTask>::execute: fence "
+        "after task execution");
   }
 
   // Must provide task queue execution function
@@ -214,7 +216,7 @@ class TaskQueueSpecializationConstrained<
     task_queue.scheduler = &scheduler;
     Kokkos::Impl::dispatch_execute_task(&task_queue,
                                         Kokkos::Experimental::HPX());
-    Kokkos::Experimental::HPX().fence();
+    Kokkos::Experimental::HPX().fence()"Kokkos::Impl::TaskQueueSpecializationConstrained::execute: fence after task execution";
   }
 
   // Must provide task queue execution function
diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
index 527fe12ad9..d7e13e28f0 100644
--- a/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
@@ -79,7 +79,9 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
  public:
   void execute() const {
     dispatch_execute_task(this, m_policy.space());
-    m_policy.space().fence();
+    m_policy.space().fence(
+        "Kokkos::Experimental::Impl::HPX::ParallelFor<WorkGraphPolicy>: fence "
+        "after kernel execution");
   }
 
   void execute_task() const {
diff --git a/lib/kokkos/core/src/KokkosExp_InterOp.hpp b/lib/kokkos/core/src/KokkosExp_InterOp.hpp
new file mode 100644
index 0000000000..37c2088f88
--- /dev/null
+++ b/lib/kokkos/core/src/KokkosExp_InterOp.hpp
@@ -0,0 +1,147 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_EXP_INTEROP_HPP
+#define KOKKOS_CORE_EXP_INTEROP_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <Kokkos_View.hpp>
+#include <impl/Kokkos_Utilities.hpp>
+#include <type_traits>
+
+namespace Kokkos {
+namespace Impl {
+
+// ------------------------------------------------------------------ //
+//  this is used to convert
+//      Kokkos::Device<ExecSpace, MemSpace> to MemSpace
+//
+template <typename Tp>
+struct device_memory_space {
+  using type = Tp;
+};
+
+template <typename ExecT, typename MemT>
+struct device_memory_space<Kokkos::Device<ExecT, MemT>> {
+  using type = MemT;
+};
+
+template <typename Tp>
+using device_memory_space_t = typename device_memory_space<Tp>::type;
+
+// ------------------------------------------------------------------ //
+//  this is the impl version which takes a view and converts to python
+//  view type
+//
+template <typename, typename...>
+struct python_view_type_impl;
+
+template <template <typename...> class ViewT, typename ValueT,
+          typename... Types>
+struct python_view_type_impl<ViewT<ValueT>, type_list<Types...>> {
+  using type = ViewT<ValueT, device_memory_space_t<Types>...>;
+};
+
+template <template <typename...> class ViewT, typename ValueT,
+          typename... Types>
+struct python_view_type_impl<ViewT<ValueT, Types...>>
+    : python_view_type_impl<ViewT<ValueT>,
+                            filter_type_list_t<is_default_memory_trait,
+                                               type_list<Types...>, false>> {};
+
+template <typename... T>
+using python_view_type_impl_t = typename python_view_type_impl<T...>::type;
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+namespace Kokkos {
+
+template <typename DataType, class... Properties>
+class DynRankView;
+
+namespace Impl {
+
+// Duplicate from the header file for DynRankView to avoid core depending on
+// containers.
+template <class>
+struct is_dyn_rank_view_dup : public std::false_type {};
+
+template <class D, class... P>
+struct is_dyn_rank_view_dup<Kokkos::DynRankView<D, P...>>
+    : public std::true_type {};
+
+}  // namespace Impl
+
+namespace Experimental {
+
+// ------------------------------------------------------------------ //
+//  this is used to extract the uniform type of a view
+//
+template <typename ViewT>
+struct python_view_type {
+  static_assert(
+      Kokkos::is_view<std::decay_t<ViewT>>::value ||
+          Kokkos::Impl::is_dyn_rank_view_dup<std::decay_t<ViewT>>::value,
+      "Error! python_view_type only supports Kokkos::View and "
+      "Kokkos::DynRankView");
+
+  using type =
+      Kokkos::Impl::python_view_type_impl_t<typename ViewT::array_type>;
+};
+
+template <typename ViewT>
+using python_view_type_t = typename python_view_type<ViewT>::type;
+
+template <typename Tp>
+auto as_python_type(Tp&& _v) {
+  using cast_type = python_view_type_t<Tp>;
+  return static_cast<cast_type>(std::forward<Tp>(_v));
+}
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
index b7d8e62f69..dfae7451fc 100644
--- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
+++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -48,6 +48,7 @@
 #include <initializer_list>
 
 #include <Kokkos_Layout.hpp>
+#include <Kokkos_Rank.hpp>
 #include <Kokkos_Array.hpp>
 #include <impl/KokkosExp_Host_IterateTile.hpp>
 #include <Kokkos_ExecPolicy.hpp>
@@ -78,22 +79,6 @@ struct default_inner_direction {
   static constexpr Iterate value = Iterate::Right;
 };
 
-// Iteration Pattern
-template <unsigned N, Iterate OuterDir = Iterate::Default,
-          Iterate InnerDir = Iterate::Default>
-struct Rank {
-  static_assert(N != 0u, "Kokkos Error: rank 0 undefined");
-  static_assert(N != 1u,
-                "Kokkos Error: rank 1 is not a multi-dimensional range");
-  static_assert(N < 7u, "Kokkos Error: Unsupported rank...");
-
-  using iteration_pattern = Rank<N, OuterDir, InnerDir>;
-
-  static constexpr int rank                = N;
-  static constexpr Iterate outer_direction = OuterDir;
-  static constexpr Iterate inner_direction = InnerDir;
-};
-
 namespace Impl {
 // NOTE the comparison below is encapsulated to silent warnings about pointless
 // comparison of unsigned integer with zero
@@ -397,13 +382,18 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 // For backward compatibility
 namespace Kokkos {
 namespace Experimental {
-using Kokkos::Iterate;
-using Kokkos::MDRangePolicy;
-using Kokkos::Rank;
+using Iterate KOKKOS_DEPRECATED = Kokkos::Iterate;
+template <typename... Properties>
+using MDRangePolicy KOKKOS_DEPRECATED = Kokkos::MDRangePolicy<Properties...>;
+template <unsigned N, Kokkos::Iterate OuterDir = Kokkos::Iterate::Default,
+          Kokkos::Iterate InnerDir = Kokkos::Iterate::Default>
+using Rank KOKKOS_DEPRECATED = Kokkos::Rank<N, OuterDir, InnerDir>;
 }  // namespace Experimental
 }  // namespace Kokkos
+#endif
 
 #endif  // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp
index 8cd60fa6ba..a47208e977 100644
--- a/lib/kokkos/core/src/Kokkos_Atomic.hpp
+++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp
@@ -69,6 +69,60 @@
 #define KOKKOS_ATOMIC_HPP
 
 #include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+#define DESUL_HAVE_OPENMP_ATOMICS
+#endif
+#include <Kokkos_Atomics_Desul_Wrapper.hpp>
+#include <Kokkos_Atomics_Desul_Volatile_Wrapper.hpp>
+#include <impl/Kokkos_Utilities.hpp>
+
+// Helper functions for places where we really should have called SeqCst atomics
+// anyway These can go away when we call desul unconditionally Non-Desul
+// versions are below
+namespace Kokkos {
+namespace Impl {
+using desul::MemoryOrderSeqCst;
+using desul::MemoryScopeDevice;
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_dec(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return desul::atomic_dec(const_cast<T*>(dest), desul::MemoryOrderSeqCst(),
+                           desul::MemoryScopeDevice());
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_inc(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return desul::atomic_inc(const_cast<T*>(dest), desul::MemoryOrderSeqCst(),
+                           desul::MemoryScopeDevice());
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T
+desul_atomic_exchange(T* dest, const Kokkos::Impl::identity_t<T> val,
+                      MemoryOrderSeqCst, MemoryScopeDevice) {
+  return desul::atomic_exchange(const_cast<T*>(dest), val,
+                                desul::MemoryOrderSeqCst(),
+                                desul::MemoryScopeDevice());
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange(
+    T* dest, Kokkos::Impl::identity_t<const T> compare,
+    Kokkos::Impl::identity_t<const T> val, MemoryOrderSeqCst,
+    MemoryScopeDevice) {
+  return desul::atomic_compare_exchange(dest, compare, val,
+                                        desul::MemoryOrderSeqCst(),
+                                        desul::MemoryScopeDevice());
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+#else
+
 #include <Kokkos_HostSpace.hpp>
 #include <impl/Kokkos_Traits.hpp>
 
@@ -326,4 +380,42 @@ inline const char* atomic_query_version() {
 
 //----------------------------------------------------------------------------
 
+// Helper functions for places where we really should have called SeqCst atomics
+// anyway These can go away when we call desul unconditionally
+namespace Kokkos {
+namespace Impl {
+struct MemoryOrderSeqCst {};
+struct MemoryScopeDevice {};
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_dec(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return Kokkos::atomic_decrement(dest);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_inc(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return Kokkos::atomic_increment(dest);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T
+desul_atomic_exchange(T* dest, Kokkos::Impl::identity_t<const T> val,
+                      MemoryOrderSeqCst, MemoryScopeDevice) {
+  return Kokkos::atomic_exchange(dest, val);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange(
+    T* dest, Kokkos::Impl::identity_t<const T> compare,
+    Kokkos::Impl::identity_t<const T> val, MemoryOrderSeqCst,
+    MemoryScopeDevice) {
+  return Kokkos::atomic_compare_exchange(dest, compare, val);
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif /* !KOKKOS_ENABLE_IMPL_DESUL_ATOMICS */
 #endif /* KOKKOS_ATOMIC_HPP */
diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
new file mode 100644
index 0000000000..0bcb3ea388
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
@@ -0,0 +1,189 @@
+#ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_
+#define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics.hpp>
+
+// clang-format off
+namespace Kokkos { 
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_load(volatile T* const dest) { return desul::atomic_load(const_cast<T*>(dest), desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_store(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_store(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// atomic_fetch_op
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_add (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_USE_DOUBLE_ATOMICADD
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_add(volatile double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(const_cast<double*>(dest),val);
+  #else
+  return desul::atomic_fetch_add (const_cast<double*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+};
+
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_sub(volatile double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(const_cast<double*>(dest),-val);
+  #else
+  return desul::atomic_fetch_sub (const_cast<double*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+};
+#endif
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_sub (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_sub (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_max (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_min (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mul (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mul (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_div (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_div (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mod (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mod (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_and (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or  (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_or  (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_xor (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_xor (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_nand(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_nand(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_lshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_lshift(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_rshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_rshift(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_inc(volatile T* const dest) { return desul::atomic_fetch_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_dec(volatile T* const dest) { return desul::atomic_fetch_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op_fetch
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_add_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_sub_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mul_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_div_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mod_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mod_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_and_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_and_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_or_fetch  (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_or_fetch  (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_xor_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_xor_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_nand_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_nand_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_lshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_lshift_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_rshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_rshift_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_inc_fetch(volatile T* const dest) { return desul::atomic_inc_fetch(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_dec_fetch(volatile T* const dest) { return desul::atomic_dec_fetch(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_add(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_sub(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_mul(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_div(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_min(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_max(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_and yet so call fetch_and
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_and(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_and (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_or yet so call fetch_or
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_or  (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_inc(volatile T* const dest) { return desul::atomic_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_dec(volatile T* const dest) { return desul::atomic_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_increment(volatile T* const dest) { return desul::atomic_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_decrement(volatile T* const dest) { return desul::atomic_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// Exchange
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_exchange(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_exchange(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+bool atomic_compare_exchange_strong(volatile T* const dest, T& expected, const T desired) {
+  return desul::atomic_compare_exchange_strong(const_cast<T*>(dest),expected, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange(volatile T* const dest, const T compare, const T desired) {
+  return desul::atomic_compare_exchange(const_cast<T*>(dest),compare, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
+}
+// clang-format on
+#endif  // KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#endif
diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
new file mode 100644
index 0000000000..3a182a6a22
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
@@ -0,0 +1,271 @@
+#ifndef KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_
+#define KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics.hpp>
+
+#include <impl/Kokkos_Atomic_Memory_Order.hpp>
+#include <impl/Kokkos_Volatile_Load.hpp>
+
+// clang-format off
+namespace Kokkos {
+
+// FIXME: These functions don't have any use/test in unit tests ...
+// ==========================================================
+inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; }
+
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \
+    !defined(__CUDA_ARCH__)
+
+#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr, 0, 0)
+#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr, 1, 0)
+
+#else
+
+#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
+#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
+
+#endif
+// ============================================================
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_load(T* const dest) { return desul::atomic_load(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_store(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_store(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_assign(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { atomic_store(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+void memory_fence() {
+  desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), desul::MemoryScopeDevice());
+}
+
+KOKKOS_INLINE_FUNCTION
+void load_fence() { return desul::atomic_thread_fence(desul::MemoryOrderAcquire(), desul::MemoryScopeDevice()); }
+
+KOKKOS_INLINE_FUNCTION
+void store_fence() { return desul::atomic_thread_fence(desul::MemoryOrderRelease(), desul::MemoryScopeDevice()); }
+
+// atomic_fetch_op
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_USE_DOUBLE_ATOMICADD
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_add(double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(dest,val);
+  #else
+  return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+};
+
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_sub(double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(dest,-val);
+  #else
+  return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+};
+#endif
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_sub (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_max (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_min (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mul (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mul (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_div (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_div (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mod (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mod (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or  (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_or  (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_xor (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_xor (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_nand(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_nand(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_lshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_lshift(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_rshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_rshift(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_inc(T* const dest) { return desul::atomic_fetch_inc(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_dec(T* const dest) { return desul::atomic_fetch_dec(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op_fetch
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_add_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_sub_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mul_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_div_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mod_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mod_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_and_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_and_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_or_fetch  (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_or_fetch  (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_xor_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_xor_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_nand_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_nand_fetch(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_lshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_lshift_fetch(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_rshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_rshift_fetch(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_inc_fetch(T* const dest) { return desul::atomic_inc_fetch(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_dec_fetch(T* const dest) { return desul::atomic_dec_fetch(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_add(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_sub(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_mul(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_div(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_min(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_max(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_and yet so call fetch_and
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_and(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_or yet so call fetch_or
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_or(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val)  { (void) desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_inc(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_dec(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_increment(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_decrement(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// Exchange
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_exchange(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+bool atomic_compare_exchange_strong(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> expected, desul::Impl::dont_deduce_this_parameter_t<const T> desired) {
+  T expected_ref = expected;
+  return desul::atomic_compare_exchange_strong(dest, expected_ref, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> compare, desul::Impl::dont_deduce_this_parameter_t<const T> desired) {
+  return desul::atomic_compare_exchange(dest, compare, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
+namespace Impl {
+
+  template<class MemoryOrder>
+  struct KokkosToDesulMemoryOrder;
+
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_seq_cst_t> {
+    using type = desul::MemoryOrderSeqCst;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_acquire_t> {
+    using type = desul::MemoryOrderAcquire;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_release_t> {
+    using type = desul::MemoryOrderRelease;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_acq_rel_t> {
+    using type = desul::MemoryOrderAcqRel;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_relaxed_t> {
+    using type = desul::MemoryOrderRelaxed;
+  };
+  template<class T, class MemOrderSuccess, class MemOrderFailure> KOKKOS_INLINE_FUNCTION
+  bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess, MemOrderFailure) {
+    return desul::atomic_compare_exchange_strong(dest, expected, desired,
+                  typename KokkosToDesulMemoryOrder<MemOrderSuccess>::type(),
+                  typename KokkosToDesulMemoryOrder<MemOrderFailure>::type(),
+                  desul::MemoryScopeDevice());
+
+  }
+  template<class T, class MemoryOrder>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_load(const T* const src, MemoryOrder) {
+    return desul::atomic_load(src, typename KokkosToDesulMemoryOrder<MemoryOrder>::type(), desul::MemoryScopeDevice());
+  }
+  template<class T, class MemoryOrder>
+  KOKKOS_INLINE_FUNCTION
+  void atomic_store(T* const src, const T val, MemoryOrder) {
+    return desul::atomic_store(src, val, typename KokkosToDesulMemoryOrder<MemoryOrder>::type(), desul::MemoryScopeDevice());
+  }
+}
+
+}
+// clang-format on
+#endif  // KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#endif
diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp
index 6578723fc8..466903ab7d 100644
--- a/lib/kokkos/core/src/Kokkos_Complex.hpp
+++ b/lib/kokkos/core/src/Kokkos_Complex.hpp
@@ -77,7 +77,7 @@ class
 
   //! Default constructor (initializes both real and imaginary parts to zero).
   KOKKOS_DEFAULTED_FUNCTION
-  complex() noexcept = default;
+  complex() = default;
 
   //! Copy constructor.
   KOKKOS_DEFAULTED_FUNCTION
@@ -150,11 +150,11 @@ class
 
   //! The imaginary part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 RealType& imag() noexcept { return im_; }
+  constexpr RealType& imag() noexcept { return im_; }
 
   //! The real part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 RealType& real() noexcept { return re_; }
+  constexpr RealType& real() noexcept { return re_; }
 
   //! The imaginary part of this complex number.
   KOKKOS_INLINE_FUNCTION
@@ -166,41 +166,39 @@ class
 
   //! Set the imaginary part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  void imag(RealType v) noexcept { im_ = v; }
+  constexpr void imag(RealType v) noexcept { im_ = v; }
 
   //! Set the real part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  void real(RealType v) noexcept { re_ = v; }
+  constexpr void real(RealType v) noexcept { re_ = v; }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator+=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator+=(
       const complex<RealType>& src) noexcept {
     re_ += src.re_;
     im_ += src.im_;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator+=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator+=(
       const RealType& src) noexcept {
     re_ += src;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator-=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator-=(
       const complex<RealType>& src) noexcept {
     re_ -= src.re_;
     im_ -= src.im_;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator-=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator-=(
       const RealType& src) noexcept {
     re_ -= src;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator*=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator*=(
       const complex<RealType>& src) noexcept {
     const RealType realPart = re_ * src.re_ - im_ * src.im_;
     const RealType imagPart = re_ * src.im_ + im_ * src.re_;
@@ -209,7 +207,7 @@ class
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator*=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator*=(
       const RealType& src) noexcept {
     re_ *= src;
     im_ *= src;
@@ -217,7 +215,7 @@ class
   }
 
   // Conditional noexcept, just in case RType throws on divide-by-zero
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const complex<RealType>& y) noexcept(noexcept(RealType{} / RealType{})) {
     using Kokkos::Experimental::fabs;
     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
@@ -244,8 +242,7 @@ class
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14
-  KOKKOS_INLINE_FUNCTION complex& operator/=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const std::complex<RealType>& y) noexcept(noexcept(RealType{} /
                                                          RealType{})) {
     using Kokkos::Experimental::fabs;
@@ -272,7 +269,7 @@ class
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const RealType& src) noexcept(noexcept(RealType{} / RealType{})) {
     re_ /= src;
     im_ /= src;
@@ -688,12 +685,24 @@ KOKKOS_INLINE_FUNCTION RealType imag(const complex<RealType>& x) noexcept {
   return x.imag();
 }
 
+template <class ArithmeticType>
+KOKKOS_INLINE_FUNCTION constexpr Impl::promote_t<ArithmeticType> imag(
+    ArithmeticType) {
+  return ArithmeticType();
+}
+
 //! Real part of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION RealType real(const complex<RealType>& x) noexcept {
   return x.real();
 }
 
+template <class ArithmeticType>
+KOKKOS_INLINE_FUNCTION constexpr Impl::promote_t<ArithmeticType> real(
+    ArithmeticType x) {
+  return x;
+}
+
 //! Constructs a complex number from magnitude and phase angle
 template <class T>
 KOKKOS_INLINE_FUNCTION complex<T> polar(const T& r, const T& theta = T()) {
@@ -733,36 +742,6 @@ KOKKOS_INLINE_FUNCTION complex<T> pow(const complex<T>& x,
   return x == T() ? T() : exp(y * log(x));
 }
 
-namespace Impl {
-// NOTE promote would also be useful for math functions
-template <class T, bool = std::is_integral<T>::value>
-struct promote {
-  using type = double;
-};
-template <class T>
-struct promote<T, false> {};
-template <>
-struct promote<long double> {
-  using type = long double;
-};
-template <>
-struct promote<double> {
-  using type = double;
-};
-template <>
-struct promote<float> {
-  using type = float;
-};
-template <class T>
-using promote_t = typename promote<T>::type;
-template <class T, class U>
-struct promote_2 {
-  using type = decltype(promote_t<T>() + promote_t<U>());
-};
-template <class T, class U>
-using promote_2_t = typename promote_2<T, U>::type;
-}  // namespace Impl
-
 template <class T, class U,
           class = std::enable_if_t<std::is_arithmetic<T>::value>>
 KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow(
@@ -816,6 +795,13 @@ KOKKOS_INLINE_FUNCTION complex<RealType> conj(
   return complex<RealType>(real(x), -imag(x));
 }
 
+template <class ArithmeticType>
+KOKKOS_INLINE_FUNCTION constexpr complex<Impl::promote_t<ArithmeticType>> conj(
+    ArithmeticType x) {
+  using type = Impl::promote_t<ArithmeticType>;
+  return complex<type>(x, -type());
+}
+
 //! Exponential of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION complex<RealType> exp(const complex<RealType>& x) {
diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp
index 2aba189487..97137387f2 100644
--- a/lib/kokkos/core/src/Kokkos_Concepts.hpp
+++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp
@@ -180,20 +180,23 @@ KOKKOS_IMPL_IS_CONCEPT(work_item_property)
 
 namespace Impl {
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 // For backward compatibility:
 
-using Kokkos::is_array_layout;
-using Kokkos::is_execution_policy;
-using Kokkos::is_execution_space;
-using Kokkos::is_memory_space;
-using Kokkos::is_memory_traits;
+template <typename T>
+using is_array_layout KOKKOS_DEPRECATED = Kokkos::is_array_layout<T>;
+template <typename T>
+using is_execution_policy KOKKOS_DEPRECATED = Kokkos::is_execution_policy<T>;
+template <typename T>
+using is_execution_space KOKKOS_DEPRECATED = Kokkos::is_execution_space<T>;
+template <typename T>
+using is_memory_space KOKKOS_DEPRECATED = Kokkos::is_memory_space<T>;
+template <typename T>
+using is_memory_traits KOKKOS_DEPRECATED = Kokkos::is_memory_traits<T>;
+#endif
 
 // Implementation concept:
 
-KOKKOS_IMPL_IS_CONCEPT(iteration_pattern)
-KOKKOS_IMPL_IS_CONCEPT(schedule_type)
-KOKKOS_IMPL_IS_CONCEPT(index_type)
-KOKKOS_IMPL_IS_CONCEPT(launch_bounds)
 KOKKOS_IMPL_IS_CONCEPT(thread_team_member)
 KOKKOS_IMPL_IS_CONCEPT(host_thread_team_member)
 KOKKOS_IMPL_IS_CONCEPT(graph_kernel)
@@ -330,42 +333,65 @@ struct is_space {
   // For backward compatibility, deprecated in favor of
   // Kokkos::Impl::HostMirror<S>::host_mirror_space
 
-  using host_memory_space = typename std::conditional<
+ private:
+  // The actual definitions for host_memory_space and host_execution_spaces are
+  // in do_not_use_host_memory_space and do_not_use_host_execution_space to be
+  // able to use them within this class without deprecation warnings.
+  using do_not_use_host_memory_space = std::conditional_t<
       std::is_same<memory_space, Kokkos::HostSpace>::value
 #if defined(KOKKOS_ENABLE_CUDA)
           || std::is_same<memory_space, Kokkos::CudaUVMSpace>::value ||
           std::is_same<memory_space, Kokkos::CudaHostPinnedSpace>::value
-#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
+#elif defined(KOKKOS_ENABLE_HIP)
+          || std::is_same<memory_space,
+                          Kokkos::Experimental::HIPHostPinnedSpace>::value
+#elif defined(KOKKOS_ENABLE_SYCL)
+          || std::is_same<memory_space,
+                          Kokkos::Experimental::SYCLSharedUSMSpace>::value ||
+          std::is_same<memory_space,
+                       Kokkos::Experimental::SYCLHostUSMSpace>::value
+#endif
       ,
-      memory_space, Kokkos::HostSpace>::type;
+      memory_space, Kokkos::HostSpace>;
 
+  using do_not_use_host_execution_space = std::conditional_t<
 #if defined(KOKKOS_ENABLE_CUDA)
-  using host_execution_space = typename std::conditional<
-      std::is_same<execution_space, Kokkos::Cuda>::value,
-      Kokkos::DefaultHostExecutionSpace, execution_space>::type;
-#else
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-  using host_execution_space = typename std::conditional<
-      std::is_same<execution_space, Kokkos::Experimental::OpenMPTarget>::value,
-      Kokkos::DefaultHostExecutionSpace, execution_space>::type;
-#else
-  using host_execution_space = execution_space;
-#endif
+      std::is_same<execution_space, Kokkos::Cuda>::value ||
+#elif defined(KOKKOS_ENABLE_HIP)
+      std::is_same<execution_space, Kokkos::Experimental::HIP>::value ||
+#elif defined(KOKKOS_ENABLE_SYCL)
+      std::is_same<execution_space, Kokkos::Experimental::SYCL>::value ||
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
+      std::is_same<execution_space,
+                   Kokkos::Experimental::OpenMPTarget>::value ||
 #endif
+          false,
+      Kokkos::DefaultHostExecutionSpace, execution_space>;
 
-  using host_mirror_space = typename std::conditional<
-      std::is_same<execution_space, host_execution_space>::value &&
-          std::is_same<memory_space, host_memory_space>::value,
-      T, Kokkos::Device<host_execution_space, host_memory_space>>::type;
+ public:
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using host_memory_space KOKKOS_DEPRECATED = do_not_use_host_memory_space;
+  using host_execution_space KOKKOS_DEPRECATED =
+      do_not_use_host_execution_space;
+  using host_mirror_space KOKKOS_DEPRECATED = std::conditional_t<
+      std::is_same<execution_space, do_not_use_host_execution_space>::value &&
+          std::is_same<memory_space, do_not_use_host_memory_space>::value,
+      T,
+      Kokkos::Device<do_not_use_host_execution_space,
+                     do_not_use_host_memory_space>>;
+#endif
 };
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 // For backward compatibility
 
 namespace Impl {
 
-using Kokkos::is_space;
+template <typename T>
+using is_space KOKKOS_DEPRECATED = Kokkos::is_space<T>;
 
 }
+#endif
 
 }  // namespace Kokkos
 
@@ -485,13 +511,18 @@ struct SpaceAccessibility {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 namespace Kokkos {
 namespace Impl {
 
-using Kokkos::SpaceAccessibility;  // For backward compatibility
+// For backward compatibility
+template <typename AccessSpace, typename MemorySpace>
+using SpaceAccessibility KOKKOS_DEPRECATED =
+    Kokkos::SpaceAccessibility<AccessSpace, MemorySpace>;
 
-}
+}  // namespace Impl
 }  // namespace Kokkos
+#endif
 
 //----------------------------------------------------------------------------
 
diff --git a/lib/kokkos/core/src/Kokkos_CopyViews.hpp b/lib/kokkos/core/src/Kokkos_CopyViews.hpp
index a27d5f0e47..a68a3ea75f 100644
--- a/lib/kokkos/core/src/Kokkos_CopyViews.hpp
+++ b/lib/kokkos/core/src/Kokkos_CopyViews.hpp
@@ -47,6 +47,7 @@
 #include <string>
 #include <Kokkos_Parallel.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
+#include <Kokkos_Layout.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -544,13 +545,11 @@ void view_copy(const ExecutionSpace& space, const DstType& dst,
 
   enum {
     ExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<ExecutionSpace,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<ExecutionSpace, src_memory_space>::accessible
   };
   enum {
     ExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<ExecutionSpace,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<ExecutionSpace, dst_memory_space>::accessible
   };
 
   if (!(ExecCanAccessSrc && ExecCanAccessDst)) {
@@ -624,14 +623,14 @@ void view_copy(const DstType& dst, const SrcType& src) {
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) {
@@ -1254,6 +1253,98 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
   }
 };
 
+template <typename ExecutionSpace, class DT, class... DP>
+inline void contiguous_fill(
+    const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  using ViewType     = View<DT, DP...>;
+  using ViewTypeFlat = Kokkos::View<
+      typename ViewType::value_type*, Kokkos::LayoutRight,
+      Kokkos::Device<typename ViewType::execution_space,
+                     typename std::conditional<ViewType::Rank == 0,
+                                               typename ViewType::memory_space,
+                                               Kokkos::AnonymousSpace>::type>,
+      Kokkos::MemoryTraits<0>>;
+
+  ViewTypeFlat dst_flat(dst.data(), dst.size());
+  if (dst.span() < static_cast<size_t>(std::numeric_limits<int>::max())) {
+    Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, ExecutionSpace,
+                           ViewTypeFlat::Rank, int>(dst_flat, value,
+                                                    exec_space);
+  } else
+    Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, ExecutionSpace,
+                           ViewTypeFlat::Rank, int64_t>(dst_flat, value,
+                                                        exec_space);
+}
+
+template <typename ExecutionSpace, class DT, class... DP>
+struct ZeroMemset {
+  ZeroMemset(const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+             typename ViewTraits<DT, DP...>::const_value_type& value) {
+    contiguous_fill(exec_space, dst, value);
+  }
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename ViewTraits<DT, DP...>::const_value_type& value) {
+    contiguous_fill(ExecutionSpace(), dst, value);
+  }
+};
+
+template <typename ExecutionSpace, class DT, class... DP>
+inline std::enable_if_t<
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value>
+contiguous_fill_or_memset(
+    const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  if (Impl::is_zero_byte(value))
+    ZeroMemset<ExecutionSpace, DT, DP...>(exec_space, dst, value);
+  else
+    contiguous_fill(exec_space, dst, value);
+}
+
+template <typename ExecutionSpace, class DT, class... DP>
+inline std::enable_if_t<!(
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value)>
+contiguous_fill_or_memset(
+    const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  contiguous_fill(exec_space, dst, value);
+}
+
+template <class DT, class... DP>
+inline std::enable_if_t<
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value>
+contiguous_fill_or_memset(
+    const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  using ViewType        = View<DT, DP...>;
+  using exec_space_type = typename ViewType::execution_space;
+
+  if (Impl::is_zero_byte(value))
+    ZeroMemset<exec_space_type, DT, DP...>(dst, value);
+  else
+    contiguous_fill(exec_space_type(), dst, value);
+}
+
+template <class DT, class... DP>
+inline std::enable_if_t<!(
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value)>
+contiguous_fill_or_memset(
+    const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  using ViewType        = View<DT, DP...>;
+  using exec_space_type = typename ViewType::execution_space;
+
+  contiguous_fill(exec_space_type(), dst, value);
+}
 }  // namespace Impl
 
 /** \brief  Deep copy a value from Host memory into a view.  */
@@ -1276,38 +1367,23 @@ inline void deep_copy(
   }
 
   if (dst.data() == nullptr) {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: scalar copy, fence because destination is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
     return;
   }
 
-  Kokkos::fence();
+  Kokkos::fence("Kokkos::deep_copy: scalar copy, pre copy fence");
   static_assert(std::is_same<typename ViewType::non_const_value_type,
                              typename ViewType::value_type>::value,
                 "deep_copy requires non-const type");
 
-  // If contiguous we can simply do a 1D flat loop
+  // If contiguous we can simply do a 1D flat loop or use memset
   if (dst.span_is_contiguous()) {
-    using ViewTypeFlat = Kokkos::View<
-        typename ViewType::value_type*, Kokkos::LayoutRight,
-        Kokkos::Device<typename ViewType::execution_space,
-                       typename std::conditional<
-                           ViewType::Rank == 0, typename ViewType::memory_space,
-                           Kokkos::AnonymousSpace>::type>,
-        Kokkos::MemoryTraits<0>>;
-
-    ViewTypeFlat dst_flat(dst.data(), dst.size());
-    if (dst.span() < static_cast<size_t>(std::numeric_limits<int>::max())) {
-      Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, exec_space_type,
-                             ViewTypeFlat::Rank, int>(dst_flat, value,
-                                                      exec_space_type());
-    } else
-      Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, exec_space_type,
-                             ViewTypeFlat::Rank, int64_t>(dst_flat, value,
-                                                          exec_space_type());
-    Kokkos::fence();
+    Impl::contiguous_fill_or_memset(dst, value);
+    Kokkos::fence("Kokkos::deep_copy: scalar copy, post copy fence");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1362,7 +1438,7 @@ inline void deep_copy(
                              exec_space_type, ViewType::Rank, int>(
           dst, value, exec_space_type());
   }
-  Kokkos::fence();
+  Kokkos::fence("Kokkos::deep_copy: scalar copy, post copy fence");
 
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -1393,7 +1469,7 @@ inline void deep_copy(
   }
 
   if (src.data() == nullptr) {
-    Kokkos::fence();
+    Kokkos::fence("Kokkos::deep_copy: copy into scalar, src is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1439,18 +1515,19 @@ inline void deep_copy(
   }
 
   if (dst.data() == nullptr && src.data() == nullptr) {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: scalar to scalar copy, both pointers null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
     return;
   }
 
-  Kokkos::fence();
+  Kokkos::fence("Kokkos::deep_copy: scalar to scalar copy, pre copy fence");
   if (dst.data() != src.data()) {
     Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
         dst.data(), src.data(), sizeof(value_type));
-    Kokkos::fence();
+    Kokkos::fence("Kokkos::deep_copy: scalar to scalar copy, post copy fence");
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -1522,7 +1599,9 @@ inline void deep_copy(
 
       Kokkos::Impl::throw_runtime_exception(message);
     }
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, fence due to null "
+        "argument");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1531,14 +1610,14 @@ inline void deep_copy(
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   // Checking for Overlapping Views.
@@ -1549,7 +1628,9 @@ inline void deep_copy(
   if (((std::ptrdiff_t)dst_start == (std::ptrdiff_t)src_start) &&
       ((std::ptrdiff_t)dst_end == (std::ptrdiff_t)src_end) &&
       (dst.span_is_contiguous() && src.span_is_contiguous())) {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, fence due to same "
+        "spans");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1620,16 +1701,22 @@ inline void deep_copy(
       ((dst_type::rank < 7) || (dst.stride_6() == src.stride_6())) &&
       ((dst_type::rank < 8) || (dst.stride_7() == src.stride_7()))) {
     const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, pre view equality "
+        "check");
     if ((void*)dst.data() != (void*)src.data()) {
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), nbytes);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::deep_copy: copy between contiguous views, post deep copy "
+          "fence");
     }
   } else {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, pre copy fence");
     Impl::view_copy(dst, src);
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, post copy fence");
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -2031,7 +2118,10 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy(
 template <class TeamType, class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
     const TeamType& team, const View<DT, DP...>& dst,
-    typename ViewTraits<DT, DP...>::const_value_type& value) {
+    typename ViewTraits<DT, DP...>::const_value_type& value,
+    typename std::enable_if<std::is_same<
+        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
+        nullptr) {
   Kokkos::parallel_for(Kokkos::TeamThreadRange(team, dst.span()),
                        [&](const int& i) { dst.data()[i] = value; });
 }
@@ -2039,7 +2129,10 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
 template <class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
     const View<DT, DP...>& dst,
-    typename ViewTraits<DT, DP...>::const_value_type& value) {
+    typename ViewTraits<DT, DP...>::const_value_type& value,
+    typename std::enable_if<std::is_same<
+        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
+        nullptr) {
   for (size_t i = 0; i < dst.span(); ++i) {
     dst.data()[i] = value;
   }
@@ -2418,9 +2511,9 @@ inline void deep_copy(
     const ExecSpace& space, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             ExecSpace,
             typename ViewTraits<DT, DP...>::memory_space>::accessible>::type* =
         nullptr) {
@@ -2437,7 +2530,9 @@ inline void deep_copy(
         "(none)", &value, dst.span() * sizeof(typename dst_traits::value_type));
   }
   if (dst.data() == nullptr) {
-    space.fence();
+    space.fence("Kokkos::deep_copy: scalar copy on space, dst data is null");
+  } else if (dst.span_is_contiguous()) {
+    Impl::contiguous_fill_or_memset(space, dst, value);
   } else {
     using ViewTypeUniform = typename std::conditional<
         View<DT, DP...>::Rank == 0,
@@ -2458,9 +2553,9 @@ inline void deep_copy(
     const ExecSpace& space, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        !Kokkos::Impl::SpaceAccessibility<
+        !Kokkos::SpaceAccessibility<
             ExecSpace,
             typename ViewTraits<DT, DP...>::memory_space>::accessible>::type* =
         nullptr) {
@@ -2477,17 +2572,23 @@ inline void deep_copy(
         "(none)", &value, dst.span() * sizeof(typename dst_traits::value_type));
   }
   if (dst.data() == nullptr) {
-    space.fence();
+    space.fence(
+        "Kokkos::deep_copy: scalar-to-view copy on space, dst data is null");
   } else {
-    space.fence();
-    using ViewTypeUniform = typename std::conditional<
-        View<DT, DP...>::Rank == 0,
-        typename View<DT, DP...>::uniform_runtime_type,
-        typename View<DT, DP...>::uniform_runtime_nomemspace_type>::type;
+    space.fence("Kokkos::deep_copy: scalar-to-view copy on space, pre copy");
     using fill_exec_space = typename dst_traits::memory_space::execution_space;
-    Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
-                           fill_exec_space>(dst, value, fill_exec_space());
-    fill_exec_space().fence();
+    if (dst.span_is_contiguous()) {
+      Impl::contiguous_fill_or_memset(fill_exec_space(), dst, value);
+    } else {
+      using ViewTypeUniform = typename std::conditional<
+          View<DT, DP...>::Rank == 0,
+          typename View<DT, DP...>::uniform_runtime_type,
+          typename View<DT, DP...>::uniform_runtime_nomemspace_type>::type;
+      Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
+                             fill_exec_space>(dst, value, fill_exec_space());
+    }
+    fill_exec_space().fence(
+        "Kokkos::deep_copy: scalar-to-view copy on space, fence after fill");
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -2501,7 +2602,7 @@ inline void deep_copy(
     typename ViewTraits<ST, SP...>::non_const_value_type& dst,
     const View<ST, SP...>& src,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<ST, SP...>::specialize,
                      void>::value>::type* = nullptr) {
   using src_traits       = ViewTraits<ST, SP...>;
@@ -2517,7 +2618,8 @@ inline void deep_copy(
   }
 
   if (src.data() == nullptr) {
-    exec_space.fence();
+    exec_space.fence(
+        "Kokkos::deep_copy: view-to-scalar copy on space, src data is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -2538,7 +2640,7 @@ inline void deep_copy(
     const ExecSpace& exec_space, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
     typename std::enable_if<(
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
         std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
         (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) &&
@@ -2562,7 +2664,8 @@ inline void deep_copy(
   }
 
   if (dst.data() == nullptr && src.data() == nullptr) {
-    exec_space.fence();
+    exec_space.fence(
+        "Kokkos::deep_copy: view-to-view copy on space, data is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -2588,7 +2691,7 @@ inline void deep_copy(
     const ExecSpace& exec_space, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
     typename std::enable_if<(
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
         std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
         (unsigned(ViewTraits<DT, DP...>::rank) != 0 ||
@@ -2662,21 +2765,19 @@ inline void deep_copy(
 
   enum {
     ExecCanAccessSrcDst =
-        Kokkos::Impl::SpaceAccessibility<ExecSpace,
-                                         dst_memory_space>::accessible &&
-        Kokkos::Impl::SpaceAccessibility<ExecSpace,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<ExecSpace, dst_memory_space>::accessible &&
+        Kokkos::SpaceAccessibility<ExecSpace, src_memory_space>::accessible
   };
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   // Error out for non-identical overlapping views.
@@ -2757,9 +2858,13 @@ inline void deep_copy(
       using cpy_exec_space =
           typename std::conditional<DstExecCanAccessSrc, dst_execution_space,
                                     src_execution_space>::type;
-      exec_space.fence();
+      exec_space.fence(
+          "Kokkos::deep_copy: view-to-view noncontiguous copy on space, pre "
+          "copy");
       Impl::view_copy(cpy_exec_space(), dst, src);
-      cpy_exec_space().fence();
+      cpy_exec_space().fence(
+          "Kokkos::deep_copy: view-to-view noncontiguous copy on space, post "
+          "copy");
     } else {
       Kokkos::Impl::throw_runtime_exception(
           "deep_copy given views that would require a temporary allocation");
@@ -2777,6 +2882,19 @@ inline void deep_copy(
 
 namespace Kokkos {
 
+namespace Impl {
+template <typename ViewType>
+bool size_mismatch(const ViewType& view, unsigned int max_extent,
+                   const size_t new_extents[8]) {
+  for (unsigned int dim = 0; dim < max_extent; ++dim)
+    if (new_extents[dim] != view.extent(dim)) {
+      return true;
+    }
+  return false;
+}
+
+}  // namespace Impl
+
 /** \brief  Resize a view with copying old data to new data at the corresponding
  * indices. */
 template <class T, class... P>
@@ -2798,67 +2916,6 @@ resize(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only resize managed views");
 
-  // Fix #904 by checking dimensions before actually resizing.
-  //
-  // Rank is known at compile time, so hopefully the compiler will
-  // remove branches that are compile-time false.  The upcoming "if
-  // constexpr" language feature would make this certain.
-  if (view_type::Rank == 1 && n0 == static_cast<size_t>(v.extent(0))) {
-    return;
-  }
-  if (view_type::Rank == 2 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1))) {
-    return;
-  }
-  if (view_type::Rank == 3 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2))) {
-    return;
-  }
-  if (view_type::Rank == 4 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3))) {
-    return;
-  }
-  if (view_type::Rank == 5 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4))) {
-    return;
-  }
-  if (view_type::Rank == 6 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5))) {
-    return;
-  }
-  if (view_type::Rank == 7 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6))) {
-    return;
-  }
-  if (view_type::Rank == 8 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6)) &&
-      n7 == static_cast<size_t>(v.extent(7))) {
-    return;
-  }
-  // If Kokkos ever supports Views of rank > 8, the above code won't
-  // be incorrect, because avoiding reallocation in resize() is just
-  // an optimization.
-
   // TODO (mfh 27 Jun 2017) If the old View has enough space but just
   // different dimensions (e.g., if the product of the dimensions,
   // including extra space for alignment, will not change), then
@@ -2866,11 +2923,17 @@ resize(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
   // reallocates if any of the dimensions change, even if the old View
   // has enough space.
 
-  view_type v_resized(v.label(), n0, n1, n2, n3, n4, n5, n6, n7);
+  const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+  const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
-  Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+  if (sizeMismatch) {
+    view_type v_resized(v.label(), n0, n1, n2, n3, n4, n5, n6, n7);
 
-  v = v_resized;
+    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+    Kokkos::fence("Kokkos::resize(View)");
+
+    v = v_resized;
+  }
 }
 
 /** \brief  Resize a view with copying old data to new data at the corresponding
@@ -2895,67 +2958,6 @@ resize(const I& arg_prop, Kokkos::View<T, P...>& v,
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only resize managed views");
 
-  // Fix #904 by checking dimensions before actually resizing.
-  //
-  // Rank is known at compile time, so hopefully the compiler will
-  // remove branches that are compile-time false.  The upcoming "if
-  // constexpr" language feature would make this certain.
-  if (view_type::Rank == 1 && n0 == static_cast<size_t>(v.extent(0))) {
-    return;
-  }
-  if (view_type::Rank == 2 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1))) {
-    return;
-  }
-  if (view_type::Rank == 3 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2))) {
-    return;
-  }
-  if (view_type::Rank == 4 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3))) {
-    return;
-  }
-  if (view_type::Rank == 5 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4))) {
-    return;
-  }
-  if (view_type::Rank == 6 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5))) {
-    return;
-  }
-  if (view_type::Rank == 7 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6))) {
-    return;
-  }
-  if (view_type::Rank == 8 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6)) &&
-      n7 == static_cast<size_t>(v.extent(7))) {
-    return;
-  }
-  // If Kokkos ever supports Views of rank > 8, the above code won't
-  // be incorrect, because avoiding reallocation in resize() is just
-  // an optimization.
-
   // TODO (mfh 27 Jun 2017) If the old View has enough space but just
   // different dimensions (e.g., if the product of the dimensions,
   // including extra space for alignment, will not change), then
@@ -2963,19 +2965,64 @@ resize(const I& arg_prop, Kokkos::View<T, P...>& v,
   // reallocates if any of the dimensions change, even if the old View
   // has enough space.
 
-  view_type v_resized(view_alloc(v.label(), std::forward<const I>(arg_prop)),
-                      n0, n1, n2, n3, n4, n5, n6, n7);
+  const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+  const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
-  Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+  if (sizeMismatch) {
+    view_type v_resized(view_alloc(v.label(), std::forward<const I>(arg_prop)),
+                        n0, n1, n2, n3, n4, n5, n6, n7);
 
-  v = v_resized;
+    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+    // This fence really ought to look for an execution space in
+    // arg_prop, and just fence that if there is one
+    Kokkos::fence("Kokkos::resize(View)");
+
+    v = v_resized;
+  }
 }
 
 /** \brief  Resize a view with copying old data to new data at the corresponding
  * indices. */
 template <class T, class... P>
-inline void resize(Kokkos::View<T, P...>& v,
-                   const typename Kokkos::View<T, P...>::array_layout& layout) {
+inline std::enable_if_t<
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutLeft>::value ||
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutRight>::value ||
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutStride>::value ||
+    is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value>
+resize(Kokkos::View<T, P...>& v,
+       const typename Kokkos::View<T, P...>::array_layout& layout) {
+  using view_type = Kokkos::View<T, P...>;
+
+  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
+                "Can only resize managed views");
+
+  if (v.layout() != layout) {
+    view_type v_resized(v.label(), layout);
+
+    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+    Kokkos::fence("Kokkos::resize(View)");
+
+    v = v_resized;
+  }
+}
+
+// FIXME User-provided (custom) layouts are not required to have a comparison
+// operator. Hence, there is no way to check if the requested layout is actually
+// the same as the existing one.
+template <class T, class... P>
+inline std::enable_if_t<
+    !(std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                   Kokkos::LayoutLeft>::value ||
+      std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                   Kokkos::LayoutRight>::value ||
+      std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                   Kokkos::LayoutStride>::value ||
+      is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value)>
+resize(Kokkos::View<T, P...>& v,
+       const typename Kokkos::View<T, P...>::array_layout& layout) {
   using view_type = Kokkos::View<T, P...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
@@ -3009,10 +3056,16 @@ realloc(Kokkos::View<T, P...>& v,
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only realloc managed views");
 
-  const std::string label = v.label();
+  const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+  const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
-  v = view_type();  // Deallocate first, if the only view to allocation
-  v = view_type(label, n0, n1, n2, n3, n4, n5, n6, n7);
+  if (sizeMismatch) {
+    const std::string label = v.label();
+
+    v = view_type();  // Deallocate first, if the only view to allocation
+    v = view_type(label, n0, n1, n2, n3, n4, n5, n6, n7);
+  } else
+    Kokkos::deep_copy(v, typename view_type::value_type{});
 }
 
 /** \brief  Resize a view with discarding old data. */
@@ -3209,7 +3262,8 @@ create_mirror_view_and_copy(
         Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
         nullptr) {
   (void)name;
-  fence();  // same behavior as deep_copy(src, src)
+  fence(
+      "Kokkos::create_mirror_view_and_copy: fence before returning src view");  // same behavior as deep_copy(src, src)
   return src;
 }
 
diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp
index c3771ab393..60e748589d 100644
--- a/lib/kokkos/core/src/Kokkos_Core.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@@ -59,6 +59,7 @@
 #include <Kokkos_LogicalSpaces.hpp>
 #include <Kokkos_Pair.hpp>
 #include <Kokkos_MathematicalFunctions.hpp>
+#include <Kokkos_MathematicalSpecialFunctions.hpp>
 #include <Kokkos_MemoryPool.hpp>
 #include <Kokkos_Array.hpp>
 #include <Kokkos_View.hpp>
@@ -74,6 +75,7 @@
 #include <iosfwd>
 #include <map>
 #include <memory>
+#include <vector>
 
 //----------------------------------------------------------------------------
 
@@ -121,6 +123,7 @@ class ExecSpaceManager {
   void initialize_spaces(const Kokkos::InitArguments& args);
   void finalize_spaces(const bool all_spaces);
   void static_fence();
+  void static_fence(const std::string&);
   void print_configuration(std::ostream& msg, const bool detail);
   static ExecSpaceManager& get_instance();
 };
@@ -184,6 +187,7 @@ void push_finalize_hook(std::function<void()> f);
 void finalize_all();
 
 void fence();
+void fence(const std::string&);
 
 /** \brief Print "Bill of Materials" */
 void print_configuration(std::ostream&, const bool detail = false);
@@ -274,6 +278,44 @@ class ScopeGuard {
 
 }  // namespace Kokkos
 
+namespace Kokkos {
+namespace Experimental {
+// Partitioning an Execution Space: expects space and integer arguments for
+// relative weight
+//   Customization point for backends
+//   Default behavior is to return the passed in instance
+template <class ExecSpace, class... Args>
+std::vector<ExecSpace> partition_space(ExecSpace space, Args...) {
+  static_assert(is_execution_space<ExecSpace>::value,
+                "Kokkos Error: partition_space expects an Execution Space as "
+                "first argument");
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+  std::vector<ExecSpace> instances(sizeof...(Args));
+  for (int s = 0; s < int(sizeof...(Args)); s++) instances[s] = space;
+  return instances;
+}
+
+template <class ExecSpace, class T>
+std::vector<ExecSpace> partition_space(ExecSpace space,
+                                       std::vector<T>& weights) {
+  static_assert(is_execution_space<ExecSpace>::value,
+                "Kokkos Error: partition_space expects an Execution Space as "
+                "first argument");
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  std::vector<ExecSpace> instances(weights.size());
+  for (int s = 0; s < int(weights.size()); s++) instances[s] = space;
+  return instances;
+}
+}  // namespace Experimental
+}  // namespace Kokkos
+
 #include <Kokkos_Crs.hpp>
 #include <Kokkos_WorkGraphPolicy.hpp>
 // Including this in Kokkos_Parallel_Reduce.hpp led to a circular dependency
diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
index fe7eba3f6e..a610ee76df 100644
--- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -53,7 +53,9 @@
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Utilities.hpp>
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 #include <Kokkos_MasterLock.hpp>
+#endif
 
 //----------------------------------------------------------------------------
 // Have assumed a 64bit build (8byte pointers) throughout the code base.
@@ -238,7 +240,8 @@ class LogicalMemorySpace;
 
 namespace Kokkos {
 void fence();
-}
+void fence(const std::string &);
+}  // namespace Kokkos
 
 //----------------------------------------------------------------------------
 
@@ -250,9 +253,13 @@ class View;
 namespace Impl {
 
 template <class DstSpace, class SrcSpace,
-          class ExecutionSpace = typename DstSpace::execution_space>
+          class ExecutionSpace = typename DstSpace::execution_space,
+          class Enable         = void>
 struct DeepCopy;
 
+template <typename ExecutionSpace, class DT, class... DP>
+struct ZeroMemset;
+
 template <class ViewType, class Layout = typename ViewType::array_layout,
           class ExecSpace = typename ViewType::execution_space,
           int Rank = ViewType::Rank, typename iType = int64_t>
diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp
index 1a10500b19..897402d376 100644
--- a/lib/kokkos/core/src/Kokkos_Crs.hpp
+++ b/lib/kokkos/core/src/Kokkos_Crs.hpp
@@ -179,7 +179,9 @@ class GetCrsTransposeCounts {
     const closure_type closure(*this,
                                policy_type(0, index_type(in.entries.size())));
     closure.execute();
-    execution_space().fence();
+    execution_space().fence(
+        "Kokkos::Impl::GetCrsTransposeCounts::GetCrsTransposeCounts: fence "
+        "after functor execution");
   }
 };
 
@@ -261,7 +263,9 @@ class FillCrsTransposeEntries {
     using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
     const closure_type closure(*this, policy_type(0, index_type(in.numRows())));
     closure.execute();
-    execution_space().fence();
+    execution_space().fence(
+        "Kokkos::Impl::FillCrsTransposeEntries::FillCrsTransposeEntries: fence "
+        "after functor execution");
   }
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp
index 7a218120bb..c5a6b0f7d7 100644
--- a/lib/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@@ -55,13 +55,13 @@
 
 #include <impl/Kokkos_AnalyzePolicy.hpp>
 #include <Kokkos_CudaSpace.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>  // CUDA_SAFE_CALL
 
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 #include <impl/Kokkos_HostSharedPtr.hpp>
 
@@ -184,8 +184,10 @@ class Cuda {
   /// method does not return until all dispatched functors on this
   /// device have completed.
   static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
 
   void fence() const;
+  void fence(const std::string&) const;
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
@@ -199,7 +201,7 @@ class Cuda {
 
   Cuda();
 
-  Cuda(cudaStream_t stream);
+  Cuda(cudaStream_t stream, bool manage_stream = false);
 
   //--------------------------------------------------------------------------
   //! \name Device-specific functions
@@ -246,7 +248,7 @@ class Cuda {
   inline Impl::CudaInternal* impl_internal_space_instance() const {
     return m_space_instance.get();
   }
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept;
 
  private:
   Kokkos::Impl::HostSharedPtr<Impl::CudaInternal> m_space_instance;
@@ -271,9 +273,28 @@ class CudaSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool all_spaces) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
+template <class DT, class... DP>
+struct ZeroMemset<Kokkos::Cuda, DT, DP...> {
+  ZeroMemset(const Kokkos::Cuda& exec_space_instance,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemsetAsync(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type),
+        exec_space_instance.cuda_stream()));
+  }
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMemset(dst.data(), 0,
+                   dst.size() * sizeof(typename View<DT, DP...>::value_type)));
+  }
+};
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
index e10fae93c7..910a8b2d74 100644
--- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -70,6 +70,12 @@ extern "C" void kokkos_impl_cuda_set_pin_uvm_to_host(bool);
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+namespace Impl {
+
+template <typename T>
+struct is_cuda_type_space : public std::false_type {};
+
+}  // namespace Impl
 
 /** \brief  Cuda on-device memory management */
 
@@ -119,10 +125,12 @@ class CudaSpace {
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name() { return m_name; }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /*--------------------------------*/
   /** \brief  Error reporting for HostSpace attempt to access CudaSpace */
   KOKKOS_DEPRECATED static void access_error();
   KOKKOS_DEPRECATED static void access_error(const void* const);
+#endif
 
  private:
   int m_device;  ///< Which Cuda device
@@ -130,6 +138,10 @@ class CudaSpace {
   static constexpr const char* m_name = "Cuda";
   friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
 };
+
+template <>
+struct Impl::is_cuda_type_space<CudaSpace> : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -151,9 +163,11 @@ class CudaUVMSpace {
   /** \brief  If UVM capability is available */
   static bool available();
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /*--------------------------------*/
   /** \brief  CudaUVMSpace specific routine */
   KOKKOS_DEPRECATED static int number_of_allocations();
+#endif
 
   /*--------------------------------*/
 
@@ -209,6 +223,9 @@ class CudaUVMSpace {
   static constexpr const char* m_name = "CudaUVM";
 };
 
+template <>
+struct Impl::is_cuda_type_space<CudaUVMSpace> : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -271,6 +288,9 @@ class CudaHostPinnedSpace {
   /*--------------------------------*/
 };
 
+template <>
+struct Impl::is_cuda_type_space<CudaHostPinnedSpace> : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -411,338 +431,107 @@ struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace> {
 namespace Kokkos {
 namespace Impl {
 
+void DeepCopyCuda(void* dst, const void* src, size_t n);
+void DeepCopyAsyncCuda(const Cuda& instance, void* dst, const void* src,
+                       size_t n);
 void DeepCopyAsyncCuda(void* dst, const void* src, size_t n);
 
-template <>
-struct DeepCopy<CudaSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Cuda&, void* dst, const void* src, size_t);
-};
-
-template <>
-struct DeepCopy<CudaSpace, HostSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Cuda&, void* dst, const void* src, size_t);
-};
-
-template <>
-struct DeepCopy<HostSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Cuda&, void* dst, const void* src, size_t);
-};
-
-template <>
-struct DeepCopy<CudaUVMSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
+template <class MemSpace>
+struct DeepCopy<MemSpace, HostSpace, Cuda,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyCuda(dst, src, n); }
   DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
+    DeepCopyAsyncCuda(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<CudaUVMSpace, HostSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
+template <class MemSpace>
+struct DeepCopy<HostSpace, MemSpace, Cuda,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyCuda(dst, src, n); }
   DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(instance, dst, src, n);
+    DeepCopyAsyncCuda(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<HostSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
+template <class MemSpace1, class MemSpace2>
+struct DeepCopy<MemSpace1, MemSpace2, Cuda,
+                std::enable_if_t<is_cuda_type_space<MemSpace1>::value &&
+                                 is_cuda_type_space<MemSpace2>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyCuda(dst, src, n); }
   DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(instance, dst, src, n);
+    DeepCopyAsyncCuda(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<CudaHostPinnedSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaHostPinnedSpace, HostSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<HostSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaUVMSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaUVMSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaHostPinnedSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaHostPinnedSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, CudaSpace, ExecutionSpace> {
+template <class MemSpace1, class MemSpace2, class ExecutionSpace>
+struct DeepCopy<MemSpace1, MemSpace2, ExecutionSpace,
+                std::enable_if_t<is_cuda_type_space<MemSpace1>::value &&
+                                 is_cuda_type_space<MemSpace2>::value &&
+                                 !std::is_same<ExecutionSpace, Cuda>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
+    DeepCopyCuda(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncCuda(dst, src, n);
   }
+
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace1::name() + "Space, " +
+        MemSpace2::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, HostSpace, ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<MemSpace, HostSpace, ExecutionSpace,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value &&
+                                 !std::is_same<ExecutionSpace, Cuda>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
+    DeepCopyCuda(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncCuda(dst, src, n);
   }
+
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace::name() +
+        "Space, HostSpace, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, CudaSpace, ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value &&
+                                 !std::is_same<ExecutionSpace, Cuda>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
+    DeepCopyCuda(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncCuda(dst, src, n);
   }
-};
 
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, CudaUVMSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, CudaHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, CudaSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, CudaUVMSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, CudaHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, HostSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, CudaSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, CudaUVMSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, CudaHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, HostSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, CudaUVMSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, CudaHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<HostSpace, ") + MemSpace::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp b/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp
new file mode 100644
index 0000000000..9e060b343e
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp
@@ -0,0 +1,116 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_DETECTION_IDIOM_HPP
+#define KOKKOS_DETECTION_IDIOM_HPP
+
+#include <impl/Kokkos_Utilities.hpp>  // void_t
+#include <type_traits>
+
+// NOTE This header implements the detection idiom from Version 2 of the C++
+// Extensions for Library Fundamentals, ISO/IEC TS 19568:2017
+
+// I deliberately omitted detected_or which does not fit well with the rest
+// of the specification. In my opinion, it should be removed from the TS.
+
+namespace Kokkos {
+
+namespace Impl {
+// base class for nonesuch to inherit from so it is not an aggregate
+struct nonesuch_base {};
+
+// primary template handles all types not supporting the archetypal Op
+template <class Default, class /*AlwaysVoid*/, template <class...> class Op,
+          class... /*Args*/>
+struct detector {
+  using value_t = std::false_type;
+  using type    = Default;
+};
+
+// specialization recognizes and handles only types supporting Op
+template <class Default, template <class...> class Op, class... Args>
+struct detector<Default, void_t<Op<Args...>>, Op, Args...> {
+  using value_t = std::true_type;
+  using type    = Op<Args...>;
+};
+}  // namespace Impl
+
+struct nonesuch : private Impl::nonesuch_base {
+  ~nonesuch()               = delete;
+  nonesuch(nonesuch const&) = delete;
+  void operator=(nonesuch const&) = delete;
+};
+
+template <template <class...> class Op, class... Args>
+using is_detected =
+    typename Impl::detector<nonesuch, void, Op, Args...>::value_t;
+
+template <template <class...> class Op, class... Args>
+using detected_t = typename Impl::detector<nonesuch, void, Op, Args...>::type;
+
+template <class Default, template <class...> class Op, class... Args>
+using detected_or_t = typename Impl::detector<Default, void, Op, Args...>::type;
+
+template <class Expected, template <class...> class Op, class... Args>
+using is_detected_exact = std::is_same<Expected, detected_t<Op, Args...>>;
+
+template <class To, template <class...> class Op, class... Args>
+using is_detected_convertible =
+    std::is_convertible<detected_t<Op, Args...>, To>;
+
+#ifdef KOKKOS_ENABLE_CXX17
+template <template <class...> class Op, class... Args>
+inline constexpr bool is_detected_v = is_detected<Op, Args...>::value;
+
+template <class Expected, template <class...> class Op, class... Args>
+inline constexpr bool is_detected_exact_v =
+    is_detected_exact<Expected, Op, Args...>::value;
+
+template <class Expected, template <class...> class Op, class... Args>
+inline constexpr bool is_detected_convertible_v =
+    is_detected_convertible<Expected, Op, Args...>::value;
+#endif
+
+}  // namespace Kokkos
+
+#endif
diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
index 55aed13670..c88c1ada14 100644
--- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -48,7 +48,6 @@
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_Error.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_AnalyzePolicy.hpp>
 #include <Kokkos_Concepts.hpp>
 #include <typeinfo>
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
index d0366b599c..f6cdb2ec46 100644
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -287,7 +287,10 @@ struct DeepCopy<Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace,
   DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace, "
+        "Kokkos::Experimental::HBWSpace,ExecutionSpace::DeepCopy: fence before "
+        "copy");
     memcpy(dst, src, n);
   }
 };
@@ -297,7 +300,9 @@ struct DeepCopy<HostSpace, Kokkos::Experimental::HBWSpace, ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, Kokkos::Experimental::HBWSpace, "
+        "ExecutionSpace>::DeepCopy: fence before copy");
     memcpy(dst, src, n);
   }
 };
@@ -307,7 +312,9 @@ struct DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, "
+        "ExecutionSpace>::DeepCopy: fence before copy");
     memcpy(dst, src, n);
   }
 };
diff --git a/lib/kokkos/core/src/Kokkos_HIP.hpp b/lib/kokkos/core/src/Kokkos_HIP.hpp
index 33cf8321c8..09df4f2fed 100644
--- a/lib/kokkos/core/src/Kokkos_HIP.hpp
+++ b/lib/kokkos/core/src/Kokkos_HIP.hpp
@@ -54,7 +54,6 @@
 
 #include <Kokkos_HIP_Space.hpp>
 #include <Kokkos_Parallel.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <HIP/Kokkos_HIP_Instance.hpp>
 #include <HIP/Kokkos_HIP_MDRangePolicy.hpp>
diff --git a/lib/kokkos/core/src/Kokkos_HIP_Space.hpp b/lib/kokkos/core/src/Kokkos_HIP_Space.hpp
index 17bd681aa4..d20d533645 100644
--- a/lib/kokkos/core/src/Kokkos_HIP_Space.hpp
+++ b/lib/kokkos/core/src/Kokkos_HIP_Space.hpp
@@ -58,6 +58,7 @@
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
+#include <HIP/Kokkos_HIP_Error.hpp>  // HIP_SAFE_CALL
 
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
@@ -67,6 +68,13 @@
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+namespace Impl {
+
+template <typename T>
+struct is_hip_type_space : public std::false_type {};
+
+}  // namespace Impl
+
 namespace Experimental {
 /** \brief  HIP on-device memory management */
 
@@ -116,10 +124,12 @@ class HIPSpace {
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name() { return "HIP"; }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /*--------------------------------*/
   /** \brief  Error reporting for HostSpace attempt to access HIPSpace */
   KOKKOS_DEPRECATED static void access_error();
   KOKKOS_DEPRECATED static void access_error(const void* const);
+#endif
 
  private:
   int m_device;  ///< Which HIP device
@@ -129,6 +139,11 @@ class HIPSpace {
 };
 
 }  // namespace Experimental
+
+template <>
+struct Impl::is_hip_type_space<Experimental::HIPSpace> : public std::true_type {
+};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -188,6 +203,11 @@ class HIPHostPinnedSpace {
   /*--------------------------------*/
 };
 }  // namespace Experimental
+
+template <>
+struct Impl::is_hip_type_space<Experimental::HIPHostPinnedSpace>
+    : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -268,174 +288,116 @@ struct MemorySpaceAccess<Kokkos::Experimental::HIPHostPinnedSpace,
 namespace Kokkos {
 namespace Impl {
 
+void DeepCopyHIP(void* dst, const void* src, size_t n);
+void DeepCopyAsyncHIP(const Kokkos::Experimental::HIP& instance, void* dst,
+                      const void* src, size_t n);
 void DeepCopyAsyncHIP(void* dst, const void* src, size_t n);
 
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
+template <class MemSpace>
+struct DeepCopy<MemSpace, HostSpace, Kokkos::Experimental::HIP,
+                std::enable_if_t<is_hip_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyHIP(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncHIP(instance, dst, src, n);
+  }
 };
 
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
+template <class MemSpace>
+struct DeepCopy<HostSpace, MemSpace, Kokkos::Experimental::HIP,
+                std::enable_if_t<is_hip_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyHIP(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncHIP(instance, dst, src, n);
+  }
 };
 
-template <>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
+template <class MemSpace1, class MemSpace2>
+struct DeepCopy<MemSpace1, MemSpace2, Kokkos::Experimental::HIP,
+                std::enable_if_t<is_hip_type_space<MemSpace1>::value &&
+                                 is_hip_type_space<MemSpace2>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyHIP(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncHIP(instance, dst, src, n);
+  }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-                ExecutionSpace> {
+template <class MemSpace1, class MemSpace2, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace1, MemSpace2, ExecutionSpace,
+    std::enable_if_t<
+        is_hip_type_space<MemSpace1>::value &&
+        is_hip_type_space<MemSpace2>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPSpace,
-                   Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIP>(
-        dst, src, n);
+    DeepCopyHIP(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncHIP(dst, src, n);
   }
+
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace1::name() + "Space, " +
+        MemSpace2::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace, HostSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_hip_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
+    DeepCopyHIP(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncHIP(dst, src, n);
   }
+
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace::name() +
+        "Space, HostSpace, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    HostSpace, MemSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_hip_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
+    DeepCopyHIP(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncHIP(dst, src, n);
   }
-};
 
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPSpace,
-                Kokkos::Experimental::HIPHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIPSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIPHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                   Kokkos::Experimental::HIPHostPinnedSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-                ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-                ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<HostSpace, ") + MemSpace::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 }  // namespace Impl
@@ -536,7 +498,7 @@ class HIP {
   using scratch_memory_space = ScratchMemorySpace<HIP>;
 
   HIP();
-  HIP(hipStream_t stream);
+  HIP(hipStream_t stream, bool manage_stream = false);
 
   //@}
   //------------------------------------
@@ -558,8 +520,10 @@ class HIP {
    * until all dispatched functors on this device have completed.
    */
   static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
 
   void fence() const;
+  void fence(const std::string&) const;
 
   hipStream_t hip_stream() const;
 
@@ -596,7 +560,7 @@ class HIP {
     return m_space_instance.get();
   }
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept;
 
  private:
   Kokkos::Impl::HostSharedPtr<Impl::HIPInternal> m_space_instance;
@@ -620,9 +584,28 @@ class HIPSpaceInitializer : public Kokkos::Impl::ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
+template <class DT, class... DP>
+struct ZeroMemset<Kokkos::Experimental::HIP, DT, DP...> {
+  ZeroMemset(const Kokkos::Experimental::HIP& exec_space,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type),
+        exec_space.hip_stream()));
+  }
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(
+        hipMemset(dst.data(), 0,
+                  dst.size() * sizeof(typename View<DT, DP...>::value_type)));
+  }
+};
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/lib/kokkos/core/src/Kokkos_HPX.hpp b/lib/kokkos/core/src/Kokkos_HPX.hpp
index 2100b49c11..236211864e 100644
--- a/lib/kokkos/core/src/Kokkos_HPX.hpp
+++ b/lib/kokkos/core/src/Kokkos_HPX.hpp
@@ -69,7 +69,6 @@
 #include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_Tools.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 
@@ -318,25 +317,50 @@ class HPX {
   }
 
   void impl_fence_instance() const {
-    if (hpx::threads::get_self_ptr() == nullptr) {
-      hpx::threads::run_as_hpx_thread([this]() { impl_get_future().wait(); });
-    } else {
-      impl_get_future().wait();
-    }
+    impl_fence_instance(
+        "Kokkos::Experimental::HPX::impl_fence_instance: Unnamed Instance "
+        "Fence");
+  }
+  void impl_fence_instance(const std::string &name) const {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event(name, *this, [&]() {
+      if (hpx::threads::get_self_ptr() == nullptr) {
+        hpx::threads::run_as_hpx_thread([this]() { impl_get_future().wait(); });
+      } else {
+        impl_get_future().wait();
+      }
+    });
   }
 
   void impl_fence_all_instances() const {
-    hpx::util::yield_while(
-        []() { return m_active_parallel_region_count.load() != 0; });
+    impl_fence_instance(
+        "Kokkos::Experimental::HPX::impl_fence_all_instances: Unnamed Global "
+        "HPX Fence");
+  }
+  void impl_fence_all_instances(const std::string &namename) const {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event(name, *this, [&]() {
+      hpx::util::yield_while(
+          []() { return m_active_parallel_region_count.load() != 0; });
+    });
   }
 #endif
 
   void fence() const {
 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
     if (m_mode == instance_mode::global) {
-      impl_fence_all_instances();
+      impl_fence_all_instances(
+          "Kokkos::Experimental::HPX::fence: Unnamed Global HPX Fence");
     } else {
-      impl_fence_instance();
+      impl_fence_instance(
+          "Kokkos::Experimental::HPX::fence: Unnamed HPX Instance Fence");
+    }
+#endif
+  }
+  void fence(const std::string &name) const {
+#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
+    if (m_mode == instance_mode::global) {
+      impl_fence_all_instances(name);
+    } else {
+      impl_fence_instance(name);
     }
 #endif
   }
@@ -464,6 +488,7 @@ class HPXSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments &args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string &) final;
   void print_configuration(std::ostream &msg, const bool detail) final;
 };
 
@@ -491,7 +516,9 @@ inline void dispatch_execute_task(Closure *closure,
   }
 
   if (force_synchronous) {
-    instance.fence();
+    instance.fence(
+        "Kokkos::Experimental::Impl::HPX::dispatch_execute_task: fence due to "
+        "forced syncronizations");
   }
 }
 #else
diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
index ba69fbad39..c96cf5fbbe 100644
--- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -299,6 +299,20 @@ namespace Kokkos {
 
 namespace Impl {
 
+template <class DT, class... DP>
+struct ZeroMemset<typename HostSpace::execution_space, DT, DP...> {
+  ZeroMemset(const typename HostSpace::execution_space&,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type& value)
+      : ZeroMemset(dst, value) {}
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    using ValueType = typename View<DT, DP...>::value_type;
+    std::memset(dst.data(), 0, sizeof(ValueType) * dst.size());
+  }
+};
+
 template <class ExecutionSpace>
 struct DeepCopy<HostSpace, HostSpace, ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) {
@@ -306,9 +320,13 @@ struct DeepCopy<HostSpace, HostSpace, ExecutionSpace> {
   }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, HostSpace, "
+        "ExecutionSpace>::DeepCopy: fence before copy");
     hostspace_parallel_deepcopy(dst, src, n);
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, HostSpace, "
+        "ExecutionSpace>::DeepCopy: fence after copy");
   }
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp
index 778b4f0810..cfd77ea50f 100644
--- a/lib/kokkos/core/src/Kokkos_Layout.hpp
+++ b/lib/kokkos/core/src/Kokkos_Layout.hpp
@@ -50,7 +50,6 @@
 
 #include <cstddef>
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 namespace Kokkos {
 
@@ -89,6 +88,16 @@ struct LayoutLeft {
                                 size_t N3 = 0, size_t N4 = 0, size_t N5 = 0,
                                 size_t N6 = 0, size_t N7 = 0)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {}
+
+  friend bool operator==(const LayoutLeft& left, const LayoutLeft& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank]) return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutLeft& left, const LayoutLeft& right) {
+    return !(left == right);
+  }
 };
 
 //----------------------------------------------------------------------------
@@ -123,6 +132,16 @@ struct LayoutRight {
                                  size_t N3 = 0, size_t N4 = 0, size_t N5 = 0,
                                  size_t N6 = 0, size_t N7 = 0)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {}
+
+  friend bool operator==(const LayoutRight& left, const LayoutRight& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank]) return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutRight& left, const LayoutRight& right) {
+    return !(left == right);
+  }
 };
 
 //----------------------------------------------------------------------------
@@ -184,6 +203,18 @@ struct LayoutStride {
                                   size_t S7 = 0)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3,
                                                           S4, S5, S6, S7} {}
+
+  friend bool operator==(const LayoutStride& left, const LayoutStride& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank] ||
+          left.stride[rank] != right.stride[rank])
+        return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutStride& left, const LayoutStride& right) {
+    return !(left == right);
+  }
 };
 
 // ===================================================================================
@@ -229,18 +260,6 @@ struct LayoutTiled {
   static_assert(IsPowerOfTwo,
                 "LayoutTiled must be given power-of-two tile dimensions");
 
-#if 0
-  static_assert( (Impl::is_integral_power_of_two(ArgN0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN1) ) &&
-                 (Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0) )
-               , "LayoutTiled must be given power-of-two tile dimensions" );
-#endif
-
   using array_layout = LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3,
                                    ArgN4, ArgN5, ArgN6, ArgN7, IsPowerOfTwo>;
   static constexpr Iterate outer_pattern = OuterP;
@@ -270,6 +289,16 @@ struct LayoutTiled {
                                  size_t argN4 = 0, size_t argN5 = 0,
                                  size_t argN6 = 0, size_t argN7 = 0)
       : dimension{argN0, argN1, argN2, argN3, argN4, argN5, argN6, argN7} {}
+
+  friend bool operator==(const LayoutTiled& left, const LayoutTiled& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank]) return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutTiled& left, const LayoutTiled& right) {
+    return !(left == right);
+  }
 };
 
 }  // namespace Experimental
diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp
index 0d01853465..8d0fd925a2 100644
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@@ -53,11 +53,12 @@
  *  KOKKOS_ENABLE_HPX                 Kokkos::Experimental::HPX execution space
  *  KOKKOS_ENABLE_OPENMP              Kokkos::OpenMP execution space
  *  KOKKOS_ENABLE_OPENMPTARGET        Kokkos::Experimental::OpenMPTarget
- * execution space KOKKOS_ENABLE_HWLOC               HWLOC library is available.
+ *                                    execution space
+ *  KOKKOS_ENABLE_HIP                 Kokkos::Experimental::HIP execution space
+ *  KOKKOS_ENABLE_SYCL                Kokkos::Experimental::SYCL execution space
+ *  KOKKOS_ENABLE_HWLOC               HWLOC library is available.
  *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK  Insert array bounds checks, is expensive!
- *  KOKKOS_ENABLE_MPI                 Negotiate MPI/execution space
- * interactions. KOKKOS_ENABLE_CUDA_UVM            Use CUDA UVM for Cuda memory
- * space.
+ *  KOKKOS_ENABLE_CUDA_UVM            Use CUDA UVM for Cuda memory space.
  */
 
 #ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
@@ -211,6 +212,11 @@
 #define KOKKOS_ENABLE_PRAGMA_SIMD 1
 #endif
 
+// FIXME Workaround for ICE with intel 17,18,19 in Trilinos
+#if (KOKKOS_COMPILER_INTEL <= 1900)
+#define KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+#endif
+
 // FIXME_SYCL
 #if !defined(KOKKOS_ENABLE_SYCL)
 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
@@ -220,11 +226,19 @@
 #define KOKKOS_MEMORY_ALIGNMENT 64
 #endif
 
+#if defined(_WIN32)
+#define KOKKOS_RESTRICT __restrict
+#else
 #define KOKKOS_RESTRICT __restrict__
+#endif
 
 #ifndef KOKKOS_IMPL_ALIGN_PTR
+#if defined(_WIN32)
+#define KOKKOS_IMPL_ALIGN_PTR(size) __declspec(align_value(size))
+#else
 #define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size)))
 #endif
+#endif
 
 #if (1700 > KOKKOS_COMPILER_INTEL)
 #error "Compiling with Intel version earlier than 17.0 is not supported."
@@ -507,24 +521,44 @@
 #if defined(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
 #define KOKKOS_ENABLE_TASKDAG
 #endif
+// FIXME_SYCL Tasks not implemented
 #elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL)
 #define KOKKOS_ENABLE_TASKDAG
 #endif
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#define KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND
-#if (__CUDA_ARCH__)
-#define KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-#endif
-#endif
-
 #define KOKKOS_INVALID_INDEX (~std::size_t(0))
 
 #define KOKKOS_IMPL_CTOR_DEFAULT_ARG KOKKOS_INVALID_INDEX
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 #define KOKKOS_CONSTEXPR_14 constexpr
-#define KOKKOS_DEPRECATED [[deprecated]]
 #define KOKKOS_DEPRECATED_TRAILING_ATTRIBUTE
+#endif
+
+// Guard intel compiler version <= 1900
+// intel error #2651: attribute does not apply to any entity
+// using <deprecated_type> KOKKOS_DEPRECATED = ...
+#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && !defined(__NVCC__) && \
+    (KOKKOS_COMPILER_INTEL > 1900)
+#define KOKKOS_DEPRECATED [[deprecated]]
+#define KOKKOS_DEPRECATED_WITH_COMMENT(comment) [[deprecated(comment)]]
+#else
+#define KOKKOS_DEPRECATED
+#define KOKKOS_DEPRECATED_WITH_COMMENT(comment)
+#endif
+
+#define KOKKOS_IMPL_STRINGIFY(x) #x
+#define KOKKOS_IMPL_TOSTRING(x) KOKKOS_IMPL_STRINGIFY(x)
+
+#ifdef _MSC_VER
+#define KOKKOS_IMPL_DO_PRAGMA(x) __pragma(x)
+#define KOKKOS_IMPL_WARNING(desc) \
+  KOKKOS_IMPL_DO_PRAGMA(message(  \
+      __FILE__ "(" KOKKOS_IMPL_TOSTRING(__LINE__) ") : warning: " #desc))
+#else
+#define KOKKOS_IMPL_DO_PRAGMA(x) _Pragma(#x)
+#define KOKKOS_IMPL_WARNING(desc) KOKKOS_IMPL_DO_PRAGMA(message(#desc))
+#endif
 
 // DJS 05/28/2019: Bugfix: Issue 2155
 // Use KOKKOS_ENABLE_CUDA_LDG_INTRINSIC to avoid memory leak in RandomAccess
@@ -541,7 +575,7 @@
 
 #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) ||  \
      defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_PGI)) && \
-    !defined(KOKKOS_COMPILER_MSVC)
+    !defined(_WIN32)
 #define KOKKOS_IMPL_ENABLE_STACKTRACE
 #define KOKKOS_IMPL_ENABLE_CXXABI
 #endif
@@ -553,7 +587,8 @@
 #undef __CUDA_ARCH__
 #endif
 
-#if defined(KOKKOS_COMPILER_MSVC) && !defined(KOKKOS_COMPILER_CLANG)
+#if (defined(KOKKOS_COMPILER_MSVC) && !defined(KOKKOS_COMPILER_CLANG)) || \
+    (defined(KOKKOS_COMPILER_INTEL) && defined(_WIN32))
 #define KOKKOS_THREAD_LOCAL __declspec(thread)
 #else
 #define KOKKOS_THREAD_LOCAL __thread
diff --git a/lib/kokkos/core/src/Kokkos_MasterLock.hpp b/lib/kokkos/core/src/Kokkos_MasterLock.hpp
index 3c45e131a0..cbfbb92660 100644
--- a/lib/kokkos/core/src/Kokkos_MasterLock.hpp
+++ b/lib/kokkos/core/src/Kokkos_MasterLock.hpp
@@ -47,6 +47,8 @@
 
 #include <Kokkos_Macros.hpp>
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+
 namespace Kokkos {
 namespace Experimental {
 
@@ -72,4 +74,6 @@ class MasterLock;
 }  // namespace Experimental
 }  // namespace Kokkos
 
+#endif
+
 #endif  // KOKKOS_MASTER_LOCK_HPP
diff --git a/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
index 50223651e7..50fde82d77 100644
--- a/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
+++ b/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
@@ -55,116 +55,224 @@
 #endif
 
 namespace Kokkos {
+
+namespace Impl {
+template <class T, bool = std::is_integral<T>::value>
+struct promote {
+  using type = double;
+};
+template <class T>
+struct promote<T, false> {};
+template <>
+struct promote<long double> {
+  using type = long double;
+};
+template <>
+struct promote<double> {
+  using type = double;
+};
+template <>
+struct promote<float> {
+  using type = float;
+};
+template <class T>
+using promote_t = typename promote<T>::type;
+template <class T, class U>
+struct promote_2 {
+  using type = decltype(promote_t<T>() + promote_t<U>());
+};
+template <class T, class U>
+using promote_2_t = typename promote_2<T, U>::type;
+}  // namespace Impl
+
 namespace Experimental {
 
 #if defined(KOKKOS_ENABLE_SYCL)
-#define NAMESPACE_MATH_FUNCTIONS sycl
+#define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE sycl
 #else
-#define NAMESPACE_MATH_FUNCTIONS std
+#define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE std
 #endif
 
-#define KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, RETURNTYPE, ARGTYPE) \
-  KOKKOS_INLINE_FUNCTION RETURNTYPE FUNC(ARGTYPE x) {                        \
-    using NAMESPACE_MATH_FUNCTIONS::FUNC;                                    \
-    return FUNC(x);                                                          \
-  }
-
-#define KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, RETURNTYPE)              \
-  template <typename Integer,                                              \
-            typename = std::enable_if_t<std::is_integral<Integer>::value>> \
-  KOKKOS_INLINE_FUNCTION RETURNTYPE FUNC(Integer x) {                      \
-    return Kokkos::Experimental::FUNC(static_cast<double>(x));             \
-  }
-
-#define KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, TYPE) \
-  KOKKOS_INLINE_FUNCTION TYPE FUNC(TYPE x, TYPE y) {           \
-    using NAMESPACE_MATH_FUNCTIONS::FUNC;                      \
-    return FUNC(x, y);                                         \
-  }
-
 // NOTE long double overloads are not available on the device
 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
     defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+#else
+#define KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+#endif
 
-#define KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)                         \
-  template <typename Arithmetic1, typename Arithmetic2,                      \
-            typename = std::enable_if_t<                                     \
-                std::is_arithmetic<Arithmetic1>::value &&                    \
-                std::is_arithmetic<Arithmetic2>::value &&                    \
-                !std::is_same<Arithmetic1, long double>::value &&            \
-                !std::is_same<Arithmetic2, long double>::value>>             \
-  KOKKOS_INLINE_FUNCTION double FUNC(Arithmetic1 x, Arithmetic2 y) {         \
-    return Kokkos::Experimental::FUNC(                                       \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic1>::value, \
-                                       double, Arithmetic1>>(x),             \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic2>::value, \
-                                       double, Arithmetic2>>(y));            \
+#if defined(KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS)
+
+#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                                 \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x) {                                \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION long double FUNC(long double x) {                    \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x) {                             \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION long double FUNC##l(long double x) {                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  template <class T>                                                          \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, double> \
+  FUNC(T x) {                                                                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(static_cast<double>(x));                                      \
   }
 
-#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                     \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, float, float)   \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, double, double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, double)
-
-#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                  \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, float)  \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, bool)
-
-#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)             \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, float)  \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, double) \
-  KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)
-
-#define KOKKOS_IMPL_MATH_NAN()                                        \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanf, float, char const*) \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nan, double, char const*)
-
-#else  // long double overloads are available
-
-#define KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)                         \
-  template <typename Arithmetic1, typename Arithmetic2,                      \
-            typename =                                                       \
-                std::enable_if_t<std::is_arithmetic<Arithmetic1>::value &&   \
-                                 std::is_arithmetic<Arithmetic2>::value>,    \
-            typename Promoted = std::conditional_t<                          \
-                std::is_same<Arithmetic1, long double>::value ||             \
-                    std::is_same<Arithmetic2, long double>::value,           \
-                long double, double>>                                        \
-  KOKKOS_INLINE_FUNCTION Promoted FUNC(Arithmetic1 x, Arithmetic2 y) {       \
-    return Kokkos::Experimental::FUNC(                                       \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic1>::value, \
-                                       double, Arithmetic1>>(x),             \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic2>::value, \
-                                       double, Arithmetic2>>(y));            \
+#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                              \
+  KOKKOS_INLINE_FUNCTION bool FUNC(float x) {                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  KOKKOS_INLINE_FUNCTION bool FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  KOKKOS_INLINE_FUNCTION bool FUNC(long double x) {                         \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  template <class T>                                                        \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, bool> \
+  FUNC(T x) {                                                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(static_cast<double>(x));                                    \
   }
 
-#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                               \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, float, float)             \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, double, double)           \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, long double, long double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, double)
+#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                               \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x, float y) {                      \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x, double y) {                   \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION long double FUNC(long double x, long double y) {    \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x, float y) {                   \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION long double FUNC##l(long double x, long double y) { \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  template <class T1, class T2>                                              \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_arithmetic<T1>::value &&   \
+                                              std::is_arithmetic<T2>::value, \
+                                          Kokkos::Impl::promote_2_t<T1, T2>> \
+  FUNC(T1 x, T2 y) {                                                         \
+    using Promoted = Kokkos::Impl::promote_2_t<T1, T2>;                      \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(static_cast<Promoted>(x), static_cast<Promoted>(y));         \
+  }
 
-#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                       \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, float)       \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, double)      \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, long double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, bool)
+#else  // long double overloads are not available
 
-#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                  \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, float)       \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, double)      \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, long double) \
-  KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)
+#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                                 \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x) {                                \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x) {                             \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  template <class T>                                                          \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, double> \
+  FUNC(T x) {                                                                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(static_cast<double>(x));                                      \
+  }
 
-#define KOKKOS_IMPL_MATH_NAN()                                        \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanf, float, char const*) \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nan, double, char const*) \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanl, long double, char const*)
+#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                              \
+  KOKKOS_INLINE_FUNCTION bool FUNC(float x) {                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  KOKKOS_INLINE_FUNCTION bool FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  template <class T>                                                        \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, bool> \
+  FUNC(T x) {                                                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(static_cast<double>(x));                                    \
+  }
+
+#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                          \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x, float y) {                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(x, y);                                                  \
+  }                                                                     \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x, double y) {              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(x, y);                                                  \
+  }                                                                     \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x, float y) {              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(x, y);                                                  \
+  }                                                                     \
+  template <class T1, class T2>                                         \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<                              \
+      std::is_arithmetic<T1>::value && std::is_arithmetic<T2>::value && \
+          !std::is_same<T1, long double>::value &&                      \
+          !std::is_same<T2, long double>::value,                        \
+      Kokkos::Impl::promote_2_t<T1, T2>>                                \
+  FUNC(T1 x, T2 y) {                                                    \
+    using Promoted = Kokkos::Impl::promote_2_t<T1, T2>;                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(static_cast<Promoted>(x), static_cast<Promoted>(y));    \
+  }
 
 #endif
 
 // Basic operations
+KOKKOS_INLINE_FUNCTION int abs(int n) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(n);
+}
+KOKKOS_INLINE_FUNCTION long abs(long n) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(n);
+}
+KOKKOS_INLINE_FUNCTION long long abs(long long n) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(n);
+}
+KOKKOS_INLINE_FUNCTION float abs(float x) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(x);
+}
+KOKKOS_INLINE_FUNCTION double abs(double x) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(x);
+}
+#if defined(KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS)
+KOKKOS_INLINE_FUNCTION long double abs(long double x) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(x);
+}
+#endif
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder)
@@ -172,7 +280,18 @@ KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmin)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmax)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fdim)
 #ifndef KOKKOS_ENABLE_SYCL
-KOKKOS_IMPL_MATH_NAN()
+KOKKOS_INLINE_FUNCTION float nanf(char const* arg) { return ::nanf(arg); }
+KOKKOS_INLINE_FUNCTION double nan(char const* arg) { return ::nan(arg); }
+#if defined(KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS)
+KOKKOS_INLINE_FUNCTION long double nanl(char const* arg) { return ::nanl(arg); }
+#endif
+#else
+// FIXME_SYCL
+// sycl::nan does not follow the C/C++ standard library and takes an unsigned
+// integer as argument.  The current implementation does not attempt to convert
+// the character string arg into the quiet NaN value.
+KOKKOS_INLINE_FUNCTION float nanf(char const*) { return sycl::nan(0u); }
+KOKKOS_INLINE_FUNCTION double nan(char const*) { return sycl::nan(0ul); }
 #endif
 // Power functions
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(pow)
@@ -211,6 +330,7 @@ KOKKOS_IMPL_MATH_UNARY_FUNCTION(lgamma)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(ceil)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(floor)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(trunc)
+// FIXME_SYCL not available as of current SYCL specification v1.2.1
 #ifndef KOKKOS_ENABLE_SYCL
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(nearbyint)
 #endif
@@ -219,14 +339,12 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(isfinite)
 KOKKOS_IMPL_MATH_UNARY_PREDICATE(isinf)
 KOKKOS_IMPL_MATH_UNARY_PREDICATE(isnan)
 
-#undef KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT
-#undef KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL
-#undef KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT
-#undef KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC
+#undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE
+#undef KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
 #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION
 #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE
 #undef KOKKOS_IMPL_MATH_BINARY_FUNCTION
-#undef KOKKOS_IMPL_MATH_NAN
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
diff --git a/lib/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp b/lib/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
new file mode 100644
index 0000000000..7bcea91c86
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
@@ -0,0 +1,1280 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MATHEMATICAL_SPECIAL_FUNCTIONS_HPP
+#define KOKKOS_MATHEMATICAL_SPECIAL_FUNCTIONS_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cmath>
+#include <algorithm>
+#include <type_traits>
+#include <Kokkos_MathematicalFunctions.hpp>
+#include <Kokkos_NumericTraits.hpp>
+#include <Kokkos_Complex.hpp>
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+namespace Kokkos {
+namespace Experimental {
+
+//! Compute exponential integral E1(x) (x > 0).
+template <class RealType>
+KOKKOS_INLINE_FUNCTION RealType expint1(RealType x) {
+  // This function is a conversion of the corresponding Fortran program in
+  // S. Zhang & J. Jin "Computation of Special Functions" (Wiley, 1996).
+  using Kokkos::Experimental::epsilon;
+  using Kokkos::Experimental::exp;
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::log;
+  using Kokkos::Experimental::pow;
+
+  RealType e1;
+
+  if (x < 0) {
+    e1 = -infinity<RealType>::value;
+  } else if (x == 0.0) {
+    e1 = infinity<RealType>::value;
+  } else if (x <= 1.0) {
+    e1         = 1.0;
+    RealType r = 1.0;
+    for (int k = 1; k <= 25; k++) {
+      RealType k_real = static_cast<RealType>(k);
+      r               = -r * k_real * x / pow(k_real + 1.0, 2.0);
+      e1              = e1 + r;
+      if (fabs(r) <= fabs(e1) * epsilon<RealType>::value) break;
+    }
+    e1 = -0.5772156649015328 - log(x) + x * e1;
+  } else {
+    int m       = 20 + static_cast<int>(80.0 / x);
+    RealType t0 = 0.0;
+    for (int k = m; k >= 1; k--) {
+      RealType k_real = static_cast<RealType>(k);
+      t0              = k_real / (1.0 + k_real / (x + t0));
+    }
+    e1 = exp(-x) * (1.0 / (x + t0));
+  }
+  return e1;
+}
+
+//! Compute error function erf(z) for z=cmplx(x,y).
+template <class RealType>
+KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> erf(
+    const Kokkos::complex<RealType>& z) {
+  // This function is a conversion of the corresponding Fortran program written
+  // by D.E. Amos, May,1974. D.E. Amos' revisions of Jan 86 incorporated by
+  // Ken Damrau on 27-Jan-1986 14:37:13
+  //
+  // Reference: NBS HANDBOOK OF MATHEMATICAL FUNCTIONS, AMS 55, By
+  //           M. ABRAMOWITZ AND I.A. STEGUN, December,1955.
+  // Summary:
+  //  If x < 0, z is replaced by -z and all computation is done in the right
+  //  half lane, except for z inside the circle abs(z)<=2, since
+  //  erf(-z)=-erf(z). The regions for computation are divided as follows
+  //      (1)  abs(z)<=2 - Power series, NBS Handbook, p. 298
+  //      (2)  abs(z)>2 and x>1 - continued fraction, NBS Handbook, p. 298
+  //      (3)  abs(z)>2 and 0<=x<=1 and abs(y)<6 - series, NBS Handbook, p. 299
+  //      (4)  abs(z)>2 and 0<=x<=1 and abs(y)>=6 - asymtotic expansion
+  //  Error condition: abs(z^2) > 670 is a fatal overflow error
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::epsilon;
+  using Kokkos::Experimental::exp;
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::sin;
+
+  using CmplxType = Kokkos::complex<RealType>;
+
+  auto const inf = infinity<RealType>::value;
+  auto const tol = epsilon<RealType>::value;
+
+  const RealType fnorm = 1.12837916709551;
+  const RealType gnorm = 0.564189583547756;
+  const RealType eh    = 0.606530659712633;
+  const RealType ef    = 0.778800783071405;
+  // const RealType tol   = 1.0e-13;
+  const RealType pi = M_PI;
+
+  CmplxType cans;
+
+  RealType az = Kokkos::abs(z);
+  if (az <= 2.0) {  // Series for abs(z)<=2.0
+    CmplxType cz    = z * z;
+    CmplxType accum = CmplxType(1.0, 0.0);
+    CmplxType term  = accum;
+    RealType ak     = 1.5;
+    for (int i = 1; i <= 35; i++) {
+      term  = term * cz / ak;
+      accum = accum + term;
+      if (Kokkos::abs(term) <= tol) break;
+      ak = ak + 1.0;
+    }
+    cz          = -cz;
+    RealType er = cz.real();
+    RealType ei = cz.imag();
+    accum       = accum * z * fnorm;
+    cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+    cans        = accum * cz;
+  }       // end (az <= 2.0)
+  else {  //(az > 2.0)
+    CmplxType zp = z;
+    if (z.real() < 0.0) zp = -z;
+    CmplxType cz = zp * zp;
+    RealType xp  = zp.real();
+    RealType yp  = zp.imag();
+    if (xp > 1.0) {
+      // continued fraction for erfc(z), abs(Z)>2
+      int n          = static_cast<int>(100.0 / az + 5.0);
+      int fn         = n;
+      CmplxType term = cz;
+      for (int i = 1; i <= n; i++) {
+        RealType fnh = fn - 0.5;
+        term         = cz + (fnh * term) / (fn + term);
+        fn           = fn - 1;
+      }
+      if (Kokkos::abs(cz) > 670.0) return CmplxType(inf, inf);
+      cz              = -cz;
+      RealType er     = cz.real();
+      RealType ei     = cz.imag();
+      cz              = exp(er) * CmplxType(cos(ei), sin(ei));
+      CmplxType accum = zp * gnorm * cz;
+      cans            = 1.0 - accum / term;
+      if (z.real() < 0.0) cans = -cans;
+    }       // end (xp > 1.0)
+    else {  //(xp <= 1.0)
+      if (fabs(yp) <
+          6.0) {  // Series (3) for abs(z)>2 and 0<=xp<=1 and abs(yp)<6
+        RealType s1   = 0.0;
+        RealType s2   = 0.0;
+        RealType x2   = xp * xp;
+        RealType fx2  = 4.0 * x2;
+        RealType tx   = xp + xp;
+        RealType xy   = xp * yp;
+        RealType sxyh = sin(xy);
+        RealType sxy  = sin(xy + xy);
+        RealType cxy  = cos(xy + xy);
+        RealType fn   = 1.0;
+        RealType fnh  = 0.5;
+        RealType ey   = exp(yp);
+        RealType en   = ey;
+        RealType ehn  = eh;
+        RealType un   = ef;
+        RealType vn   = 1.0;
+        for (int i = 1; i <= 50; i++) {
+          RealType ren = 1.0 / en;
+          RealType csh = en + ren;
+          RealType tm  = xp * csh;
+          RealType ssh = en - ren;
+          RealType tmp = fnh * ssh;
+          RealType rn  = tx - tm * cxy + tmp * sxy;
+          RealType ain = tm * sxy + tmp * cxy;
+          RealType cf  = un / (vn + fx2);
+          rn           = cf * rn;
+          ain          = cf * ain;
+          s1           = s1 + rn;
+          s2           = s2 + ain;
+          if ((fabs(rn) + fabs(ain)) < tol * (fabs(s1) + fabs(s2))) break;
+          un  = un * ehn * ef;
+          ehn = ehn * eh;
+          en  = en * ey;
+          vn  = vn + fn + fn + 1.0;
+          fnh = fnh + 0.5;
+          fn  = fn + 1.0;
+        }
+        s1 = s1 + s1;
+        s2 = s2 + s2;
+        if (z.real() == 0.0)
+          s2 = s2 + yp;
+        else {
+          s1 = s1 + sxyh * sxyh / xp;
+          s2 = s2 + sxy / tx;
+        }
+        // Power series for erf(xp), 0<=xp<=1
+        RealType w  = 1.0;
+        RealType ak = 1.5;
+        RealType tm = 1.0;
+        for (int i = 1; i <= 17; i++) {
+          tm = tm * x2 / ak;
+          w  = w + tm;
+          if (tm <= tol) break;
+          ak = ak + 1.0;
+        }
+        RealType ex = exp(-x2);
+        w           = w * xp * fnorm * ex;
+        RealType cf = ex / pi;
+        s1          = cf * s1 + w;
+        s2          = cf * s2;
+        cans        = CmplxType(s1, s2);
+        if (z.real() < 0.0) cans = -cans;
+      }       // end (abs(yp) < 6.0)
+      else {  //(abs(YP)>=6.0)
+        // Asymtotic expansion for 0<=xp<=1 and abs(yp)>=6
+        CmplxType rcz   = 0.5 / cz;
+        CmplxType accum = CmplxType(1.0, 0.0);
+        CmplxType term  = accum;
+        RealType ak     = 1.0;
+        for (int i = 1; i <= 35; i++) {
+          term  = -term * ak * rcz;
+          accum = accum + term;
+          if (Kokkos::abs(term) / Kokkos::abs(accum) <= tol) break;
+          ak = ak + 2.0;
+        }
+        accum       = accum * gnorm / zp;
+        cz          = -cz;
+        RealType er = cz.real();
+        if (fabs(er) > 670.0) return CmplxType(inf, inf);
+        RealType ei = cz.imag();
+        cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+        cans        = 1.0 - accum * cz;
+        if (z.real() < 0.0) cans = -cans;
+      }  // end (abs(YP)>=6.0)
+    }    // end (xp <= 1.0)
+  }      // end (az > 2.0)
+  return cans;
+}
+
+//! Compute scaled complementary error function erfcx(z)=exp(z^2)*erfc(z)
+//! for z=cmplx(x,y).
+template <class RealType>
+KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> erfcx(
+    const Kokkos::complex<RealType>& z) {
+  // This function is a conversion of the corresponding Fortran program written
+  // by D.E. Amos, May,1974. D.E. Amos' revisions of Jan 86 incorporated by
+  // Ken Damrau on 27-Jan-1986 14:37:13
+  //
+  // Reference: NBS HANDBOOK OF MATHEMATICAL FUNCTIONS, AMS 55, By
+  //           M. ABRAMOWITZ AND I.A. STEGUN, December,1955.
+  // Summary:
+  //  If x < 0, z is replaced by -z and all computation is done in the right
+  //  half lane, except for z inside the circle abs(z)<=2, since
+  //  erfc(-z)=2-erfc(z). The regions for computation are divided as follows
+  //      (1)  abs(z)<=2 - Power series, NBS Handbook, p. 298
+  //      (2)  abs(z)>2 and x>1 - continued fraction, NBS Handbook, p. 298
+  //      (3)  abs(z)>2 and 0<=x<=1 and abs(y)<6 - series, NBS Handbook, p. 299
+  //      (4)  abs(z)>2 and 0<=x<=1 and abs(y)>=6 - asymtotic expansion
+  // Error condition: abs(z^2) > 670 is a fatal overflow error when x<0
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::epsilon;
+  using Kokkos::Experimental::exp;
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::isinf;
+  using Kokkos::Experimental::sin;
+
+  using CmplxType = Kokkos::complex<RealType>;
+
+  auto const inf = infinity<RealType>::value;
+  auto const tol = epsilon<RealType>::value;
+
+  const RealType fnorm = 1.12837916709551;
+  const RealType gnorm = 0.564189583547756;
+  const RealType eh    = 0.606530659712633;
+  const RealType ef    = 0.778800783071405;
+  // const RealType tol   = 1.0e-13;
+  const RealType pi = M_PI;
+
+  CmplxType cans;
+
+  if ((isinf(z.real())) && (z.real() > 0)) {
+    cans = CmplxType(0.0, 0.0);
+    return cans;
+  }
+  if ((isinf(z.real())) && (z.real() < 0)) {
+    cans = CmplxType(inf, inf);
+    return cans;
+  }
+
+  RealType az = Kokkos::abs(z);
+  if (az <= 2.0) {  // Series for abs(z)<=2.0
+    CmplxType cz    = z * z;
+    CmplxType accum = CmplxType(1.0, 0.0);
+    CmplxType term  = accum;
+    RealType ak     = 1.5;
+    for (int i = 1; i <= 35; i++) {
+      term  = term * cz / ak;
+      accum = accum + term;
+      if (Kokkos::abs(term) <= tol) break;
+      ak = ak + 1.0;
+    }
+    cz          = -cz;
+    RealType er = cz.real();
+    RealType ei = cz.imag();
+    accum       = accum * z * fnorm;
+    cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+    cans        = 1.0 / cz - accum;
+  }       // end (az <= 2.0)
+  else {  //(az > 2.0)
+    CmplxType zp = z;
+    if (z.real() < 0.0) zp = -z;
+    CmplxType cz = zp * zp;
+    RealType xp  = zp.real();
+    RealType yp  = zp.imag();
+    if (xp > 1.0) {
+      // continued fraction for erfc(z), abs(z)>2
+      int n          = static_cast<int>(100.0 / az + 5.0);
+      int fn         = n;
+      CmplxType term = cz;
+      for (int i = 1; i <= n; i++) {
+        RealType fnh = fn - 0.5;
+        term         = cz + (fnh * term) / (fn + term);
+        fn           = fn - 1;
+      }
+      cans = zp * gnorm / term;
+      if (z.real() >= 0.0) return cans;
+      if (Kokkos::abs(cz) > 670.0) return CmplxType(inf, inf);
+      ;
+      cz          = -cz;
+      RealType er = cz.real();
+      RealType ei = cz.imag();
+      cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+      cz          = 1.0 / cz;
+      cans        = cz + cz - cans;
+    }       // end (xp > 1.0)
+    else {  //(xp <= 1.0)
+      if (fabs(yp) <
+          6.0) {  // Series (3) for abs(z)>2 and 0<=xp<=1 and abs(yp)<6
+        RealType s1   = 0.0;
+        RealType s2   = 0.0;
+        RealType x2   = xp * xp;
+        RealType fx2  = 4.0 * x2;
+        RealType tx   = xp + xp;
+        RealType xy   = xp * yp;
+        RealType sxyh = sin(xy);
+        RealType sxy  = sin(xy + xy);
+        RealType cxy  = cos(xy + xy);
+        RealType fn   = 1.0;
+        RealType fnh  = 0.5;
+        RealType ey   = exp(yp);
+        RealType en   = ey;
+        RealType ehn  = eh;
+        RealType un   = ef;
+        RealType vn   = 1.0;
+        for (int i = 1; i <= 50; i++) {
+          RealType ren = 1.0 / en;
+          RealType csh = en + ren;
+          RealType tm  = xp * csh;
+          RealType ssh = en - ren;
+          RealType tmp = fnh * ssh;
+          RealType rn  = tx - tm * cxy + tmp * sxy;
+          RealType ain = tm * sxy + tmp * cxy;
+          RealType cf  = un / (vn + fx2);
+          rn           = cf * rn;
+          ain          = cf * ain;
+          s1           = s1 + rn;
+          s2           = s2 + ain;
+          if ((fabs(rn) + fabs(ain)) < tol * (fabs(s1) + fabs(s2))) break;
+          un  = un * ehn * ef;
+          ehn = ehn * eh;
+          en  = en * ey;
+          vn  = vn + fn + fn + 1.0;
+          fnh = fnh + 0.5;
+          fn  = fn + 1.0;
+        }
+        s1 = s1 + s1;
+        s2 = s2 + s2;
+        if (z.real() == 0.0)
+          s2 = s2 + yp;
+        else {
+          s1 = s1 + sxyh * sxyh / xp;
+          s2 = s2 + sxy / tx;
+        }
+        // Power series for erf(xp), 0<=xp<=1
+        RealType w  = 1.0;
+        RealType ak = 1.5;
+        RealType tm = 1.0;
+        for (int i = 1; i <= 17; i++) {
+          tm = tm * x2 / ak;
+          w  = w + tm;
+          if (tm <= tol) break;
+          ak = ak + 1.0;
+        }
+        RealType ex   = exp(-x2);
+        w             = w * xp * fnorm * ex;
+        CmplxType rcz = CmplxType(cxy, sxy);
+        RealType y2   = yp * yp;
+        cz            = exp(x2 - y2) * rcz;
+        rcz           = exp(-y2) * rcz;
+        if (z.real() >= 0.0)
+          cans = cz * (1.0 - w) - rcz * CmplxType(s1, s2) / pi;
+        else
+          cans = cz * (1.0 + w) + rcz * CmplxType(s1, s2) / pi;
+      }       // end (abs(yp) < 6.0)
+      else {  //(abs(YP)>=6.0)
+        // Asymtotic expansion for 0<=xp<=1 and abs(yp)>=6
+        CmplxType rcz   = 0.5 / cz;
+        CmplxType accum = CmplxType(1.0, 0.0);
+        CmplxType term  = accum;
+        RealType ak     = 1.0;
+        for (int i = 1; i <= 35; i++) {
+          term  = -term * ak * rcz;
+          accum = accum + term;
+          if (Kokkos::abs(term) / Kokkos::abs(accum) <= tol) break;
+          ak = ak + 2.0;
+        }
+        accum = accum * gnorm / zp;
+        if (z.real() < 0.0) accum = -accum;
+        cans = accum;
+      }  // end (abs(YP)>=6.0)
+    }    // end (xp <= 1.0)
+  }      // end (az > 2.0)
+  return cans;
+}
+
+//! Compute scaled complementary error function erfcx(x)=exp(x^2)*erfc(x)
+//! for real x
+template <class RealType>
+KOKKOS_INLINE_FUNCTION RealType erfcx(RealType x) {
+  using CmplxType = Kokkos::complex<RealType>;
+  // Note: using erfcx(complex) for now
+  // TODO: replace with an implementation of erfcx(real)
+  CmplxType zin  = CmplxType(x, 0.0);
+  CmplxType zout = erfcx(zin);
+  return zout.real();
+}
+
+//! Compute Bessel function J0(z) of the first kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_j0(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  // Input :  z         --- Complex argument
+  //         joint_val --- Joint point of abs(z) separating small and large
+  //                       argument regions
+  //         bw_start  --- Starting point for backward recurrence
+  // Output:  cbj0      --- J0(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::pow;
+
+  CmplxType cbj0;
+  const RealType pi    = M_PI;
+  const RealType a[12] = {
+      -0.703125e-01,           0.112152099609375e+00,   -0.5725014209747314e+00,
+      0.6074042001273483e+01,  -0.1100171402692467e+03, 0.3038090510922384e+04,
+      -0.1188384262567832e+06, 0.6252951493434797e+07,  -0.4259392165047669e+09,
+      0.3646840080706556e+11,  -0.3833534661393944e+13, 0.4854014686852901e+15};
+  const RealType b[12] = {0.732421875e-01,        -0.2271080017089844e+00,
+                          0.1727727502584457e+01, -0.2438052969955606e+02,
+                          0.5513358961220206e+03, -0.1825775547429318e+05,
+                          0.8328593040162893e+06, -0.5006958953198893e+08,
+                          0.3836255180230433e+10, -0.3649010818849833e+12,
+                          0.4218971570284096e+14, -0.5827244631566907e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbj0 = CmplxType(1.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 0) cbj0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj0 = cbj0 / cs0;
+    } else {  // Using asymptotic expansion (5.2.5) for |z|>joint_val
+              // (default:25)
+      CmplxType ct1 = z1 - 0.25 * pi;
+      CmplxType cp0 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.9)
+        cp0 = cp0 + a[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq0 = -0.125 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.10)
+        cq0 = cq0 + b[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj0         = cu * (cp0 * Kokkos::cos(ct1) - cq0 * Kokkos::sin(ct1));
+    }
+  }
+  return cbj0;
+}
+
+//! Compute Bessel function Y0(z) of the second kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_y0(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cby0      --- Y0(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cby0, cbj0;
+  const RealType pi    = M_PI;
+  const RealType el    = 0.57721566490153286060651209008240;
+  const RealType a[12] = {
+      -0.703125e-01,           0.112152099609375e+00,   -0.5725014209747314e+00,
+      0.6074042001273483e+01,  -0.1100171402692467e+03, 0.3038090510922384e+04,
+      -0.1188384262567832e+06, 0.6252951493434797e+07,  -0.4259392165047669e+09,
+      0.3646840080706556e+11,  -0.3833534661393944e+13, 0.4854014686852901e+15};
+  const RealType b[12] = {0.732421875e-01,        -0.2271080017089844e+00,
+                          0.1727727502584457e+01, -0.2438052969955606e+02,
+                          0.5513358961220206e+03, -0.1825775547429318e+05,
+                          0.8328593040162893e+06, -0.5006958953198893e+08,
+                          0.3836255180230433e+10, -0.3649010818849833e+12,
+                          0.4218971570284096e+14, -0.5827244631566907e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cby0 = -CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0, ce;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 0) cbj0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj0 = cbj0 / cs0;
+      ce   = Kokkos::log(z / 2.0) + el;
+      cby0 = r2p * (ce * cbj0 - 4.0 * csu / cs0);
+    } else {  // Using asymptotic expansion (5.2.6) for |z|>joint_val
+              // (default:25)
+      CmplxType ct1 = z1 - 0.25 * pi;
+      CmplxType cp0 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.9)
+        cp0 = cp0 + a[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq0 = -0.125 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.10)
+        cq0 = cq0 + b[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj0         = cu * (cp0 * Kokkos::cos(ct1) - cq0 * Kokkos::sin(ct1));
+      cby0         = cu * (cp0 * Kokkos::sin(ct1) + cq0 * Kokkos::cos(ct1));
+
+      if (z.real() < 0.0) {  // Apply (5.4.2)
+        if (z.imag() < 0.0) cby0 = cby0 - 2.0 * ci * cbj0;
+        if (z.imag() >= 0.0) cby0 = cby0 + 2.0 * ci * cbj0;
+      }
+    }
+  }
+  return cby0;
+}
+
+//! Compute Bessel function J1(z) of the first kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_j1(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbj1      --- J1(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::pow;
+
+  CmplxType cbj1;
+  const RealType pi     = M_PI;
+  const RealType a1[12] = {0.1171875e+00,          -0.144195556640625e+00,
+                           0.6765925884246826e+00, -0.6883914268109947e+01,
+                           0.1215978918765359e+03, -0.3302272294480852e+04,
+                           0.1276412726461746e+06, -0.6656367718817688e+07,
+                           0.4502786003050393e+09, -0.3833857520742790e+11,
+                           0.4011838599133198e+13, -0.5060568503314727e+15};
+  const RealType b1[12] = {
+      -0.1025390625e+00,       0.2775764465332031e+00,  -0.1993531733751297e+01,
+      0.2724882731126854e+02,  -0.6038440767050702e+03, 0.1971837591223663e+05,
+      -0.8902978767070678e+06, 0.5310411010968522e+08,  -0.4043620325107754e+10,
+      0.3827011346598605e+12,  -0.4406481417852278e+14, 0.6065091351222699e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbj1 = CmplxType(0.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 1) cbj1 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj1 = cbj1 / cs0;
+    } else {  // Using asymptotic expansion (5.2.5) for |z|>joint_val
+              // (default:25)
+      CmplxType ct2 = z1 - 0.75 * pi;
+      CmplxType cp1 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.11)
+        cp1 = cp1 + a1[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq1 = 0.375 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.12)
+        cq1 = cq1 + b1[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj1         = cu * (cp1 * Kokkos::cos(ct2) - cq1 * Kokkos::sin(ct2));
+
+      if (real(z) < 0.0) {  // Apply (5.4.2)
+        cbj1 = -cbj1;
+      }
+    }
+  }
+  return cbj1;
+}
+
+//! Compute Bessel function Y1(z) of the second kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_y1(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cby1      --- Y1(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cby1, cbj0, cbj1, cby0;
+  const RealType pi     = M_PI;
+  const RealType el     = 0.57721566490153286060651209008240;
+  const RealType a1[12] = {0.1171875e+00,          -0.144195556640625e+00,
+                           0.6765925884246826e+00, -0.6883914268109947e+01,
+                           0.1215978918765359e+03, -0.3302272294480852e+04,
+                           0.1276412726461746e+06, -0.6656367718817688e+07,
+                           0.4502786003050393e+09, -0.3833857520742790e+11,
+                           0.4011838599133198e+13, -0.5060568503314727e+15};
+  const RealType b1[12] = {
+      -0.1025390625e+00,       0.2775764465332031e+00,  -0.1993531733751297e+01,
+      0.2724882731126854e+02,  -0.6038440767050702e+03, 0.1971837591223663e+05,
+      -0.8902978767070678e+06, 0.5310411010968522e+08,  -0.4043620325107754e+10,
+      0.3827011346598605e+12,  -0.4406481417852278e+14, 0.6065091351222699e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cby1 = -CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0, ce;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 1) cbj1 = cf;
+        if (k == 0) cbj0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj0 = cbj0 / cs0;
+      ce   = Kokkos::log(z / 2.0) + el;
+      cby0 = r2p * (ce * cbj0 - 4.0 * csu / cs0);
+      cbj1 = cbj1 / cs0;
+      cby1 = (cbj1 * cby0 - 2.0 / (pi * z)) / cbj0;
+    } else {  // Using asymptotic expansion (5.2.5) for |z|>joint_val
+              // (default:25)
+      CmplxType ct2 = z1 - 0.75 * pi;
+      CmplxType cp1 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.11)
+        cp1 = cp1 + a1[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq1 = 0.375 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.12)
+        cq1 = cq1 + b1[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj1         = cu * (cp1 * Kokkos::cos(ct2) - cq1 * Kokkos::sin(ct2));
+      cby1         = cu * (cp1 * Kokkos::sin(ct2) + cq1 * Kokkos::cos(ct2));
+
+      if (z.real() < 0.0) {  // Apply (5.4.2)
+        if (z.imag() < 0.0) cby1 = -(cby1 - 2.0 * ci * cbj1);
+        if (z.imag() >= 0.0) cby1 = -(cby1 + 2.0 * ci * cbj1);
+      }
+    }
+  }
+  return cby1;
+}
+
+//! Compute modified Bessel function I0(z) of the first kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_i0(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbi0      --- I0(z)
+  CmplxType cbi0;
+  const RealType pi    = M_PI;
+  const RealType a[12] = {0.125,
+                          7.03125e-2,
+                          7.32421875e-2,
+                          1.1215209960938e-1,
+                          2.2710800170898e-1,
+                          5.7250142097473e-1,
+                          1.7277275025845e0,
+                          6.0740420012735e0,
+                          2.4380529699556e1,
+                          1.1001714026925e2,
+                          5.5133589612202e2,
+                          3.0380905109224e3};
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbi0 = CmplxType(1.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      // CmplxType csk0 = CmplxType(0.0,0.0);
+      CmplxType cf0 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 0) cbi0 = cf;
+        // if ((k == 2*(k/2)) && (k != 0)) {
+        //  csk0 = csk0+4.0*cf/static_cast<RealType>(k);
+        //}
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi0 = cbi0 * cs0;
+    } else {  // Using asymptotic expansion (6.2.1) for |z|>joint_val
+              // (default:25)
+      CmplxType ca = Kokkos::exp(z1) / Kokkos::sqrt(2.0 * pi * z1);
+      cbi0         = CmplxType(1.0, 0.0);
+      CmplxType zr = 1.0 / z1;
+      for (int k = 1; k <= 12; k++) {
+        cbi0 = cbi0 + a[k - 1] * Kokkos::pow(zr, 1.0 * k);
+      }
+      cbi0 = ca * cbi0;
+    }
+  }
+  return cbi0;
+}
+
+//! Compute modified Bessel function K0(z) of the second kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_k0(const CmplxType& z,
+                                               const RealType& joint_val = 9,
+                                               const IntType& bw_start   = 30) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Purpose: Compute modified Bessel function K0(z) of the second kind of
+  //             order zero for a complex argument
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbk0      --- K0(z)
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cbk0, cbi0;
+  const RealType pi = M_PI;
+  const RealType el = 0.57721566490153286060651209008240;
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbk0 = CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:9)
+      CmplxType cbs  = CmplxType(0.0, 0.0);
+      CmplxType csk0 = CmplxType(0.0, 0.0);
+      CmplxType cf0  = CmplxType(0.0, 0.0);
+      CmplxType cf1  = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 30)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 0) cbi0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          csk0 = csk0 + 4.0 * cf / static_cast<RealType>(k);
+        }
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi0 = cbi0 * cs0;
+      cbk0 = -(Kokkos::log(0.5 * z1) + el) * cbi0 + cs0 * csk0;
+    } else {  // Using asymptotic expansion (6.2.2) for |z|>joint_val
+              // (default:9)
+      CmplxType ca0  = Kokkos::sqrt(pi / (2.0 * z1)) * Kokkos::exp(-z1);
+      CmplxType cbkl = CmplxType(1.0, 0.0);
+      CmplxType cr   = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 30; k++) {
+        cr   = 0.125 * cr * (0.0 - pow(2.0 * k - 1.0, 2.0)) / (k * z1);
+        cbkl = cbkl + cr;
+      }
+      cbk0 = ca0 * cbkl;
+    }
+    if (z.real() < 0.0) {  // Apply (6.4.4)
+      if (z.imag() < 0.0)
+        cbk0 = cbk0 + ci * pi * cyl_bessel_i0<CmplxType, RealType, IntType>(z);
+      if (z.imag() >= 0.0)
+        cbk0 = cbk0 - ci * pi * cyl_bessel_i0<CmplxType, RealType, IntType>(z);
+    }
+  }
+  return cbk0;
+}
+
+//! Compute modified Bessel function I1(z) of the first kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_i1(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbi1      --- I1(z)
+  CmplxType cbi1;
+  const RealType pi    = M_PI;
+  const RealType b[12] = {-0.375,
+                          -1.171875e-1,
+                          -1.025390625e-1,
+                          -1.4419555664063e-1,
+                          -2.7757644653320e-1,
+                          -6.7659258842468e-1,
+                          -1.9935317337513,
+                          -6.8839142681099,
+                          -2.7248827311269e1,
+                          -1.2159789187654e2,
+                          -6.0384407670507e2,
+                          -3.3022722944809e3};
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbi1 = CmplxType(0.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      // CmplxType csk0 = CmplxType(0.0,0.0);
+      CmplxType cf0 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 1) cbi1 = cf;
+        // if ((k == 2*(k/2)) && (k != 0)) {
+        //  csk0 = csk0+4.0*cf/static_cast<RealType>(k);
+        //}
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi1 = cbi1 * cs0;
+    } else {  // Using asymptotic expansion (6.2.1) for |z|>joint_val
+              // (default:25)
+      CmplxType ca = Kokkos::exp(z1) / Kokkos::sqrt(2.0 * pi * z1);
+      cbi1         = CmplxType(1.0, 0.0);
+      CmplxType zr = 1.0 / z1;
+      for (int k = 1; k <= 12; k++) {
+        cbi1 = cbi1 + b[k - 1] * Kokkos::pow(zr, 1.0 * k);
+      }
+      cbi1 = ca * cbi1;
+    }
+    if (z.real() < 0.0) {  // Apply (6.4.4)
+      cbi1 = -cbi1;
+    }
+  }
+  return cbi1;
+}
+
+//! Compute modified Bessel function K1(z) of the second kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_k1(const CmplxType& z,
+                                               const RealType& joint_val = 9,
+                                               const IntType& bw_start   = 30) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbk1      --- K1(z)
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cbk0, cbi0, cbk1, cbi1;
+  const RealType pi = M_PI;
+  const RealType el = 0.57721566490153286060651209008240;
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbk1 = CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:9)
+      CmplxType cbs  = CmplxType(0.0, 0.0);
+      CmplxType csk0 = CmplxType(0.0, 0.0);
+      CmplxType cf0  = CmplxType(0.0, 0.0);
+      CmplxType cf1  = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 30)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 1) cbi1 = cf;
+        if (k == 0) cbi0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          csk0 = csk0 + 4.0 * cf / static_cast<RealType>(k);
+        }
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi0 = cbi0 * cs0;
+      cbi1 = cbi1 * cs0;
+      cbk0 = -(Kokkos::log(0.5 * z1) + el) * cbi0 + cs0 * csk0;
+      cbk1 = (1.0 / z1 - cbi1 * cbk0) / cbi0;
+    } else {  // Using asymptotic expansion (6.2.2) for |z|>joint_val
+              // (default:9)
+      CmplxType ca0  = Kokkos::sqrt(pi / (2.0 * z1)) * Kokkos::exp(-z1);
+      CmplxType cbkl = CmplxType(1.0, 0.0);
+      CmplxType cr   = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 30; k++) {
+        cr   = 0.125 * cr * (4.0 - pow(2.0 * k - 1.0, 2.0)) / (k * z1);
+        cbkl = cbkl + cr;
+      }
+      cbk1 = ca0 * cbkl;
+    }
+    if (z.real() < 0.0) {  // Apply (6.4.4)
+      if (z.imag() < 0.0)
+        cbk1 = -cbk1 - ci * pi * cyl_bessel_i1<CmplxType, RealType, IntType>(z);
+      if (z.imag() >= 0.0)
+        cbk1 = -cbk1 + ci * pi * cyl_bessel_i1<CmplxType, RealType, IntType>(z);
+    }
+  }
+  return cbk1;
+}
+
+//! Compute Hankel function H10(z) of the first kind of order zero
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h10(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CH12N in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch10, cbk0, cbj0, cby0;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch10 = CmplxType(1.0, -inf);
+  } else if (z.imag() <= 0.0) {
+    cbj0 = cyl_bessel_j0<CmplxType, RealType, int>(z);
+    cby0 = cyl_bessel_y0<CmplxType, RealType, int>(z);
+    ch10 = cbj0 + ci * cby0;
+  } else {  //(z.imag() > 0.0)
+    cbk0 = cyl_bessel_k0<CmplxType, RealType, int>(-ci * z, 18.0, 70);
+    ch10 = 2.0 / (pi * ci) * cbk0;
+  }
+
+  return ch10;
+}
+
+//! Compute Hankel function H11(z) of the first kind of order one
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h11(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CH12N in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch11, cbk1, cbj1, cby1;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch11 = CmplxType(0.0, -inf);
+  } else if (z.imag() <= 0.0) {
+    cbj1 = cyl_bessel_j1<CmplxType, RealType, int>(z);
+    cby1 = cyl_bessel_y1<CmplxType, RealType, int>(z);
+    ch11 = cbj1 + ci * cby1;
+  } else {  //(z.imag() > 0.0)
+    cbk1 = cyl_bessel_k1<CmplxType, RealType, int>(-ci * z, 18.0, 70);
+    ch11 = -2.0 / pi * cbk1;
+  }
+
+  return ch11;
+}
+
+//! Compute Hankel function H20(z) of the second kind of order zero
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h20(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CH12N in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch20, cbk0, cbj0, cby0;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch20 = CmplxType(1.0, inf);
+  } else if (z.imag() >= 0.0) {
+    cbj0 = cyl_bessel_j0<CmplxType, RealType, int>(z);
+    cby0 = cyl_bessel_y0<CmplxType, RealType, int>(z);
+    ch20 = cbj0 - ci * cby0;
+  } else {  //(z.imag() < 0.0)
+    cbk0 = cyl_bessel_k0<CmplxType, RealType, int>(ci * z, 18.0, 70);
+    ch20 = 2.0 / pi * ci * cbk0;
+  }
+
+  return ch20;
+}
+
+//! Compute Hankel function H21(z) of the second kind of order one
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h21(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CH12N in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch21, cbk1, cbj1, cby1;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch21 = CmplxType(0.0, inf);
+  } else if (z.imag() >= 0.0) {
+    cbj1 = cyl_bessel_j1<CmplxType, RealType, int>(z);
+    cby1 = cyl_bessel_y1<CmplxType, RealType, int>(z);
+    ch21 = cbj1 - ci * cby1;
+  } else {  //(z.imag() < 0.0)
+    cbk1 = cyl_bessel_k1<CmplxType, RealType, int>(ci * z, 18.0, 70);
+    ch21 = -2.0 / pi * cbk1;
+  }
+
+  return ch21;
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
index 2cafac1aea..c814e5a22a 100644
--- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -524,7 +524,7 @@ class MemoryPool {
     // Fast query clock register 'tic' to pseudo-randomize
     // the guess for which block within a superblock should
     // be claimed.  If not available then a search occurs.
-#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GEN)
+#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)
     const uint32_t block_id_hint = alloc_size;
 #else
     const uint32_t block_id_hint =
@@ -585,19 +585,6 @@ class MemoryPool {
               (uint64_t(sb_id) << m_sb_size_lg2)       // superblock memory
               + (uint64_t(result.first) << size_lg2);  // block memory
 
-#if 0
-  printf( "  MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
-        , (uintptr_t)m_sb_state_array
-        , (uintptr_t)p
-        , alloc_size
-        , sb_id
-        , sb_state 
-        , (1u << size_lg2)
-        , (1u << count_lg2)
-        , result.first 
-        , result.second );
-#endif
-
           break;  // Success
         }
       }
@@ -740,7 +727,8 @@ class MemoryPool {
 
     // Determine which superblock and block
     const ptrdiff_t d =
-        ((char *)p) - ((char *)(m_sb_state_array + m_data_offset));
+        static_cast<char *>(p) -
+        reinterpret_cast<char *>(m_sb_state_array + m_data_offset);
 
     // Verify contained within the memory pool's superblocks:
     const int ok_contains =
@@ -772,29 +760,10 @@ class MemoryPool {
         const int result = CB::release(sb_state_array, bit, block_state);
 
         ok_dealloc_once = 0 <= result;
-
-#if 0
-  printf( "  MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
-        , (uintptr_t)m_sb_state_array
-        , (uintptr_t)p
-        , sb_id
-        , (1u << block_size_lg2)
-        , (1u << (m_sb_size_lg2 - block_size_lg2))
-        , bit
-        , result );
-#endif
       }
     }
 
     if (!ok_contains || !ok_block_aligned || !ok_dealloc_once) {
-#if 0
-  printf( "  MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
-        , (uintptr_t)m_sb_state_array
-        , (uintptr_t)p
-        , int(ok_contains)
-        , int(ok_block_aligned)
-        , int(ok_dealloc_once) );
-#endif
       Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
     }
   }
diff --git a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
index f23442b793..e3cee93e25 100644
--- a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
@@ -46,7 +46,6 @@
 #define KOKKOS_MEMORYTRAITS_HPP
 
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 //----------------------------------------------------------------------------
 
@@ -119,6 +118,15 @@ enum : unsigned {
   MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
 };
 
+// ------------------------------------------------------------------ //
+//  this identifies the default memory trait
+//
+template <typename Tp>
+struct is_default_memory_trait : std::false_type {};
+
+template <>
+struct is_default_memory_trait<Kokkos::MemoryTraits<0>> : std::true_type {};
+
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
index b9380cbe02..1999d46f3c 100644
--- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
@@ -56,11 +56,11 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 // clang-format off
-template <class> struct infinity_helper;
+template <class> struct infinity_helper {};
 template <> struct infinity_helper<float> { static constexpr float value = HUGE_VALF; };
 template <> struct infinity_helper<double> { static constexpr double value = HUGE_VAL; };
 template <> struct infinity_helper<long double> { static constexpr long double value = HUGE_VALL; };
-template <class> struct finite_min_helper;
+template <class> struct finite_min_helper {};
 template <> struct finite_min_helper<bool> { static constexpr bool value = false; };
 template <> struct finite_min_helper<char> { static constexpr char value = CHAR_MIN; };
 template <> struct finite_min_helper<signed char> { static constexpr signed char value = SCHAR_MIN; };
@@ -76,7 +76,7 @@ template <> struct finite_min_helper<unsigned long long int> { static constexpr
 template <> struct finite_min_helper<float> { static constexpr float value = -FLT_MAX; };
 template <> struct finite_min_helper<double> { static constexpr double value = -DBL_MAX; };
 template <> struct finite_min_helper<long double> { static constexpr long double value = -LDBL_MAX; };
-template <class> struct finite_max_helper;
+template <class> struct finite_max_helper {};
 template <> struct finite_max_helper<bool> { static constexpr bool value = true; };
 template <> struct finite_max_helper<char> { static constexpr char value = CHAR_MAX; };
 template <> struct finite_max_helper<signed char> { static constexpr signed char value = SCHAR_MAX; };
@@ -92,7 +92,7 @@ template <> struct finite_max_helper<unsigned long long int> { static constexpr
 template <> struct finite_max_helper<float> { static constexpr float value = FLT_MAX; };
 template <> struct finite_max_helper<double> { static constexpr double value = DBL_MAX; };
 template <> struct finite_max_helper<long double> { static constexpr long double value = LDBL_MAX; };
-template <class> struct epsilon_helper;
+template <class> struct epsilon_helper {};
 namespace{
   // FIXME workaround for LDL_EPSILON with XL
   template<typename T>
@@ -115,15 +115,15 @@ template <> struct epsilon_helper<long double> {
   static constexpr long double value = LDBL_EPSILON;
 #endif
 };
-template <class> struct round_error_helper;
+template <class> struct round_error_helper {};
 template <> struct round_error_helper<float> { static constexpr float value = 0.5F; };
 template <> struct round_error_helper<double> { static constexpr double value = 0.5; };
 template <> struct round_error_helper<long double> { static constexpr long double value = 0.5L; };
-template <class> struct norm_min_helper;
+template <class> struct norm_min_helper {};
 template <> struct norm_min_helper<float> { static constexpr float value = FLT_MIN; };
 template <> struct norm_min_helper<double> { static constexpr double value = DBL_MIN; };
 template <> struct norm_min_helper<long double> { static constexpr long double value = LDBL_MIN; };
-template <class> struct digits_helper;
+template <class> struct digits_helper {};
 template <> struct digits_helper<bool> { static constexpr int value = 1; };
 template <> struct digits_helper<char> { static constexpr int value = CHAR_BIT - std::is_signed<char>::value; };
 template <> struct digits_helper<signed char> { static constexpr int value = CHAR_BIT - 1; };
@@ -139,11 +139,13 @@ template <> struct digits_helper<unsigned long long int> { static constexpr int
 template <> struct digits_helper<float> { static constexpr int value = FLT_MANT_DIG; };
 template <> struct digits_helper<double> { static constexpr int value = DBL_MANT_DIG; };
 template <> struct digits_helper<long double> { static constexpr int value = LDBL_MANT_DIG; };
-template <class> struct digits10_helper;
+template <class> struct digits10_helper {};
 template <> struct digits10_helper<bool> { static constexpr int value = 0; };
-constexpr double log10_2 = 2.41;
+// The fraction 643/2136 approximates log10(2) to 7 significant digits.
+// Workaround GCC compiler bug with -frounding-math that prevented the
+// floating-point expression to be evaluated at compile time.
 #define DIGITS10_HELPER_INTEGRAL(TYPE) \
-template <> struct digits10_helper<TYPE> { static constexpr int value = digits_helper<TYPE>::value * log10_2; };
+template <> struct digits10_helper<TYPE> { static constexpr int value = digits_helper<TYPE>::value * 643L / 2136; };
 DIGITS10_HELPER_INTEGRAL(char)
 DIGITS10_HELPER_INTEGRAL(signed char)
 DIGITS10_HELPER_INTEGRAL(unsigned char)
@@ -159,15 +161,29 @@ DIGITS10_HELPER_INTEGRAL(unsigned long long int)
 template <> struct digits10_helper<float> { static constexpr int value = FLT_DIG; };
 template <> struct digits10_helper<double> { static constexpr int value = DBL_DIG; };
 template <> struct digits10_helper<long double> { static constexpr int value = LDBL_DIG; };
-template <class> struct max_digits10_helper;
-// FIXME not sure why were not defined in my <cfloat>
-//template <> struct max_digits10_helper<float> { static constexpr int value = FLT_DECIMAL_DIG; };
-//template <> struct max_digits10_helper<double> { static constexpr int value = DBL_DECIMAL_DIG; };
-//template <> struct max_digits10_helper<long double> { static constexpr int value = LDBL_DECIMAL_DIG; };
-template <> struct max_digits10_helper<float> { static constexpr int value = 9; };
-template <> struct max_digits10_helper<double> { static constexpr int value = 17; };
-template <> struct max_digits10_helper<long double> { static constexpr int value = 21; };
-template <class> struct radix_helper;
+template <class> struct max_digits10_helper {};
+// Approximate ceil(digits<T>::value * log10(2) + 1)
+#define MAX_DIGITS10_HELPER(TYPE) \
+template <> struct max_digits10_helper<TYPE> { static constexpr int value = (digits_helper<TYPE>::value * 643L + 2135) / 2136 + 1; };
+#ifdef FLT_DECIMAL_DIG
+template <> struct max_digits10_helper<float> { static constexpr int value = FLT_DECIMAL_DIG; };
+#else
+MAX_DIGITS10_HELPER(float)
+#endif
+#ifdef DBL_DECIMAL_DIG
+template <> struct max_digits10_helper<double> { static constexpr int value = DBL_DECIMAL_DIG; };
+#else
+MAX_DIGITS10_HELPER(double)
+#endif
+#ifdef DECIMAL_DIG
+template <> struct max_digits10_helper<long double> { static constexpr int value = DECIMAL_DIG; };
+#elif LDBL_DECIMAL_DIG
+template <> struct max_digits10_helper<long double> { static constexpr int value = LDBL_DECIMAL_DIG; };
+#else
+MAX_DIGITS10_HELPER(long double)
+#endif
+#undef MAX_DIGITS10_HELPER
+template <class> struct radix_helper {};
 template <> struct radix_helper<bool> { static constexpr int value = 2; };
 template <> struct radix_helper<char> { static constexpr int value = 2; };
 template <> struct radix_helper<signed char> { static constexpr int value = 2; };
@@ -183,19 +199,19 @@ template <> struct radix_helper<unsigned long long int> { static constexpr int v
 template <> struct radix_helper<float> { static constexpr int value = FLT_RADIX; };
 template <> struct radix_helper<double> { static constexpr int value = FLT_RADIX; };
 template <> struct radix_helper<long double> { static constexpr int value = FLT_RADIX; };
-template <class> struct min_exponent_helper;
+template <class> struct min_exponent_helper {};
 template <> struct min_exponent_helper<float> { static constexpr int value = FLT_MIN_EXP; };
 template <> struct min_exponent_helper<double> { static constexpr int value = DBL_MIN_EXP; };
 template <> struct min_exponent_helper<long double> { static constexpr int value = LDBL_MIN_EXP; };
-template <class> struct min_exponent10_helper;
+template <class> struct min_exponent10_helper {};
 template <> struct min_exponent10_helper<float> { static constexpr int value = FLT_MIN_10_EXP; };
 template <> struct min_exponent10_helper<double> { static constexpr int value = DBL_MIN_10_EXP; };
 template <> struct min_exponent10_helper<long double> { static constexpr int value = LDBL_MIN_10_EXP; };
-template <class> struct max_exponent_helper;
+template <class> struct max_exponent_helper {};
 template <> struct max_exponent_helper<float> { static constexpr int value = FLT_MAX_EXP; };
 template <> struct max_exponent_helper<double> { static constexpr int value = DBL_MAX_EXP; };
 template <> struct max_exponent_helper<long double> { static constexpr int value = LDBL_MAX_EXP; };
-template <class> struct max_exponent10_helper;
+template <class> struct max_exponent10_helper{};
 template <> struct max_exponent10_helper<float> { static constexpr int value = FLT_MAX_10_EXP; };
 template <> struct max_exponent10_helper<double> { static constexpr int value = DBL_MAX_10_EXP; };
 template <> struct max_exponent10_helper<long double> { static constexpr int value = LDBL_MAX_10_EXP; };
diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
index eedba38a84..8f12eceb27 100644
--- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -62,7 +62,6 @@
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 
@@ -105,9 +104,11 @@ class OpenMP {
   /// \brief Wait until all dispatched functors complete on the given instance
   ///
   ///  This is a no-op on OpenMP
-  static void impl_static_fence(OpenMP const& = OpenMP()) noexcept;
+  static void impl_static_fence(OpenMP const&           = OpenMP(),
+                                const std::string& name = "") noexcept;
 
   void fence() const;
+  void fence(const std::string& name) const;
 
   /// \brief Does the given instance return immediately after launching
   /// a parallel algorithm
@@ -167,7 +168,7 @@ class OpenMP {
   static int impl_get_current_max_threads() noexcept;
 
   static constexpr const char* name() noexcept { return "OpenMP"; }
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept { return 1; }
 };
 
 namespace Tools {
@@ -188,6 +189,7 @@ class OpenMPSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp b/lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp
index 2a57a43e63..f394f32408 100644
--- a/lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp
@@ -56,9 +56,8 @@
 #include <Kokkos_OpenMPTargetSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_Parallel.hpp>
-#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
@@ -92,7 +91,10 @@ class OpenMPTarget {
   inline static bool in_parallel() { return omp_in_parallel(); }
 
   static void fence();
+  static void fence(const std::string&);
 
+  static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
 
@@ -115,7 +117,7 @@ class OpenMPTarget {
   }
 
   OpenMPTarget();
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept;
 
  private:
   Impl::OpenMPTargetInternal* m_space_instance;
@@ -141,6 +143,7 @@ class OpenMPTargetSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
index 58d723ac11..c1d338331f 100644
--- a/lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
@@ -89,6 +89,41 @@ namespace Impl {
 }  // namespace Impl
 }  // namespace Kokkos
 
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------
+
+template <>
+struct MemorySpaceAccess<Kokkos::HostSpace,
+                         Kokkos::Experimental::OpenMPTargetSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+//----------------------------------------
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                         Kokkos::HostSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+//----------------------------------------
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                         Kokkos::Experimental::OpenMPTargetSpace> {
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = false };
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
 namespace Kokkos {
 namespace Experimental {
 
@@ -213,7 +248,10 @@ struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,
                                        omp_get_default_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<OpenMPTargetSpace, OpenMPTargetSpace>: fence "
+        "before "
+        "copy");
     if (n > 0)
       OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
                                        omp_get_default_device(),
@@ -231,7 +269,9 @@ struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace, HostSpace,
                                        omp_get_initial_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<OpenMPTargetSpace, HostSpace>: fence before "
+        "copy");
     if (n > 0)
       OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
                                        omp_get_default_device(),
@@ -249,7 +289,9 @@ struct DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace,
                                        omp_get_default_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, OpenMPTargetSpace>: fence before "
+        "copy");
     if (n > 0)
       OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
                                        omp_get_initial_device(),
diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp
index 85d1dad454..25ebe26155 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp
@@ -48,23 +48,19 @@
 #ifndef KOKKOS_PARALLEL_HPP
 #define KOKKOS_PARALLEL_HPP
 
-#include <cstddef>
 #include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_View.hpp>
+#include <Kokkos_DetectionIdiom.hpp>
 #include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_View.hpp>
 
 #include <impl/Kokkos_Tools.hpp>
-#include <type_traits>
-#include <typeinfo>
-
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
-#ifdef KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-#include <iostream>
-#endif
+#include <cstddef>
+#include <type_traits>
+#include <typeinfo>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -72,34 +68,11 @@
 namespace Kokkos {
 namespace Impl {
 
-template <class T, class = void>
-struct is_detected_execution_space : std::false_type {
-  using type = not_a_type;
-};
+template <class T>
+using execution_space_t = typename T::execution_space;
 
 template <class T>
-struct is_detected_execution_space<T, void_t<typename T::execution_space>>
-    : std::true_type {
-  using type = typename T::execution_space;
-};
-
-template <class T>
-using detected_execution_space_t =
-    typename is_detected_execution_space<T>::type;
-
-template <class T, class = void>
-struct is_detected_device_type : std::false_type {
-  using type = not_a_type;
-};
-
-template <class T>
-struct is_detected_device_type<T, void_t<typename T::device_type>>
-    : std::true_type {
-  using type = typename T::device_type;
-};
-
-template <class T>
-using detected_device_type_t = typename is_detected_device_type<T>::type;
+using device_type_t = typename T::device_type;
 
 //----------------------------------------------------------------------------
 /** \brief  Given a Functor and Execution Policy query an execution space.
@@ -112,16 +85,14 @@ using detected_device_type_t = typename is_detected_device_type<T>::type;
 
 template <class Functor, class Policy>
 struct FunctorPolicyExecutionSpace {
-  using execution_space = std::conditional_t<
-      is_detected_execution_space<Policy>::value,
-      detected_execution_space_t<Policy>,
-      std::conditional_t<
-          is_detected_execution_space<Functor>::value,
-          detected_execution_space_t<Functor>,
+  using execution_space = detected_or_t<
+      detected_or_t<
           std::conditional_t<
-              is_detected_device_type<Functor>::value,
-              detected_execution_space_t<detected_device_type_t<Functor>>,
-              Kokkos::DefaultExecutionSpace>>>;
+              is_detected<device_type_t, Functor>::value,
+              detected_t<execution_space_t, detected_t<device_type_t, Functor>>,
+              Kokkos::DefaultExecutionSpace>,
+          execution_space_t, Functor>,
+      execution_space_t, Policy>;
 };
 
 }  // namespace Impl
@@ -158,8 +129,7 @@ inline void parallel_for(
     const ExecPolicy& policy, const FunctorType& functor,
     const std::string& str = "",
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<ExecPolicy>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<ExecPolicy>::value>::type* = nullptr) {
   uint64_t kpID = 0;
 
   ExecPolicy inner_policy = policy;
@@ -200,18 +170,7 @@ inline void parallel_for(const size_t work_count, const FunctorType& functor,
 template <class ExecPolicy, class FunctorType>
 inline void parallel_for(const std::string& str, const ExecPolicy& policy,
                          const FunctorType& functor) {
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_for kernel: " << str << std::endl;
-#endif
-
   ::Kokkos::parallel_for(policy, functor, str);
-
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End   parallel_for kernel: " << str << std::endl;
-#endif
-  (void)str;
 }
 
 }  // namespace Kokkos
@@ -255,9 +214,12 @@ namespace Kokkos {
 ///   // operator() or join().
 ///   using value_type = PodType;
 ///
-///   void operator () (const ExecPolicy::member_type & i, value_type& update,
-///   const bool final_pass) const; void init (value_type& update) const; void
-///   join (volatile value_type& update, volatile const value_type& input) const
+///   void operator () (const ExecPolicy::member_type & i,
+///                     value_type& update,
+///                     const bool final_pass) const;
+///   void init (value_type& update) const;
+///   void join (volatile value_type& update,
+//               volatile const value_type& input) const
 /// };
 /// \endcode
 ///
@@ -389,8 +351,7 @@ inline void parallel_scan(
     const ExecutionPolicy& policy, const FunctorType& functor,
     const std::string& str = "",
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<ExecutionPolicy>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<ExecutionPolicy>::value>::type* = nullptr) {
   uint64_t kpID                = 0;
   ExecutionPolicy inner_policy = policy;
   Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
@@ -430,18 +391,7 @@ inline void parallel_scan(const size_t work_count, const FunctorType& functor,
 template <class ExecutionPolicy, class FunctorType>
 inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
                           const FunctorType& functor) {
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
-#endif
-
   ::Kokkos::parallel_scan(policy, functor, str);
-
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl;
-#endif
-  (void)str;
 }
 
 template <class ExecutionPolicy, class FunctorType, class ReturnType>
@@ -449,8 +399,7 @@ inline void parallel_scan(
     const ExecutionPolicy& policy, const FunctorType& functor,
     ReturnType& return_value, const std::string& str = "",
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<ExecutionPolicy>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<ExecutionPolicy>::value>::type* = nullptr) {
   uint64_t kpID                = 0;
   ExecutionPolicy inner_policy = policy;
   Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
@@ -464,7 +413,8 @@ inline void parallel_scan(
 
   Kokkos::Tools::Impl::end_parallel_scan(inner_policy, functor, str, kpID);
 
-  policy.space().fence();
+  policy.space().fence(
+      "Kokkos::parallel_scan: fence due to result being a value, not a view");
 }
 
 template <class FunctorType, class ReturnType>
@@ -491,25 +441,15 @@ inline void parallel_scan(const size_t work_count, const FunctorType& functor,
 
   Kokkos::Tools::Impl::end_parallel_scan(execution_policy, functor, str, kpID);
 
-  execution_space().fence();
+  execution_space().fence(
+      "Kokkos::parallel_scan: fence after scan with return value");
 }
 
 template <class ExecutionPolicy, class FunctorType, class ReturnType>
 inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
                           const FunctorType& functor,
                           ReturnType& return_value) {
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
-#endif
-
   ::Kokkos::parallel_scan(policy, functor, return_value, str);
-
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl;
-#endif
-  (void)str;
 }
 
 }  // namespace Kokkos
diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index 96242f99b0..bc613cea62 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -811,7 +811,7 @@ struct ParallelReducePolicyType;
 template <class PolicyType, class FunctorType>
 struct ParallelReducePolicyType<
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type,
+        Kokkos::is_execution_policy<PolicyType>::value>::type,
     PolicyType, FunctorType> {
   using policy_type = PolicyType;
   static PolicyType policy(const PolicyType& policy_) { return policy_; }
@@ -948,9 +948,10 @@ parallel_reduce_needs_fence(ExecutionSpace const&, ViewLike const&) {
 template <class ExecutionSpace, class... Args>
 struct ParallelReduceFence {
   template <class... ArgsDeduced>
-  static void fence(const ExecutionSpace& ex, ArgsDeduced&&... args) {
+  static void fence(const ExecutionSpace& ex, const std::string& name,
+                    ArgsDeduced&&... args) {
     if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced &&) args...)) {
-      ex.fence();
+      ex.fence(name);
     }
   }
 };
@@ -974,7 +975,6 @@ struct ParallelReduceFence {
  *    void join( volatile       <podType> & update ,
  *               volatile const <podType> & input ) const ;
  *
- *    using has_final = true_type;
  *    void final( <podType> & update ) const ;
  *  };
  * \endcode
@@ -991,7 +991,6 @@ struct ParallelReduceFence {
  *    void join( volatile       <podType> update[] ,
  *               volatile const <podType> input[] ) const ;
  *
- *    using has_final = true_type;
  *    void final( <podType> update[] ) const ;
  *  };
  * \endcode
@@ -1001,24 +1000,30 @@ struct ParallelReduceFence {
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const std::string& label, const PolicyType& policy,
                 const FunctorType& functor, ReturnType& return_value) {
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       label, policy, functor, return_value);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const PolicyType& policy, const FunctorType& functor,
                 ReturnType& return_value) {
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       "", policy, functor, return_value);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1030,7 +1035,10 @@ inline void parallel_reduce(const size_t& policy, const FunctorType& functor,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       "", policy_type(0, policy), functor, return_value);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1043,33 +1051,42 @@ inline void parallel_reduce(const std::string& label, const size_t& policy,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       label, policy_type(0, policy), functor, return_value);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 // ReturnValue as View or Reducer: take by copy to allow for inline construction
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const std::string& label, const PolicyType& policy,
                 const FunctorType& functor, const ReturnType& return_value) {
   ReturnType return_value_impl = return_value;
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       label, policy, functor, return_value_impl);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const PolicyType& policy, const FunctorType& functor,
                 const ReturnType& return_value) {
   ReturnType return_value_impl = return_value;
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       "", policy, functor, return_value_impl);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1082,7 +1099,10 @@ inline void parallel_reduce(const size_t& policy, const FunctorType& functor,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       "", policy_type(0, policy), functor, return_value_impl);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1096,7 +1116,10 @@ inline void parallel_reduce(const std::string& label, const size_t& policy,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       label, policy_type(0, policy), functor, return_value_impl);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 // No Return Argument
@@ -1106,8 +1129,7 @@ inline void parallel_reduce(
     const std::string& label, const PolicyType& policy,
     const FunctorType& functor,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<PolicyType>::value>::type* = nullptr) {
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
   using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
                                         typename ValueTraits::value_type,
@@ -1131,8 +1153,7 @@ template <class PolicyType, class FunctorType>
 inline void parallel_reduce(
     const PolicyType& policy, const FunctorType& functor,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<PolicyType>::value>::type* = nullptr) {
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
   using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
                                         typename ValueTraits::value_type,
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp b/lib/kokkos/core/src/Kokkos_Rank.hpp
similarity index 71%
rename from lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp
rename to lib/kokkos/core/src/Kokkos_Rank.hpp
index 26dc9b0e00..3603e28608 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp
+++ b/lib/kokkos/core/src/Kokkos_Rank.hpp
@@ -42,5 +42,30 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_c.hpp>
+#ifndef KOKKOS_KOKKOS_RANK_HPP
+#define KOKKOS_KOKKOS_RANK_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Layout.hpp>  // Iterate
+
+namespace Kokkos {
+
+// Iteration Pattern
+template <unsigned N, Iterate OuterDir = Iterate::Default,
+          Iterate InnerDir = Iterate::Default>
+struct Rank {
+  static_assert(N != 0u, "Kokkos Error: rank 0 undefined");
+  static_assert(N != 1u,
+                "Kokkos Error: rank 1 is not a multi-dimensional range");
+  static_assert(N < 7u, "Kokkos Error: Unsupported rank...");
+
+  using iteration_pattern = Rank<N, OuterDir, InnerDir>;
+
+  static constexpr int rank                = N;
+  static constexpr Iterate outer_direction = OuterDir;
+  static constexpr Iterate inner_direction = InnerDir;
+};
+
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_RANK_HPP
diff --git a/lib/kokkos/core/src/Kokkos_SYCL.hpp b/lib/kokkos/core/src/Kokkos_SYCL.hpp
index 8ee76b4386..02095ff7b3 100644
--- a/lib/kokkos/core/src/Kokkos_SYCL.hpp
+++ b/lib/kokkos/core/src/Kokkos_SYCL.hpp
@@ -83,7 +83,9 @@ class SYCL {
   SYCL();
   explicit SYCL(const sycl::queue&);
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept {
+    return m_space_instance->impl_get_instance_id();
+  }
 
   sycl::context sycl_context() const noexcept {
     return m_space_instance->m_queue->get_context();
@@ -110,7 +112,9 @@ class SYCL {
 
   /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
   static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
   void fence() const;
+  void fence(const std::string&) const;
 
   /// \brief Print configuration information to the given output stream.
   void print_configuration(std::ostream&, const bool detail = false);
@@ -165,6 +169,7 @@ class SYCLSpaceInitializer : public Kokkos::Impl::ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
@@ -181,6 +186,41 @@ struct DeviceTypeTraits<Kokkos::Experimental::SYCL> {
 }  // namespace Experimental
 }  // namespace Tools
 
+namespace Experimental {
+template <class... Args>
+std::vector<SYCL> partition_space(const SYCL& sycl_space, Args...) {
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+
+  sycl::context context = sycl_space.sycl_context();
+  sycl::default_selector device_selector;
+  std::vector<SYCL> instances;
+  instances.reserve(sizeof...(Args));
+  for (unsigned int i = 0; i < sizeof...(Args); ++i)
+    instances.emplace_back(sycl::queue(context, device_selector));
+  return instances;
+}
+
+template <class T>
+std::vector<SYCL> partition_space(const SYCL& sycl_space,
+                                  std::vector<T>& weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  sycl::context context = sycl_space.sycl_context();
+  sycl::default_selector device_selector;
+  std::vector<SYCL> instances;
+  instances.reserve(weights.size());
+  for (unsigned int i = 0; i < weights.size(); ++i)
+    instances.emplace_back(sycl::queue(context, device_selector));
+  return instances;
+}
+}  // namespace Experimental
+
 }  // namespace Kokkos
 
 #endif
diff --git a/lib/kokkos/core/src/Kokkos_SYCL_Space.hpp b/lib/kokkos/core/src/Kokkos_SYCL_Space.hpp
index 392ab0e59a..15ef11024d 100644
--- a/lib/kokkos/core/src/Kokkos_SYCL_Space.hpp
+++ b/lib/kokkos/core/src/Kokkos_SYCL_Space.hpp
@@ -49,12 +49,19 @@
 
 #ifdef KOKKOS_ENABLE_SYCL
 #include <Kokkos_Concepts.hpp>
+#include <Kokkos_HostSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <SYCL/Kokkos_SYCL_Instance.hpp>
 #include <impl/Kokkos_SharedAlloc.hpp>
 #include <impl/Kokkos_Tools.hpp>
 
 namespace Kokkos {
+
+namespace Impl {
+template <typename T>
+struct is_sycl_type_space : public std::false_type {};
+}  // namespace Impl
+
 namespace Experimental {
 
 class SYCLDeviceUSMSpace {
@@ -118,9 +125,54 @@ class SYCLSharedUSMSpace {
  private:
   sycl::queue m_queue;
 };
+
+class SYCLHostUSMSpace {
+ public:
+  using execution_space = HostSpace::execution_space;
+  using memory_space    = SYCLHostUSMSpace;
+  using device_type     = Kokkos::Device<execution_space, memory_space>;
+  using size_type       = Impl::SYCLInternal::size_type;
+
+  SYCLHostUSMSpace();
+  explicit SYCLHostUSMSpace(sycl::queue queue);
+
+  void* allocate(const std::size_t arg_alloc_size) const;
+  void* allocate(const char* arg_label, const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
+
+  void deallocate(void* const arg_alloc_ptr,
+                  const std::size_t arg_alloc_size) const;
+  void deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                  const size_t arg_alloc_size,
+                  const size_t arg_logical_size = 0) const;
+
+ private:
+  template <class, class, class, class>
+  friend class LogicalMemorySpace;
+
+ public:
+  static constexpr const char* name() { return "SYCLHostUSM"; };
+
+ private:
+  sycl::queue m_queue;
+};
+
 }  // namespace Experimental
 
 namespace Impl {
+
+template <>
+struct is_sycl_type_space<Kokkos::Experimental::SYCLDeviceUSMSpace>
+    : public std::true_type {};
+
+template <>
+struct is_sycl_type_space<Kokkos::Experimental::SYCLSharedUSMSpace>
+    : public std::true_type {};
+
+template <>
+struct is_sycl_type_space<Kokkos::Experimental::SYCLHostUSMSpace>
+    : public std::true_type {};
+
 static_assert(Kokkos::Impl::MemorySpaceAccess<
                   Kokkos::Experimental::SYCLDeviceUSMSpace,
                   Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
@@ -131,6 +183,11 @@ static_assert(Kokkos::Impl::MemorySpaceAccess<
                   Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
               "");
 
+static_assert(Kokkos::Impl::MemorySpaceAccess<
+                  Kokkos::Experimental::SYCLDeviceUSMSpace,
+                  Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+              "");
+
 template <>
 struct MemorySpaceAccess<Kokkos::HostSpace,
                          Kokkos::Experimental::SYCLDeviceUSMSpace> {
@@ -148,6 +205,16 @@ struct MemorySpaceAccess<Kokkos::HostSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::HostSpace,
+                         Kokkos::Experimental::SYCLHostUSMSpace> {
+  // HostSpace::execution_space ==
+  // Experimental::SYCLHostUSMSpace::execution_space
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
 template <>
 struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
                          Kokkos::HostSpace> {
@@ -165,6 +232,18 @@ struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                         Kokkos::Experimental::SYCLHostUSMSpace> {
+  // Experimental::SYCLDeviceUSMSpace::execution_space !=
+  // Experimental::SYCLHostUSMSpace::execution_space
+  enum : bool { assignable = false };
+  enum : bool {
+    accessible = true
+  };  // Experimental::SYCLDeviceUSMSpace::execution_space
+  enum : bool { deepcopy = true };
+};
+
 //----------------------------------------
 // SYCLSharedUSMSpace::execution_space == SYCL
 // SYCLSharedUSMSpace accessible to both SYCL and Host
@@ -191,17 +270,46 @@ struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
 };
 
 template <>
-struct MemorySpaceAccess<
-    Kokkos::Experimental::SYCLDeviceUSMSpace,
-    Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> {
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::Experimental::SYCLHostUSMSpace> {
+  // Experimental::SYCLSharedUSMSpace::execution_space !=
+  // Experimental::SYCLHostUSMSpace::execution_space
   enum : bool { assignable = false };
-  enum : bool { accessible = true };
-  enum : bool { deepcopy = false };
+  enum : bool {
+    accessible = true
+  };  // Experimental::SYCLSharedUSMSpace::execution_space
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::HostSpace> {
+  enum : bool { assignable = false };  // Cannot access from SYCL
+  enum : bool {
+    accessible = true
+  };  // Experimental::SYCLHostUSMSpace::execution_space
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::Experimental::SYCLDeviceUSMSpace> {
+  enum : bool { assignable = false };  // Cannot access from Host
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::Experimental::SYCLSharedUSMSpace> {
+  enum : bool { assignable = false };  // different execution_space
+  enum : bool { accessible = true };   // same accessibility
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<
-    Kokkos::Experimental::SYCLSharedUSMSpace,
+    Kokkos::Experimental::SYCLDeviceUSMSpace,
     Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> {
   enum : bool { assignable = false };
   enum : bool { accessible = true };
@@ -276,6 +384,37 @@ class SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>
       const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 };
 
+template <>
+class SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>
+    : public SharedAllocationRecordCommon<
+          Kokkos::Experimental::SYCLHostUSMSpace> {
+ private:
+  friend class SharedAllocationRecordCommon<
+      Kokkos::Experimental::SYCLHostUSMSpace>;
+  using base_t =
+      SharedAllocationRecordCommon<Kokkos::Experimental::SYCLHostUSMSpace>;
+  using RecordBase = SharedAllocationRecord<void, void>;
+
+  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord(SharedAllocationRecord&&)      = delete;
+  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete;
+
+  static RecordBase s_root_record;
+
+  const Kokkos::Experimental::SYCLHostUSMSpace m_space;
+
+ protected:
+  ~SharedAllocationRecord();
+
+  SharedAllocationRecord() = default;
+
+  SharedAllocationRecord(
+      const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
+};
+
 }  // namespace Impl
 
 }  // namespace Kokkos
diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
index 2eebf5365e..bb740cfb86 100644
--- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -148,10 +148,10 @@ class ScratchMemorySpace {
                                             const IntType& size_L0,
                                             void* ptr_L1           = nullptr,
                                             const IntType& size_L1 = 0)
-      : m_iter_L0((char*)ptr_L0),
-        m_iter_L1((char*)ptr_L1),
-        m_end_L0((char*)ptr_L0 + size_L0),
-        m_end_L1((char*)ptr_L1 + size_L1),
+      : m_iter_L0(static_cast<char*>(ptr_L0)),
+        m_iter_L1(static_cast<char*>(ptr_L1)),
+        m_end_L0(static_cast<char*>(ptr_L0) + size_L0),
+        m_end_L1(static_cast<char*>(ptr_L1) + size_L1),
         m_multiplier(1),
         m_offset(0),
         m_default_level(0) {}
diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp
index 4d5bb2410b..9c8ae70721 100644
--- a/lib/kokkos/core/src/Kokkos_Serial.hpp
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@@ -53,6 +53,8 @@
 
 #include <cstddef>
 #include <iosfwd>
+#include <mutex>
+#include <thread>
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
@@ -60,12 +62,12 @@
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_HostThreadTeam.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_Tools.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
+#include <impl/Kokkos_HostSharedPtr.hpp>
 
 #include <KokkosExp_MDRangePolicy.hpp>
 
@@ -73,6 +75,32 @@
 
 namespace Kokkos {
 
+namespace Impl {
+class SerialInternal {
+ public:
+  SerialInternal() = default;
+
+  bool is_initialized();
+
+  void initialize();
+
+  void finalize();
+
+  static SerialInternal& singleton();
+
+  std::mutex m_thread_team_data_mutex;
+
+  // Resize thread team data scratch memory
+  void resize_thread_team_data(size_t pool_reduce_bytes,
+                               size_t team_reduce_bytes,
+                               size_t team_shared_bytes,
+                               size_t thread_local_bytes);
+
+  HostThreadTeamData m_thread_team_data;
+  bool m_is_initialized = false;
+};
+}  // namespace Impl
+
 /// \class Serial
 /// \brief Kokkos device for non-parallel execution
 ///
@@ -107,6 +135,8 @@ class Serial {
 
   //@}
 
+  Serial();
+
   /// \brief True if and only if this method is being called in a
   ///   thread-parallel function.
   ///
@@ -121,9 +151,26 @@ class Serial {
   /// return asynchronously, before the functor completes.  This
   /// method does not return until all dispatched functors on this
   /// device have completed.
-  static void impl_static_fence() {}
+  static void impl_static_fence() {
+    impl_static_fence(
+        "Kokkos::Serial::impl_static_fence: Unnamed Static Fence");
+  }
+  static void impl_static_fence(const std::string& name) {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Serial>(
+        name,
+        Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+            GlobalDeviceSynchronization,
+        []() {});  // TODO: correct device ID
+    Kokkos::memory_fence();
+  }
 
-  void fence() const {}
+  void fence() const { fence("Kokkos::Serial::fence: Unnamed Instance Fence"); }
+  void fence(const std::string& name) const {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Serial>(
+        name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1},
+        []() {});  // TODO: correct device ID
+    Kokkos::memory_fence();
+  }
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency() { return 1; }
@@ -153,9 +200,24 @@ class Serial {
     return impl_thread_pool_size(0);
   }
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept { return 1; }
 
   static const char* name();
+
+  Impl::SerialInternal* impl_internal_space_instance() const {
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+    return m_space_instance;
+#else
+    return m_space_instance.get();
+#endif
+  }
+
+ private:
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+  Impl::SerialInternal* m_space_instance;
+#else
+  Kokkos::Impl::HostSharedPtr<Impl::SerialInternal> m_space_instance;
+#endif
   //--------------------------------------------------------------------------
 };
 
@@ -177,6 +239,7 @@ class SerialSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
@@ -206,20 +269,6 @@ struct MemorySpaceAccess<Kokkos::Serial::memory_space,
 namespace Kokkos {
 namespace Impl {
 
-// Resize thread team data scratch memory
-void serial_resize_thread_team_data(size_t pool_reduce_bytes,
-                                    size_t team_reduce_bytes,
-                                    size_t team_shared_bytes,
-                                    size_t thread_local_bytes);
-
-HostThreadTeamData* serial_get_thread_team_data();
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-namespace Kokkos {
-namespace Impl {
-
 /*
  * < Kokkos::Serial , WorkArgTag >
  * < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial ,
@@ -510,13 +559,19 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
     pointer_type ptr =
-        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
 
     reference_type update =
         ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
@@ -606,13 +661,18 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
+    // Need to lock resize_thread_team_data
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
-    HostThreadTeamData& data = *serial_get_thread_team_data();
-
-    reference_type update =
-        ValueInit::init(m_functor, pointer_type(data.pool_reduce_local()));
+    reference_type update = ValueInit::init(
+        m_functor,
+        pointer_type(
+            internal_instance->m_thread_team_data.pool_reduce_local()));
 
     this->template exec<WorkTag>(update);
   }
@@ -667,13 +727,18 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
+    // Need to lock resize_thread_team_data
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
-    HostThreadTeamData& data = *serial_get_thread_team_data();
-
-    reference_type update =
-        ValueInit::init(m_functor, pointer_type(data.pool_reduce_local()));
+    reference_type update = ValueInit::init(
+        m_functor,
+        pointer_type(
+            internal_instance->m_thread_team_data.pool_reduce_local()));
 
     this->template exec<WorkTag>(update);
 
@@ -797,13 +862,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
     pointer_type ptr =
-        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
 
     reference_type update =
         ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
@@ -869,6 +940,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using Member = typename Policy::member_type;
 
   const FunctorType m_functor;
+  const Policy m_policy;
   const int m_league;
   const int m_shared;
 
@@ -896,16 +968,21 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const size_t team_shared_size  = m_shared;
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
-    HostThreadTeamData& data = *serial_get_thread_team_data();
-
-    this->template exec<typename Policy::work_tag>(data);
+    this->template exec<typename Policy::work_tag>(
+        internal_instance->m_thread_team_data);
   }
 
   ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
       : m_functor(arg_functor),
+        m_policy(arg_policy),
         m_league(arg_policy.league_size()),
         m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                  FunctorTeamShmemSize<FunctorType>::value(arg_functor, 1)) {}
@@ -941,6 +1018,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using reference_type = typename Analysis::reference_type;
 
   const FunctorType m_functor;
+  const Policy m_policy;
   const int m_league;
   const ReducerType m_reducer;
   pointer_type m_result_ptr;
@@ -973,18 +1051,24 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const size_t team_shared_size  = m_shared;
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
     pointer_type ptr =
-        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
 
     reference_type update =
         ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
 
-    this->template exec<WorkTag>(data, update);
+    this->template exec<WorkTag>(internal_instance->m_thread_team_data, update);
 
     Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
         ReducerConditional::select(m_functor, m_reducer), ptr);
@@ -998,6 +1082,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                                   !Kokkos::is_reducer_type<ReducerType>::value,
                               void*>::type = nullptr)
       : m_functor(arg_functor),
+        m_policy(arg_policy),
         m_league(arg_policy.league_size()),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
@@ -1016,6 +1101,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                         const ReducerType& reducer)
       : m_functor(arg_functor),
+        m_policy(arg_policy),
         m_league(arg_policy.league_size()),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()),
diff --git a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
index 91e079a0e7..9751fab460 100644
--- a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
@@ -43,5 +43,9 @@
 */
 
 // For backward compatibility:
+#include <Kokkos_Macros.hpp>
+
+KOKKOS_IMPL_WARNING(
+    "This file is deprecated. Use <Kokkos_TaskScheduler.hpp> instead.")
 
 #include <Kokkos_TaskScheduler.hpp>
diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
index 743273670c..17e78f5e81 100644
--- a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
+++ b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
@@ -55,7 +55,6 @@
 //----------------------------------------------------------------------------
 
 #include <Kokkos_MemoryPool.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <Kokkos_Future.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
@@ -372,7 +371,10 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
         task_base* const t = arg[i].m_task;
         if (nullptr != t) {
           // Increment reference count to track subsequent assignment.
-          Kokkos::atomic_increment(&(t->m_ref_count));
+          // This likely has to be SeqCst
+          Kokkos::Impl::desul_atomic_inc(&(t->m_ref_count),
+                                         Kokkos::Impl::MemoryOrderSeqCst(),
+                                         Kokkos::Impl::MemoryScopeDevice());
           if (q != static_cast<queue_type const*>(t->m_queue)) {
             Kokkos::abort(
                 "Kokkos when_all Futures must be in the same scheduler");
@@ -467,7 +469,10 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
           //  scheduler" );
           //}
           // Increment reference count to track subsequent assignment.
-          Kokkos::atomic_increment(&(arg_f.m_task->m_ref_count));
+          // This increment likely has to be SeqCst
+          Kokkos::Impl::desul_atomic_inc(&(arg_f.m_task->m_ref_count),
+                                         Kokkos::Impl::MemoryOrderSeqCst(),
+                                         Kokkos::Impl::MemoryScopeDevice());
           dep[i] = arg_f.m_task;
         }
       }
diff --git a/lib/kokkos/core/src/Kokkos_Threads.hpp b/lib/kokkos/core/src/Kokkos_Threads.hpp
index e827c2a2a1..da9bea9c23 100644
--- a/lib/kokkos/core/src/Kokkos_Threads.hpp
+++ b/lib/kokkos/core/src/Kokkos_Threads.hpp
@@ -57,7 +57,6 @@
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 
 /*--------------------------------------------------------------------------*/
@@ -65,6 +64,7 @@
 namespace Kokkos {
 namespace Impl {
 class ThreadsExec;
+enum class fence_is_static { yes, no };
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -108,8 +108,10 @@ class Threads {
   /// method does not return until all dispatched functors on this
   /// device have completed.
   static void impl_static_fence();
+  static void impl_static_fence(const std::string& name);
 
   void fence() const;
+  void fence(const std::string&) const;
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
@@ -167,7 +169,7 @@ class Threads {
     return impl_thread_pool_rank();
   }
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept { return 1; }
 
   static const char* name();
   //@}
@@ -192,6 +194,7 @@ class ThreadsSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_Tuners.hpp b/lib/kokkos/core/src/Kokkos_Tuners.hpp
index f7cc34cc11..52edd82052 100644
--- a/lib/kokkos/core/src/Kokkos_Tuners.hpp
+++ b/lib/kokkos/core/src/Kokkos_Tuners.hpp
@@ -306,7 +306,11 @@ class MultidimensionalSparseTuningProblem {
   static constexpr size_t max_space_dimension_size = MaxDimensionSize;
   static constexpr double tuning_min               = 0.0;
   static constexpr double tuning_max               = 0.999;
-  static constexpr double tuning_step = tuning_max / max_space_dimension_size;
+
+  // Not declared as static constexpr to work around the following compiler bug
+  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96862
+  // where a floating-point expression cannot be constexpr under -frounding-math
+  double tuning_step = tuning_max / max_space_dimension_size;
 
   using StoredProblemSpace =
       typename Impl::MapTypeConverter<ProblemSpaceInput>::type;
@@ -315,17 +319,45 @@ class MultidimensionalSparseTuningProblem {
 
   using ValueArray = std::array<Kokkos::Tools::Experimental::VariableValue,
                                 space_dimensionality>;
+  template <class Key, class Value>
+  using extended_map = std::map<Key, Value>;
+  template <typename Key>
+  using extended_problem =
+      MultidimensionalSparseTuningProblem<extended_map, MaxDimensionSize, Key,
+                                          ProblemSpaceInput>;
+  template <typename Key, typename Value>
+  using ExtendedProblemSpace =
+      typename Impl::MapTypeConverter<extended_map<Key, Value>>::type;
+
+  template <typename Key>
+  auto extend(const std::string& axis_name,
+              const std::vector<Key>& new_tuning_axis) const
+      -> extended_problem<Key> {
+    ExtendedProblemSpace<Key, ProblemSpaceInput> extended_space;
+    for (auto& key : new_tuning_axis) {
+      extended_space.add_root_value(key);
+      extended_space.add_sub_container(m_space);
+    }
+    std::vector<std::string> extended_names;
+    extended_names.reserve(m_variable_names.size() + 1);
+    extended_names.push_back(axis_name);
+    extended_names.insert(extended_names.end(), m_variable_names.begin(),
+                          m_variable_names.end());
+    return extended_problem<Key>(extended_space, extended_names);
+  }
 
  private:
   StoredProblemSpace m_space;
   std::array<size_t, space_dimensionality> variable_ids;
+  std::vector<std::string> m_variable_names;
   size_t context;
 
  public:
   MultidimensionalSparseTuningProblem() = default;
-  MultidimensionalSparseTuningProblem(ProblemSpaceInput space,
+
+  MultidimensionalSparseTuningProblem(StoredProblemSpace space,
                                       const std::vector<std::string>& names)
-      : m_space(HierarchyConstructor::build(space)) {
+      : m_space(std::move(space)), m_variable_names(names) {
     assert(names.size() == space_dimensionality);
     for (unsigned long x = 0; x < names.size(); ++x) {
       VariableInfo info;
@@ -340,6 +372,20 @@ class MultidimensionalSparseTuningProblem {
     }
   }
 
+  MultidimensionalSparseTuningProblem(ProblemSpaceInput space,
+                                      const std::vector<std::string>& names)
+      : MultidimensionalSparseTuningProblem(HierarchyConstructor::build(space),
+                                            names) {}
+
+  template <typename... Coordinates>
+  auto get_point(Coordinates... coordinates) {
+    using ArrayType = std::array<Kokkos::Tools::Experimental::VariableValue,
+                                 sizeof...(coordinates)>;
+    return Impl::get_point(
+        m_space, ArrayType({Kokkos::Tools::Experimental::make_variable_value(
+                     0, static_cast<double>(coordinates))...}));
+  }
+
   auto begin() {
     context = Kokkos::Tools::Experimental::get_new_context_id();
     ValueArray values;
@@ -349,12 +395,28 @@ class MultidimensionalSparseTuningProblem {
     }
     begin_context(context);
     request_output_values(context, space_dimensionality, values.data());
-    return get_point(m_space, values);
+    return Impl::get_point(m_space, values);
   }
 
   auto end() { end_context(context); }
 };
 
+template <typename Tuner>
+struct ExtendableTunerMixin {
+  template <typename Key>
+  auto combine(const std::string& axis_name,
+               const std::vector<Key>& new_axis) const {
+    const auto& sub_tuner = static_cast<const Tuner*>(this)->get_tuner();
+    return sub_tuner.extend(axis_name, new_axis);
+  }
+
+  template <typename... Coordinates>
+  auto get_point(Coordinates... coordinates) {
+    const auto& sub_tuner = static_cast<const Tuner*>(this)->get_tuner();
+    return sub_tuner.get_point(coordinates...);
+  }
+};
+
 template <size_t MaxDimensionSize = 100, template <class...> class Container,
           class... TemplateArguments>
 auto make_multidimensional_sparse_tuning_problem(
@@ -362,7 +424,8 @@ auto make_multidimensional_sparse_tuning_problem(
   return MultidimensionalSparseTuningProblem<Container, MaxDimensionSize,
                                              TemplateArguments...>(in, names);
 }
-class TeamSizeTuner {
+
+class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
  private:
   using SpaceDescription = std::map<int64_t, std::vector<int64_t>>;
   using TunerType = decltype(make_multidimensional_sparse_tuning_problem<20>(
@@ -481,7 +544,7 @@ class TeamSizeTuner {
     }
   }
 
- private:
+  TunerType get_tuner() const { return tuner; }
 };
 
 namespace Impl {
@@ -501,7 +564,7 @@ void fill_tile(std::map<T, Mapped>& cont, int tile_size) {
 }  // namespace Impl
 
 template <int MDRangeRank>
-struct MDRangeTuner {
+struct MDRangeTuner : public ExtendableTunerMixin<MDRangeTuner<MDRangeRank>> {
  private:
   static constexpr int rank       = MDRangeRank;
   static constexpr int max_slices = 15;
@@ -548,8 +611,45 @@ struct MDRangeTuner {
       tuner.end();
     }
   }
+
+  TunerType get_tuner() const { return tuner; }
 };
 
+template <class Choice>
+struct CategoricalTuner {
+  using choice_list = std::vector<Choice>;
+  choice_list choices;
+  size_t context;
+  size_t tuning_variable_id;
+  CategoricalTuner(std::string name, choice_list m_choices)
+      : choices(m_choices) {
+    std::vector<int64_t> indices;
+    for (typename decltype(choices)::size_type x = 0; x < choices.size(); ++x) {
+      indices.push_back(x);
+    }
+    VariableInfo info;
+    info.category      = StatisticalCategory::kokkos_value_categorical;
+    info.valueQuantity = CandidateValueType::kokkos_value_set;
+    info.type          = ValueType::kokkos_value_int64;
+    info.candidates    = make_candidate_set(indices.size(), indices.data());
+    tuning_variable_id = declare_output_type(name, info);
+  }
+  const Choice& begin() {
+    context = get_new_context_id();
+    begin_context(context);
+    VariableValue value = make_variable_value(tuning_variable_id, int64_t(0));
+    request_output_values(context, 1, &value);
+    return choices[value.value.int_value];
+  }
+  void end() { end_context(context); }
+};
+
+template <typename Choice>
+auto make_categorical_tuner(std::string name, std::vector<Choice> choices)
+    -> CategoricalTuner<Choice> {
+  return CategoricalTuner<Choice>(name, choices);
+}
+
 }  // namespace Experimental
 }  // namespace Tools
 }  // namespace Kokkos
diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp
index 1abe0a48df..b217cc4bc1 100644
--- a/lib/kokkos/core/src/Kokkos_View.hpp
+++ b/lib/kokkos/core/src/Kokkos_View.hpp
@@ -190,9 +190,9 @@ struct ViewTraits<void, void, Prop...> {
 };
 
 template <class ArrayLayout, class... Prop>
-struct ViewTraits<typename std::enable_if<
-                      Kokkos::Impl::is_array_layout<ArrayLayout>::value>::type,
-                  ArrayLayout, Prop...> {
+struct ViewTraits<
+    typename std::enable_if<Kokkos::is_array_layout<ArrayLayout>::value>::type,
+    ArrayLayout, Prop...> {
   // Specify layout, keep subsequent space and memory traits arguments
 
   using execution_space = typename ViewTraits<void, Prop...>::execution_space;
@@ -204,9 +204,8 @@ struct ViewTraits<typename std::enable_if<
 };
 
 template <class Space, class... Prop>
-struct ViewTraits<
-    typename std::enable_if<Kokkos::Impl::is_space<Space>::value>::type, Space,
-    Prop...> {
+struct ViewTraits<typename std::enable_if<Kokkos::is_space<Space>::value>::type,
+                  Space, Prop...> {
   // Specify Space, memory traits should be the only subsequent argument.
 
   static_assert(
@@ -230,8 +229,8 @@ struct ViewTraits<
 };
 
 template <class MemoryTraits, class... Prop>
-struct ViewTraits<typename std::enable_if<Kokkos::Impl::is_memory_traits<
-                      MemoryTraits>::value>::type,
+struct ViewTraits<typename std::enable_if<
+                      Kokkos::is_memory_traits<MemoryTraits>::value>::type,
                   MemoryTraits, Prop...> {
   // Specify memory trait, should not be any subsequent arguments
 
@@ -1543,7 +1542,8 @@ class View : public ViewTraits<DataType, Properties...> {
     // to avoid incomplete type errors from using Kokkos::Cuda directly.
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::View<...>::View: fence before allocating UVM");
     }
 #endif
     //------------------------------------------------------------
@@ -1555,7 +1555,8 @@ class View : public ViewTraits<DataType, Properties...> {
 #if defined(KOKKOS_ENABLE_CUDA)
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::View<...>::View: fence after allocating UVM");
     }
 #endif
     //------------------------------------------------------------
diff --git a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
index bdc8993c39..dbb557c137 100644
--- a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
@@ -213,7 +213,9 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
       using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
       const closure_type closure(*this, policy_type(0, m_queue.size()));
       closure.execute();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::WorkGraphPolicy::WorkGraphPolicy: fence after executing "
+          "graph init");
     }
 
     {  // execute-after counts
@@ -221,7 +223,9 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
       using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
       const closure_type closure(*this, policy_type(0, m_graph.entries.size()));
       closure.execute();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::WorkGraphPolicy::WorkGraphPolicy: fence after executing "
+          "graph count");
     }
 
     {  // Scheduling ready tasks
@@ -229,7 +233,9 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
       using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
       const closure_type closure(*this, policy_type(0, m_graph.numRows()));
       closure.execute();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::WorkGraphPolicy::WorkGraphPolicy: fence after executing "
+          "readied graph");
     }
   }
 };
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
index e530612a57..0d521479ee 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
@@ -447,7 +447,13 @@ OpenMP OpenMP::create_instance(...) { return OpenMP(); }
 
 int OpenMP::concurrency() { return Impl::g_openmp_hardware_max_threads; }
 
-void OpenMP::fence() const {}
+void OpenMP::fence() const {
+  fence("Kokkos::OpenMP::fence: Unnamed Instance Fence");
+}
+void OpenMP::fence(const std::string &name) const {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
+      name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {});
+}
 
 namespace Impl {
 
@@ -474,6 +480,9 @@ void OpenMPSpaceInitializer::finalize(const bool) {
 }
 
 void OpenMPSpaceInitializer::fence() { Kokkos::OpenMP::impl_static_fence(); }
+void OpenMPSpaceInitializer::fence(const std::string &name) {
+  Kokkos::OpenMP::impl_static_fence(OpenMP(), name);
+}
 
 void OpenMPSpaceInitializer::print_configuration(std::ostream &msg,
                                                  const bool detail) {
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
index 82f049ed13..1191e49cbe 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@@ -151,7 +151,14 @@ int OpenMP::impl_thread_pool_rank() noexcept {
 #endif
 }
 
-inline void OpenMP::impl_static_fence(OpenMP const& /*instance*/) noexcept {}
+inline void OpenMP::impl_static_fence(OpenMP const& /**instance*/,
+                                      const std::string& name) noexcept {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      []() {});
+}
 
 inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept {
   return false;
@@ -213,8 +220,9 @@ void OpenMP::partition_master(F const& f, int num_partitions,
 
 namespace Experimental {
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 template <>
-class MasterLock<OpenMP> {
+class KOKKOS_DEPRECATED MasterLock<OpenMP> {
  public:
   void lock() { omp_set_lock(&m_lock); }
   void unlock() { omp_unset_lock(&m_lock); }
@@ -231,6 +239,7 @@ class MasterLock<OpenMP> {
  private:
   omp_lock_t m_lock;
 };
+#endif
 
 template <>
 class UniqueToken<OpenMP, UniqueTokenScope::Instance> {
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
index 2a4a7b1d53..d9234e3419 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@@ -324,11 +324,6 @@ class TaskQueueSpecializationConstrained<
                 // count of 0 also. Otherwise, returns a task from another queue
                 // or `end` if one couldn't be popped
                 task = team_queue.attempt_to_steal_task();
-#if 0
-                if(task != no_more_tasks_sentinel && task != end) {
-                  std::printf("task stolen on rank %d\n", team_exec.league_rank());
-                }
-#endif
               }
 
               // If still tasks are still executing
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
index f13875b440..7ff885ed86 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
@@ -77,9 +77,10 @@ namespace Kokkos {
 namespace Impl {
 
 void OpenMPTargetExec::verify_is_process(const char* const label) {
-  if (omp_in_parallel()) {
+  // Fails if the current task is in a parallel region or is not on the host.
+  if (omp_in_parallel() && (!omp_is_initial_device())) {
     std::string msg(label);
-    msg.append(" ERROR: in parallel");
+    msg.append(" ERROR: in parallel or on device");
     Kokkos::Impl::throw_runtime_exception(msg);
   }
 }
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
index 0b65e0d4a4..ccfc756213 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
@@ -54,7 +54,10 @@
 // FIXME_OPENMPTARGET - Using this macro to implement a workaround for
 // hierarchical reducers. It avoids hitting the code path which we wanted to
 // write but doesn't work. undef'ed at the end.
+// Intel compilers prefer the non-workaround version.
+#ifndef KOKKOS_ARCH_INTEL_GPU
 #define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+#endif
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -66,10 +69,6 @@ template <class Reducer>
 struct OpenMPTargetReducerWrapper {
   using value_type = typename Reducer::value_type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   KOKKOS_INLINE_FUNCTION
   static void join(value_type&, const value_type&) {
     printf(
@@ -90,7 +89,6 @@ struct OpenMPTargetReducerWrapper {
         "Using a generic unknown Reducer for the OpenMPTarget backend is not "
         "implemented.");
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -99,10 +97,6 @@ struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) { dest += src; }
@@ -116,7 +110,6 @@ struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::sum();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -125,10 +118,6 @@ struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) { dest *= src; }
@@ -142,7 +131,6 @@ struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::prod();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -151,10 +139,6 @@ struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -170,7 +154,6 @@ struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -179,10 +162,6 @@ struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -199,7 +178,6 @@ struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::max();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -208,10 +186,6 @@ struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
     dest = dest && src;
@@ -226,7 +200,6 @@ struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::land();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -237,10 +210,6 @@ struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -256,7 +225,6 @@ struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::lor();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -265,10 +233,6 @@ struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -284,7 +248,6 @@ struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::band();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -293,10 +256,6 @@ struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -312,7 +271,6 @@ struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::bor();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Index, class Space>
@@ -325,10 +283,6 @@ struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
   // Required
   using value_type = ValLocScalar<scalar_type, index_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -345,7 +299,6 @@ struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
     val.val = reduction_identity<scalar_type>::min();
     val.loc = reduction_identity<index_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Index, class Space>
@@ -358,10 +311,6 @@ struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
   // Required
   using value_type = ValLocScalar<scalar_type, index_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
     if (src.val > dest.val) dest = src;
@@ -377,7 +326,6 @@ struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
     val.val = reduction_identity<scalar_type>::max();
     val.loc = reduction_identity<index_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -389,10 +337,6 @@ struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
   // Required
   using value_type = MinMaxScalar<scalar_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -419,7 +363,6 @@ struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
     val.max_val = reduction_identity<scalar_type>::max();
     val.min_val = reduction_identity<scalar_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Index, class Space>
@@ -432,10 +375,6 @@ struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
   // Required
   using value_type = MinMaxLocScalar<scalar_type, index_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -468,7 +407,6 @@ struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
     val.max_loc = reduction_identity<index_type>::min();
     val.min_loc = reduction_identity<index_type>::min();
   }
-#pragma omp end declare target
 };
 /*
 template<class ReducerType>
@@ -560,47 +498,20 @@ class OpenMPTargetExecTeamMember {
   void* m_glb_scratch;
   void* m_reduce_scratch;
 
-  /*
-  // Fan-in team threads, root of the fan-in which does not block returns true
-  inline
-  bool team_fan_in() const
-    {
-      memory_fence();
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! (
-  m_team_rank_rev & n ) ; n <<= 1 ) {
-
-        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
-      }
-
-      if ( m_team_rank_rev ) {
-        m_exec.state_set( Rendezvous );
-        memory_fence();
-        m_exec.state_wait( Rendezvous );
-      }
-
-      return 0 == m_team_rank_rev ;
-    }
-
-  inline
-  void team_fan_out() const
-    {
-      memory_fence();
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! (
-  m_team_rank_rev & n ) ; n <<= 1 ) { m_exec.pool_rev( m_team_base_rev + j
-  )->state_set( Active ); memory_fence();
-      }
-    }
-  */
  public:
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_shmem() const {
     return m_team_shared.set_team_thread_mode(0, 1, 0);
   }
 
+  // set_team_thread_mode routine parameters for future understanding:
+  // first parameter - scratch level.
+  // second parameter - size multiplier for advancing scratch ptr after a
+  // request was serviced. third parameter - offset size multiplier from current
+  // scratch ptr when returning a ptr for a request.
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_scratch(int level) const {
-    return m_team_shared.set_team_thread_mode(level, 1,
-                                              m_team_scratch_size[level]);
+    return m_team_shared.set_team_thread_mode(level, 1, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -627,8 +538,9 @@ class OpenMPTargetExecTeamMember {
     using type =
         typename std::conditional<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
                                   ValueType, void>::type;
-    type* team_scratch = reinterpret_cast<type*>(
-        ((char*)(m_glb_scratch) + TEAM_REDUCE_SIZE * omp_get_team_num()));
+    type* team_scratch =
+        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
+                                TEAM_REDUCE_SIZE * omp_get_team_num());
 #pragma omp barrier
     if (team_rank() == thread_id) *team_scratch = value;
 #pragma omp barrier
@@ -656,7 +568,8 @@ class OpenMPTargetExecTeamMember {
 
     const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type);
     type* team_scratch =
-        (type*)((char*)m_glb_scratch + TEAM_REDUCE_SIZE * omp_get_team_num());
+        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
+                                TEAM_REDUCE_SIZE * omp_get_team_num());
     for (int i = m_team_rank; i < n_values; i += m_team_size) {
       team_scratch[i] = value_type();
     }
@@ -770,27 +683,24 @@ class OpenMPTargetExecTeamMember {
         m_shmem_block_index(shmem_block_index),
         m_glb_scratch(glb_scratch) {
     const int omp_tid = omp_get_thread_num();
-    m_team_shared     = scratch_memory_space(
-        ((char*)glb_scratch +
-         m_shmem_block_index *
-             (shmem_size_L0 + shmem_size_L1 +
-              ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE)),
-        shmem_size_L0,
-        ((char*)glb_scratch +
-         m_shmem_block_index * (shmem_size_L0 + shmem_size_L1 +
-                                ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
-                                TEAM_REDUCE_SIZE)) +
-            shmem_size_L0 + ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
-            TEAM_REDUCE_SIZE,
-        shmem_size_L1);
-    m_reduce_scratch =
-        (char*)glb_scratch +
-        shmem_block_index *
-            (shmem_size_L0 + shmem_size_L1 +
-             ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE);
-    m_league_rank = league_rank;
-    m_team_rank   = omp_tid;
-    m_vector_lane = 0;
+
+    // The scratch memory allocated is a sum of TEAM_REDUCE_SIZE, L0 shmem size
+    // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for
+    // hierarchical reduction. There is an additional 10% of the requested
+    // scratch memory allocated per team as padding. Hence the product with 0.1.
+    const int reduce_offset =
+        m_shmem_block_index *
+        (shmem_size_L0 + shmem_size_L1 +
+         ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE);
+    const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE;
+    const int l1_offset = l0_offset + shmem_size_L0;
+    m_team_shared       = scratch_memory_space(
+        (static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0,
+        static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1);
+    m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset;
+    m_league_rank    = league_rank;
+    m_team_rank      = omp_tid;
+    m_vector_lane    = 0;
   }
 
   static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; }
@@ -877,6 +787,16 @@ class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
   friend class TeamPolicyInternal;
 
  public:
+  // FIXME_OPENMPTARGET : Currently this routine is a copy of the Cuda
+  // implementation, but this has to be tailored to be architecture specific.
+  inline static int scratch_size_max(int level) {
+    return (
+        level == 0 ? 1024 * 40 :  // 48kB is the max for CUDA, but we need some
+                                  // for team_member.reduce etc.
+            20 * 1024 *
+                1024);  // arbitrarily setting this to 20MB, for a Volta V100
+                        // that would give us about 3.2GB for 2 teams per SM
+  }
   inline bool impl_auto_team_size() const { return m_tune_team_size; }
   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
   inline void impl_set_team_size(const size_t size) { m_team_size = size; }
@@ -884,9 +804,11 @@ class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
     m_tune_vector_length = length;
   }
   inline int impl_vector_length() const { return m_vector_length; }
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED inline int vector_length() const {
     return impl_vector_length();
   }
+#endif
   inline int team_size() const { return m_team_size; }
   inline int league_size() const { return m_league_size; }
   inline size_t scratch_size(const int& level, int team_size_ = -1) const {
@@ -1245,21 +1167,12 @@ KOKKOS_INLINE_FUNCTION
       static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
 
 #pragma omp barrier
-  // These three lines all cause crash
   Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]);
-//  result.init(TeamThread_scratch[0]);
-//  Impl::OpenMPTargetReducerWrapper<ReducerType> red;
-//  red.init(TeamThread_scratch[0]);
 #pragma omp barrier
 
 #pragma omp for reduction(custominner : TeamThread_scratch[:1])
   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    ValueType tmp;
-    result.init(tmp);
-    lambda(i, tmp);
-    // This line causes a crash
-    Impl::OpenMPTargetReducerWrapper<ReducerType>::join(TeamThread_scratch[0],
-                                                        tmp);
+    lambda(i, TeamThread_scratch[0]);
   }
   result.reference() = TeamThread_scratch[0];
 }
@@ -1305,6 +1218,12 @@ KOKKOS_INLINE_FUNCTION
          i += team_size) {
       lambda(i, tmp2);
     }
+
+    // FIXME_OPENMPTARGET: Join should work but doesn't. Every threads gets a
+    // private TeamThread_scratch[0] and at the end of the for-loop the `join`
+    // operation is performed by OpenMP itself and hence the simple assignment
+    // works.
+    //    result.join(TeamThread_scratch[0], tmp2);
     TeamThread_scratch[0] = tmp2;
   }
 
@@ -1336,28 +1255,31 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(
   static_assert(sizeof(ValueType) <=
                 Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
 
+  // FIXME_OPENMPTARGET: Still need to figure out how to get value_count here.
+  const int value_count = 1;
+
 #pragma omp barrier
   TeamThread_scratch[0] = init_result;
 #pragma omp barrier
 
-  if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp for reduction(+ : TeamThread_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamThread_scratch[0] += tmp;
-    }
-  } else {
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-
-#pragma omp for reduction(custom : TeamThread_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      join(TeamThread_scratch[0], tmp);
-    }
+#pragma omp for
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, TeamThread_scratch[omp_get_num_threads() * value_count]);
   }
 
+  // Reduce all partial results within a team.
+  const int team_size      = omp_get_num_threads();
+  int tree_neighbor_offset = 1;
+  do {
+#pragma omp for
+    for (int i = 0; i < team_size - tree_neighbor_offset;
+         i += 2 * tree_neighbor_offset) {
+      const int neighbor = i + tree_neighbor_offset;
+      join(lambda, &TeamThread_scratch[i * value_count],
+           &TeamThread_scratch[neighbor * value_count]);
+    }
+    tree_neighbor_offset *= 2;
+  } while (tree_neighbor_offset < team_size);
   init_result = TeamThread_scratch[0];
 }
 
@@ -1402,7 +1324,6 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
 }
 
 }  // namespace Kokkos
-#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 
 namespace Kokkos {
 /** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each
@@ -1530,8 +1451,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
 #pragma ivdep
 #endif
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
     lambda(i, scan_val, true);
   }
 }
@@ -1629,9 +1549,7 @@ KOKKOS_INLINE_FUNCTION
 
 #pragma omp for simd reduction(custom : TeamVector_scratch[:1])
   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    ValueType tmp = ValueType();
-    lambda(i, tmp);
-    TeamVector_scratch[0] += tmp;
+    lambda(i, TeamVector_scratch[0]);
   }
 
   result.reference() = TeamVector_scratch[0];
@@ -1686,7 +1604,9 @@ KOKKOS_INLINE_FUNCTION
 #endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 #undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+#endif
 
 namespace Kokkos {
 
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index 4a79b72732..e421edc5b4 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -59,7 +59,34 @@
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
-void OpenMPTargetInternal::fence() {}
+uint32_t OpenMPTargetInternal::impl_get_instance_id() const noexcept {
+  return m_instance_id;
+}
+
+void OpenMPTargetInternal::fence(openmp_fence_is_static is_static) {
+  fence(
+      "Kokkos::Experimental::Impl::OpenMPTargetInternal::fence: Unnamed "
+      "Internal Fence",
+      is_static);
+}
+void OpenMPTargetInternal::fence(const std::string& name,
+                                 openmp_fence_is_static is_static) {
+  if (is_static == openmp_fence_is_static::no) {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<
+        Kokkos::Experimental::OpenMPTarget>(
+        name,
+        Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+            impl_get_instance_id()},
+        [&]() {});
+  } else {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<
+        Kokkos::Experimental::OpenMPTarget>(
+        name,
+        Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+            GlobalDeviceSynchronization,
+        [&]() {});
+  }
+}
 int OpenMPTargetInternal::concurrency() { return 128000; }
 const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; }
 void OpenMPTargetInternal::print_configuration(std::ostream& /*stream*/,
@@ -77,7 +104,18 @@ void OpenMPTargetInternal::impl_finalize() {
     Kokkos::kokkos_free<Kokkos::Experimental::OpenMPTargetSpace>(
         space.m_uniquetoken_ptr);
 }
-void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; }
+void OpenMPTargetInternal::impl_initialize() {
+  m_is_initialized = true;
+
+  // FIXME_OPENMPTARGET:  Only fix the number of teams for NVIDIA architectures
+  // from Pascal and upwards.
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
+  omp_set_num_teams(512);
+#endif
+#endif
+}
 int OpenMPTargetInternal::impl_is_initialized() {
   return m_is_initialized ? 1 : 0;
 }
@@ -100,11 +138,28 @@ void OpenMPTarget::print_configuration(std::ostream& stream,
   m_space_instance->print_configuration(stream, detail);
 }
 
+uint32_t OpenMPTarget::impl_instance_id() const noexcept {
+  return m_space_instance->impl_get_instance_id();
+}
+
 int OpenMPTarget::concurrency() {
   return Impl::OpenMPTargetInternal::impl_singleton()->concurrency();
 }
 void OpenMPTarget::fence() {
-  Impl::OpenMPTargetInternal::impl_singleton()->fence();
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(
+      "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence");
+}
+void OpenMPTarget::fence(const std::string& name) {
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(name);
+}
+void OpenMPTarget::impl_static_fence() {
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(
+      "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence",
+      Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
+}
+void OpenMPTarget::impl_static_fence(const std::string& name) {
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(
+      name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
 }
 
 void OpenMPTarget::impl_initialize() { m_space_instance->impl_initialize(); }
@@ -146,7 +201,10 @@ void OpenMPTargetSpaceInitializer::finalize(const bool all_spaces) {
 }
 
 void OpenMPTargetSpaceInitializer::fence() {
-  Kokkos::Experimental::OpenMPTarget::fence();
+  Kokkos::Experimental::OpenMPTarget::impl_static_fence();
+}
+void OpenMPTargetSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Experimental::OpenMPTarget::impl_static_fence(name);
 }
 
 void OpenMPTargetSpaceInitializer::print_configuration(std::ostream& msg,
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
index a1caf90c19..b495771190 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
@@ -51,6 +51,8 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
+enum class openmp_fence_is_static { yes, no };
+
 class OpenMPTargetInternal {
  private:
   OpenMPTargetInternal()                            = default;
@@ -58,7 +60,9 @@ class OpenMPTargetInternal {
   OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = default;
 
  public:
-  void fence();
+  void fence(openmp_fence_is_static is_static = openmp_fence_is_static::no);
+  void fence(const std::string& name,
+             openmp_fence_is_static is_static = openmp_fence_is_static::no);
 
   /** \brief  Return the maximum amount of concurrency.  */
   int concurrency();
@@ -73,14 +77,16 @@ class OpenMPTargetInternal {
 
   //! Has been initialized
   int impl_is_initialized();
-
+  uint32_t impl_get_instance_id() const noexcept;
   //! Initialize, telling the CUDA run-time library which device to use.
   void impl_initialize();
 
   static OpenMPTargetInternal* impl_singleton();
 
  private:
-  bool m_is_initialized = false;
+  bool m_is_initialized  = false;
+  uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
+      Kokkos::Experimental::OpenMPTarget>(reinterpret_cast<uintptr_t>(this));
 };
 }  // Namespace Impl
 }  // Namespace Experimental
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
index a4092c3a37..08a3109408 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@@ -51,8 +51,6 @@
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
-#define KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
-
 namespace Kokkos {
 namespace Impl {
 
@@ -69,24 +67,10 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
   const Policy m_policy;
 
  public:
-  inline void execute() const { execute_impl<WorkTag>(); }
-  /*
-    template <class TagType>
-    inline typename std::enable_if<std::is_same<TagType, void>::value>::type
-    execute_impl() const {
-      OpenMPTargetExec::verify_is_process(
-          "Kokkos::Experimental::OpenMPTarget parallel_for");
-      OpenMPTargetExec::verify_initialized(
-          "Kokkos::Experimental::OpenMPTarget parallel_for");
-      const typename Policy::member_type begin = m_policy.begin();
-      const typename Policy::member_type end   = m_policy.end();
+  void execute() const { execute_impl<WorkTag>(); }
 
-  #pragma omp target teams distribute parallel for map(to: this->m_functor)
-      for (int i = begin; i < end; i++) m_functor(i);
-    }
-  */
   template <class TagType>
-  inline void execute_impl() const {
+  void execute_impl() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -98,16 +82,17 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
 
     FunctorType a_functor(m_functor);
 
-    if constexpr (std::is_same<TagType, void>::value) {
 #pragma omp target teams distribute parallel for map(to : a_functor)
-      for (auto i = begin; i < end; i++) a_functor(i);
-    } else {
-#pragma omp target teams distribute parallel for map(to : a_functor)
-      for (auto i = begin; i < end; i++) a_functor(TagType(), i);
+    for (auto i = begin; i < end; ++i) {
+      if constexpr (std::is_same<TagType, void>::value) {
+        a_functor(i);
+      } else {
+        a_functor(TagType(), i);
+      }
     }
   }
 
-  inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
+  ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
 };
 
@@ -120,12 +105,31 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
 namespace Kokkos {
 namespace Impl {
 
+// This class has the memcpy routine that is commonly used by ParallelReduce
+// over RangePolicy and TeamPolicy.
+template <class PointerType>
+struct ParallelReduceCommon {
+  // Copy the result back to device if the view is on the device.
+  static void memcpy_result(PointerType dest, PointerType src, size_t size,
+                            bool ptr_on_device) {
+    if (ptr_on_device) {
+      OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0,
+                                       omp_get_default_device(),
+                                       omp_get_initial_device()));
+    } else {
+      *dest = *src;
+    }
+  }
+};
+
 template <class FunctorType, class PolicyType, class ReducerType,
-          class PointerType, class ValueType, bool FunctorHasJoin,
-          bool UseReducerType>
+          class PointerType, class ValueType>
 struct ParallelReduceSpecialize {
-  static inline void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
+  inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
                              PointerType /*result_ptr*/) {
+    constexpr int FunctorHasJoin = ReduceFunctorHasJoin<FunctorType>::value;
+    constexpr int UseReducerType = is_reducer_type<ReducerType>::value;
+
     std::stringstream error_message;
     error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' '
                   << UseReducerType << '\n';
@@ -137,12 +141,26 @@ struct ParallelReduceSpecialize {
 template <class FunctorType, class ReducerType, class PointerType,
           class ValueType, class... PolicyArgs>
 struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, false,
-                                false> {
+                                ReducerType, PointerType, ValueType> {
   using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+  using TagType    = typename PolicyType::work_tag;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
+  using WorkTagFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, TagType,
+                         void>;
+
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit     = Kokkos::Impl::FunctorValueInit<FunctorType, TagType>;
+  using ValueJoin     = Kokkos::Impl::FunctorValueJoin<FunctorType, TagType>;
+  using ReferenceType = typename ValueTraits::reference_type;
+
+  using ParReduceCommon = ParallelReduceCommon<PointerType>;
+
+  static void execute_reducer(const FunctorType& f, const PolicyType& p,
+                              PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -153,69 +171,220 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
     if (end <= begin) return;
 
     ValueType result = ValueType();
-    if constexpr (std::is_same<TagType, void>::value) {
-#pragma omp target teams distribute parallel for num_teams(512) \
-                map(to:f) map(tofrom:result) reduction(+: result)
-      for (auto i = begin; i < end; i++) f(i, result);
-    } else {
-#pragma omp target teams distribute parallel for num_teams(512) \
-                map(to:f) map(tofrom:result) reduction(+: result)
-      for (auto i = begin; i < end; i++) f(TagType(), i, result);
-    }
 
-    *result_ptr = result;
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
-  }
-};
-
-template <class FunctorType, class PolicyType, class ReducerType,
-          class PointerType, class ValueType>
-struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType,
-                                PointerType, ValueType, false, true> {
 #pragma omp declare reduction(                                         \
     custom:ValueType                                                   \
     : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
     initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
 
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+    OpenMPTargetReducerWrapper<ReducerType>::init(result);
+#pragma omp target teams distribute parallel for map(to                    \
+                                                     : f) reduction(custom \
+                                                                    : result)
+    for (auto i = begin; i < end; ++i) {
+      if constexpr (std::is_same<TagType, void>::value) {
+        f(i, result);
+      } else {
+        f(TagType(), i, result);
+      }
+    }
+
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+  }
+
+  template <class TagType, int NumReductions>
+  static void execute_array(const FunctorType& f, const PolicyType& p,
+                            PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const typename PolicyType::member_type begin = p.begin();
-    const typename PolicyType::member_type end   = p.end();
+    const auto begin = p.begin();
+    const auto end   = p.end();
 
     if (end <= begin) return;
 
     ValueType result = ValueType();
-    OpenMPTargetReducerWrapper<ReducerType>::init(result);
 
-    if constexpr (std::is_same<TagType, void>::value) {
-#pragma omp target teams distribute parallel for num_teams(512) map(to   \
-                                                                    : f) \
-    reduction(custom                                                     \
-              : result)
-      for (auto i = begin; i < end; i++) f(i, result);
-      *result_ptr = result;
+    // Enter the loop if the reduction is on a scalar type.
+    if constexpr (NumReductions == 1) {
+      // Case where reduction is on a native data type.
+      if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp target teams distribute parallel for \
+         map(to:f) reduction(+: result)
+        for (auto i = begin; i < end; ++i)
+
+          if constexpr (std::is_same<TagType, void>::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+      } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+#pragma omp target teams distribute parallel for map(to                    \
+                                                     : f) reduction(custom \
+                                                                    : result)
+        for (auto i = begin; i < end; ++i)
+
+          if constexpr (std::is_same<TagType, void>::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+      }
     } else {
-#pragma omp target teams distribute parallel for num_teams(512) map(to   \
-                                                                    : f) \
-    reduction(custom                                                     \
-              : result)
-      for (auto i = begin; i < end; i++) f(TagType(), i, result);
-      *result_ptr = result;
+#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions])
+      for (auto i = begin; i < end; ++i) {
+        if constexpr (std::is_same<TagType, void>::value) {
+          f(i, result);
+        } else {
+          f(TagType(), i, result);
+        }
+      }
     }
+
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
   }
 
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
+  static void execute_init_join(const FunctorType& f, const PolicyType& p,
+                                PointerType ptr, const bool ptr_on_device) {
+    const auto begin = p.begin();
+    const auto end   = p.end();
+
+    constexpr int HasInit = ReduceFunctorHasInit<FunctorType>::value;
+
+    // Initialize the result pointer.
+
+    const auto size = end - begin;
+
+    // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently
+    // based on NVIDIA-V100 and should be modifid to be based on the
+    // architecture in the future.
+    const int max_team_threads = 32;
+    const int max_teams =
+        OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads;
+    // Number of elements in the reduction
+    const auto value_count =
+        FunctorValueTraits<FunctorType, TagType>::value_count(f);
+
+    // Allocate scratch per active thread. Achieved by setting the first
+    // parameter of `resize_scratch=1`.
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType));
+    ValueType* scratch_ptr =
+        static_cast<ValueType*>(OpenMPTargetExec::get_scratch_ptr());
+
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+    {
+      // Enter this loop if the functor has an `init`
+      if constexpr (HasInit) {
+        // The `init` routine needs to be called on the device since it might
+        // need device members.
+        ValueInit::init(f, scratch_ptr);
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      } else {
+        for (int i = 0; i < value_count; ++i) {
+          static_cast<ValueType*>(scratch_ptr)[i] = ValueType();
+        }
+
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      }
+    }
+
+    if (end <= begin) {
+      // If there is no work to be done, copy back the initialized values and
+      // exit.
+      if (!ptr_on_device)
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_initial_device(), omp_get_default_device()));
+      else
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_default_device(), omp_get_default_device()));
+
+      return;
+    }
+
+#pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \
+    map(to                                                                   \
+        : f) is_device_ptr(scratch_ptr)
+    {
+#pragma omp parallel
+      {
+        const int team_num    = omp_get_team_num();
+        const int num_teams   = omp_get_num_teams();
+        const auto chunk_size = size / num_teams;
+        const auto team_begin = begin + team_num * chunk_size;
+        const auto team_end =
+            (team_num == num_teams - 1) ? end : (team_begin + chunk_size);
+        ValueType* team_scratch =
+            scratch_ptr + team_num * max_team_threads * value_count;
+        ReferenceType result = ValueInit::init(
+            f, &team_scratch[omp_get_thread_num() * value_count]);
+
+        // Accumulate partial results in thread specific storage.
+#pragma omp for simd
+        for (auto i = team_begin; i < team_end; ++i) {
+          if constexpr (std::is_same<TagType, void>::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+        }
+
+        // Reduce all paritial results within a team.
+        const int team_size      = max_team_threads;
+        int tree_neighbor_offset = 1;
+        do {
+#pragma omp for simd
+          for (int i = 0; i < team_size - tree_neighbor_offset;
+               i += 2 * tree_neighbor_offset) {
+            const int neighbor = i + tree_neighbor_offset;
+            ValueJoin::join(f, &team_scratch[i * value_count],
+                            &team_scratch[neighbor * value_count]);
+          }
+          tree_neighbor_offset *= 2;
+        } while (tree_neighbor_offset < team_size);
+      }  // end parallel
+    }    // end target
+
+    int tree_neighbor_offset = 1;
+    do {
+#pragma omp target teams distribute parallel for simd map(to   \
+                                                          : f) \
+    is_device_ptr(scratch_ptr)
+      for (int i = 0; i < max_teams - tree_neighbor_offset;
+           i += 2 * tree_neighbor_offset) {
+        ValueType* team_scratch = scratch_ptr;
+        const int team_offset   = max_team_threads * value_count;
+        ValueJoin::join(
+            f, &team_scratch[i * team_offset],
+            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
+
+        // If `final` is provided by the functor.
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value) {
+          // Do the final only once at the end.
+          if (tree_neighbor_offset * 2 >= max_teams &&
+              omp_get_team_num() == 0 && omp_get_thread_num() == 0)
+            FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        }
+      }
+      tree_neighbor_offset *= 2;
+    } while (tree_neighbor_offset < max_teams);
+
+    // If the result view is on the host, copy back the values via memcpy.
+    if (!ptr_on_device)
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_initial_device(), omp_get_default_device()));
+    else
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_default_device(), omp_get_default_device()));
   }
 };
 
@@ -227,47 +396,77 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
   using WorkTag   = typename Policy::work_tag;
   using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
 
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
   using WorkTagFwd =
       std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                          void>;
 
-  // Static Assert WorkTag void if ReducerType not InvalidType
-
   using ValueTraits =
       Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-
-  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
-  enum { UseReducer = is_reducer_type<ReducerType>::value };
 
   using pointer_type   = typename ValueTraits::pointer_type;
   using reference_type = typename ValueTraits::reference_type;
 
+  static constexpr int HasJoin    = ReduceFunctorHasJoin<FunctorType>::value;
+  static constexpr int UseReducer = is_reducer_type<ReducerType>::value;
+  static constexpr int IsArray    = std::is_pointer<reference_type>::value;
+
   using ParReduceSpecialize =
       ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
-                               typename ValueTraits::value_type, HasJoin,
-                               UseReducer>;
+                               typename ValueTraits::value_type>;
 
   const FunctorType m_functor;
   const Policy m_policy;
   const ReducerType m_reducer;
   const pointer_type m_result_ptr;
+  bool m_result_ptr_on_device;
+  const int m_result_ptr_num_elems;
+  using TagType = typename Policy::work_tag;
 
  public:
-  inline void execute() const {
-    ParReduceSpecialize::execute(m_functor, m_policy, m_result_ptr);
+  void execute() const {
+    if constexpr (HasJoin) {
+      // Enter this loop if the Functor has a init-join.
+      ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr,
+                                             m_result_ptr_on_device);
+    } else if constexpr (UseReducer) {
+      // Enter this loop if the Functor is a reducer type.
+      ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr,
+                                           m_result_ptr_on_device);
+    } else if constexpr (IsArray) {
+      // Enter this loop if the reduction is on an array and the routine is
+      // templated over the size of the array.
+      if (m_result_ptr_num_elems <= 2) {
+        ParReduceSpecialize::template execute_array<TagType, 2>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 4) {
+        ParReduceSpecialize::template execute_array<TagType, 4>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 8) {
+        ParReduceSpecialize::template execute_array<TagType, 8>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 16) {
+        ParReduceSpecialize::template execute_array<TagType, 16>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 32) {
+        ParReduceSpecialize::template execute_array<TagType, 32>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else {
+        Kokkos::abort("array reduction length must be <= 32");
+      }
+    } else {
+      // This loop handles the basic scalar reduction.
+      ParReduceSpecialize::template execute_array<TagType, 1>(
+          m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+    }
   }
 
   template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType& arg_functor, Policy arg_policy,
+  ParallelReduce(
+      const FunctorType& arg_functor, Policy& arg_policy,
       const ViewType& arg_result_view,
       typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                   !Kokkos::is_reducer_type<ReducerType>::value,
@@ -275,14 +474,23 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {}
+        m_result_ptr(arg_result_view.data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_num_elems(arg_result_view.size()) {}
 
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
+  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
+                 const ReducerType& reducer)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {}
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_result_ptr_num_elems(reducer.view().size()) {}
 };
 
 }  // namespace Impl
@@ -318,20 +526,20 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   const Policy m_policy;
 
   template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
+  typename std::enable_if<std::is_same<TagType, void>::value>::type
   call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
                 const bool& is_final) const {
     f(idx, val, is_final);
   }
   template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
+  typename std::enable_if<!std::is_same<TagType, void>::value>::type
   call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
                 const bool& is_final) const {
     f(WorkTag(), idx, val, is_final);
   }
 
  public:
-  inline void impl_execute(
+  void impl_execute(
       Kokkos::View<value_type**, Kokkos::LayoutRight,
                    Kokkos::Experimental::OpenMPTargetSpace>
           element_values,
@@ -349,13 +557,13 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 #pragma omp target teams distribute map(to                             \
                                         : a_functor) num_teams(nteams) \
     thread_limit(team_size)
-    for (idx_type team_id = 0; team_id < n_chunks; team_id++) {
+    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
 #pragma omp parallel num_threads(team_size)
       {
         const idx_type local_offset = team_id * chunk_size;
 
 #pragma omp for
-        for (idx_type i = 0; i < chunk_size; i++) {
+        for (idx_type i = 0; i < chunk_size; ++i) {
           const idx_type idx = local_offset + i;
           value_type val;
           ValueInit::init(a_functor, &val);
@@ -366,7 +574,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
         if (omp_get_thread_num() == 0) {
           value_type sum;
           ValueInit::init(a_functor, &sum);
-          for (idx_type i = 0; i < chunk_size; i++) {
+          for (idx_type i = 0; i < chunk_size; ++i) {
             ValueJoin::join(a_functor, &sum, &element_values(team_id, i));
             element_values(team_id, i) = sum;
           }
@@ -377,7 +585,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
           if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) {
             value_type sum;
             ValueInit::init(a_functor, &sum);
-            for (idx_type i = 0; i < n_chunks; i++) {
+            for (idx_type i = 0; i < n_chunks; ++i) {
               ValueJoin::join(a_functor, &sum, &chunk_values(i));
               chunk_values(i) = sum;
             }
@@ -389,7 +597,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 #pragma omp target teams distribute map(to                             \
                                         : a_functor) num_teams(nteams) \
     thread_limit(team_size)
-    for (idx_type team_id = 0; team_id < n_chunks; team_id++) {
+    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
 #pragma omp parallel num_threads(team_size)
       {
         const idx_type local_offset = team_id * chunk_size;
@@ -400,7 +608,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
           ValueInit::init(a_functor, &offset_value);
 
 #pragma omp for
-        for (idx_type i = 0; i < chunk_size; i++) {
+        for (idx_type i = 0; i < chunk_size; ++i) {
           const idx_type idx = local_offset + i;
           value_type local_offset_value;
           if (i > 0) {
@@ -415,7 +623,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
     }
   }
 
-  inline void execute() const {
+  void execute() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -438,7 +646,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 
   //----------------------------------------
 
-  inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
+  ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
 
   //----------------------------------------
@@ -455,7 +663,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   value_type& m_returnvalue;
 
  public:
-  inline void execute() const {
+  void execute() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -513,7 +721,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const int m_shmem_size;
 
  public:
-  inline void execute() const {
+  void execute() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -523,7 +731,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
  private:
   template <class TagType>
-  inline void execute_impl() const {
+  void execute_impl() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -549,7 +757,6 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
-#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
 // Performing our own scheduling of teams to avoid separation of code between
 // teams-distribute and parallel. Gave a 2x performance boost in test cases with
 // the clang compiler. atomic_compare_exchange can be avoided since the standard
@@ -580,49 +787,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
       } else
         Kokkos::abort("`num_teams` clause was not respected.\n");
     }
-
-#else
-// Saving the older implementation that uses `atomic_compare_exchange` to
-// calculate the shared memory block index and `distribute` clause to distribute
-// teams.
-#pragma omp target teams distribute map(to                   \
-                                        : a_functor)         \
-    is_device_ptr(scratch_ptr, lock_array) num_teams(nteams) \
-        thread_limit(team_size)
-    for (int i = 0; i < league_size; i++) {
-      int shmem_block_index = -1, lock_team = 99999, iter = -1;
-      iter = (omp_get_team_num() % max_active_teams);
-
-      // Loop as long as a shmem_block_index is not found.
-      while (shmem_block_index == -1) {
-        // Try and acquire a lock on the index.
-        lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1);
-
-        // If lock is acquired assign it to the block index.
-        // lock_team = 0, implies atomic_compare_exchange is successfull.
-        if (lock_team == 0)
-          shmem_block_index = iter;
-        else
-          iter = ++iter % max_active_teams;
-      }
-
-#pragma omp parallel num_threads(team_size)
-      {
-        typename Policy::member_type team(
-            i, league_size, team_size, vector_length, scratch_ptr,
-            shmem_block_index, shmem_size_L0, shmem_size_L1);
-        m_functor(team);
-      }
-
-      // Free the locked block and increment the number of available free
-      // blocks.
-      lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0);
-    }
-#endif
   }
 
  public:
-  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
@@ -633,13 +801,26 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 template <class FunctorType, class ReducerType, class PointerType,
           class ValueType, class... PolicyArgs>
 struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, false,
-                                false> {
+                                ReducerType, PointerType, ValueType> {
   using PolicyType = TeamPolicyInternal<PolicyArgs...>;
+  using TagType    = typename PolicyType::work_tag;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
+  using WorkTagFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, TagType,
+                         void>;
 
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit     = Kokkos::Impl::FunctorValueInit<FunctorType, TagType>;
+  using ValueJoin     = Kokkos::Impl::FunctorValueJoin<FunctorType, TagType>;
+  using ReferenceType = typename ValueTraits::reference_type;
+
+  using ParReduceCommon = ParallelReduceCommon<PointerType>;
+
+  static void execute_reducer(const FunctorType& f, const PolicyType& p,
+                              PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -662,112 +843,10 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
-#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
-                                                                       : f) \
-    is_device_ptr(scratch_ptr) reduction(+: result)
-#pragma omp parallel reduction(+ : result)
-    {
-      const int blockIdx = omp_get_team_num();
-      const int gridDim  = omp_get_num_teams();
-
-      // Guarantee that the compilers respect the `num_teams` clause
-      if (gridDim <= nteams) {
-        for (int league_id = blockIdx; league_id < league_size;
-             league_id += gridDim) {
-          typename PolicyType::member_type team(
-              league_id, league_size, team_size, vector_length, scratch_ptr,
-              blockIdx, shmem_size_L0, shmem_size_L1);
-          if constexpr (std::is_same<TagType, void>::value)
-            f(team, result);
-          else
-            f(TagType(), team, result);
-        }
-      } else
-        Kokkos::abort("`num_teams` clause was not respected.\n");
-    }
-
-    *result_ptr = result;
-#else
-// Saving the older implementation that uses `atomic_compare_exchange` to
-// calculate the shared memory block index and `distribute` clause to distribute
-// teams.
-#pragma omp target teams distribute num_teams(nteams) thread_limit(team_size) \
-         map(to:f) map(tofrom:result) reduction(+: result) \
-    is_device_ptr(scratch_ptr, lock_array)
-    for (int i = 0; i < league_size; i++) {
-      ValueType inner_result = ValueType();
-      int shmem_block_index = -1, lock_team = 99999, iter = -1;
-      iter = (omp_get_team_num() % max_active_teams);
-
-      // Loop as long as a shmem_block_index is not found.
-      while (shmem_block_index == -1) {
-        // Try and acquire a lock on the index.
-        lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1);
-
-        // If lock is acquired assign it to the block index.
-        // lock_team = 0, implies atomic_compare_exchange is successfull.
-        if (lock_team == 0)
-          shmem_block_index = iter;
-        else
-          iter = ++iter % max_active_teams;
-      }
-#pragma omp parallel num_threads(team_size) reduction(+ : inner_result)
-      {
-        typename PolicyType::member_type team(
-            i, league_size, team_size, vector_length, scratch_ptr,
-            shmem_block_index, shmem_size_L0, shmem_size_L1);
-        f(team, inner_result);
-      }
-      result = inner_result;
-
-      // Free the locked block and increment the number of available free
-      // blocks.
-      lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0);
-    }
-
-    *result_ptr = result;
-#endif
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
-  }
-};
-
-template <class FunctorType, class ReducerType, class PointerType,
-          class ValueType, class... PolicyArgs>
-struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, false,
-                                true> {
-  using PolicyType = TeamPolicyInternal<PolicyArgs...>;
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-
 #pragma omp declare reduction(                                         \
     custom:ValueType                                                   \
     : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
     initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
-    const int league_size      = p.league_size();
-    const int team_size        = p.team_size();
-    const int vector_length    = p.impl_vector_length();
-    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
-    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
-    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1);
-    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
-
-    ValueType result = ValueType();
-
-    // Maximum active teams possible.
-    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
-    const auto nteams =
-        league_size < max_active_teams ? league_size : max_active_teams;
 
 #pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
                                                                        : f) \
@@ -794,12 +873,259 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
         Kokkos::abort("`num_teams` clause was not respected.\n");
     }
 
-    *result_ptr = result;
+    // Copy results back to device if `parallel_reduce` is on a device view.
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
   }
 
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
+  template <int NumReductions>
+  static void execute_array(const FunctorType& f, const PolicyType& p,
+                            PointerType result_ptr, bool ptr_on_device) {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+    OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
+                                     shmem_size_L0, shmem_size_L1);
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+    ValueType result = ValueType();
+
+    // Maximum active teams possible.
+    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
+    const auto nteams =
+        league_size < max_active_teams ? league_size : max_active_teams;
+
+    // Case where the number of reduction items is 1.
+    if constexpr (NumReductions == 1) {
+      // Case where reduction is on a native data type.
+      if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(+: result)
+#pragma omp parallel reduction(+ : result)
+        {
+          const int blockIdx = omp_get_team_num();
+          const int gridDim  = omp_get_num_teams();
+
+          // Guarantee that the compilers respect the `num_teams` clause
+          if (gridDim <= nteams) {
+            for (int league_id = blockIdx; league_id < league_size;
+                 league_id += gridDim) {
+              typename PolicyType::member_type team(
+                  league_id, league_size, team_size, vector_length, scratch_ptr,
+                  blockIdx, shmem_size_L0, shmem_size_L1);
+              if constexpr (std::is_same<TagType, void>::value)
+                f(team, result);
+              else
+                f(TagType(), team, result);
+            }
+          } else
+            Kokkos::abort("`num_teams` clause was not respected.\n");
+        }
+      } else {
+        // Case where the reduction is on a non-native data type.
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(custom                             \
+                                         : result)
+#pragma omp parallel reduction(custom : result)
+        {
+          const int blockIdx = omp_get_team_num();
+          const int gridDim  = omp_get_num_teams();
+
+          // Guarantee that the compilers respect the `num_teams` clause
+          if (gridDim <= nteams) {
+            for (int league_id = blockIdx; league_id < league_size;
+                 league_id += gridDim) {
+              typename PolicyType::member_type team(
+                  league_id, league_size, team_size, vector_length, scratch_ptr,
+                  blockIdx, shmem_size_L0, shmem_size_L1);
+              if constexpr (std::is_same<TagType, void>::value)
+                f(team, result);
+              else
+                f(TagType(), team, result);
+            }
+          } else
+            Kokkos::abort("`num_teams` clause was not respected.\n");
+        }
+      }
+    } else {
+      // Case where the reduction is on an array.
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions])
+#pragma omp parallel reduction(+ : result[:NumReductions])
+      {
+        const int blockIdx = omp_get_team_num();
+        const int gridDim  = omp_get_num_teams();
+
+        // Guarantee that the compilers respect the `num_teams` clause
+        if (gridDim <= nteams) {
+          for (int league_id = blockIdx; league_id < league_size;
+               league_id += gridDim) {
+            typename PolicyType::member_type team(
+                league_id, league_size, team_size, vector_length, scratch_ptr,
+                blockIdx, shmem_size_L0, shmem_size_L1);
+            if constexpr (std::is_same<TagType, void>::value)
+              f(team, result);
+            else
+              f(TagType(), team, result);
+          }
+        } else
+          Kokkos::abort("`num_teams` clause was not respected.\n");
+      }
+    }
+
+    // Copy results back to device if `parallel_reduce` is on a device view.
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+  }
+
+  // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over
+  // RangePolicy. Need a new implementation.
+  static void execute_init_join(const FunctorType& f, const PolicyType& p,
+                                PointerType ptr, const bool ptr_on_device) {
+    const auto begin      = p.begin();
+    const auto end        = p.end();
+    constexpr int HasInit = ReduceFunctorHasInit<FunctorType>::value;
+
+    const auto size = end - begin;
+
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+
+    // FIXME_OPENMPTARGET: This would oversubscribe scratch memory since we are
+    // already using the available scratch memory to create temporaries for each
+    // thread.
+    if constexpr ((shmem_size_L0 + shmem_size_L1) > 0) {
+      Kokkos::abort(
+          "OpenMPTarget: Scratch memory is not supported in `parallel_reduce` "
+          "over functors with init/join.");
+    }
+
+    // Maximum active teams possible.
+    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
+    const auto nteams =
+        league_size < max_active_teams ? league_size : max_active_teams;
+
+    // Number of elements in the reduction
+    const auto value_count =
+        FunctorValueTraits<FunctorType, TagType>::value_count(f);
+
+    // Allocate scratch per active thread.
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType));
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+    // Enter this loop if the functor has an `init`
+    if constexpr (HasInit) {
+      // The `init` routine needs to be called on the device since it might need
+      // device members.
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+      {
+        ValueInit::init(f, scratch_ptr);
+
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      }
+    } else {
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+      {
+        for (int i = 0; i < value_count; ++i) {
+          static_cast<ValueType*>(scratch_ptr)[i] = ValueType();
+        }
+
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      }
+    }
+
+    if (end <= begin) {
+      // If there is no work to be done, copy back the initialized values and
+      // exit.
+      if (!ptr_on_device)
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_initial_device(), omp_get_default_device()));
+      else
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_default_device(), omp_get_default_device()));
+
+      return;
+    }
+
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr)
+    {
+#pragma omp parallel
+      {
+        const int team_num      = omp_get_team_num();
+        const int num_teams     = omp_get_num_teams();
+        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr) +
+                                  team_num * team_size * value_count;
+        ReferenceType result = ValueInit::init(f, &team_scratch[0]);
+
+        for (int league_id = team_num; league_id < league_size;
+             league_id += num_teams) {
+          typename PolicyType::member_type team(
+              league_id, league_size, team_size, vector_length, scratch_ptr,
+              team_num, shmem_size_L0, shmem_size_L1);
+          if constexpr (std::is_same<TagType, void>::value) {
+            f(team, result);
+          } else {
+            f(TagType(), team, result);
+          }
+        }
+      }  // end parallel
+    }    // end target
+
+    int tree_neighbor_offset = 1;
+    do {
+#pragma omp target teams distribute parallel for simd map(to   \
+                                                          : f) \
+    is_device_ptr(scratch_ptr)
+      for (int i = 0; i < nteams - tree_neighbor_offset;
+           i += 2 * tree_neighbor_offset) {
+        ValueType* team_scratch = scratch_ptr;
+        const int team_offset   = team_size * value_count;
+        ValueJoin::join(
+            f, &team_scratch[i * team_offset],
+            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
+
+        // If `final` is provided by the functor.
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value) {
+          // Do the final only once at the end.
+          if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 &&
+              omp_get_thread_num() == 0)
+            FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        }
+      }
+      tree_neighbor_offset *= 2;
+    } while (tree_neighbor_offset < nteams);
+
+    // If the result view is on the host, copy back the values via memcpy.
+    if (!ptr_on_device)
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_initial_device(), omp_get_default_device()));
+    else
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_default_device(), omp_get_default_device()));
   }
 };
 
@@ -813,11 +1139,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
   using WorkTag = typename Policy::work_tag;
   using Member  = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
   using WorkTagFwd =
       std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                          void>;
@@ -831,13 +1155,16 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using reference_type = typename ValueTraits::reference_type;
   using value_type     = typename ValueTraits::value_type;
 
-  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
-  enum { UseReducer = is_reducer_type<ReducerType>::value };
+  bool m_result_ptr_on_device;
+  const int m_result_ptr_num_elems;
 
-  using ParForSpecialize =
+  static constexpr int HasJoin    = ReduceFunctorHasJoin<FunctorType>::value;
+  static constexpr int UseReducer = is_reducer_type<ReducerType>::value;
+  static constexpr int IsArray    = std::is_pointer<reference_type>::value;
+
+  using ParReduceSpecialize =
       ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
-                               typename ValueTraits::value_type, HasJoin,
-                               UseReducer>;
+                               typename ValueTraits::value_type>;
 
   const FunctorType m_functor;
   const Policy m_policy;
@@ -846,18 +1173,50 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const int m_shmem_size;
 
  public:
-  inline void execute() const {
-    ParForSpecialize::execute(m_functor, m_policy, m_result_ptr);
+  void execute() const {
+    if constexpr (HasJoin) {
+      ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr,
+                                             m_result_ptr_on_device);
+    } else if constexpr (UseReducer) {
+      ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr,
+                                           m_result_ptr_on_device);
+    } else if constexpr (IsArray) {
+      if (m_result_ptr_num_elems <= 2) {
+        ParReduceSpecialize::template execute_array<2>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 4) {
+        ParReduceSpecialize::template execute_array<4>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 8) {
+        ParReduceSpecialize::template execute_array<8>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 16) {
+        ParReduceSpecialize::template execute_array<16>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 32) {
+        ParReduceSpecialize::template execute_array<32>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else {
+        Kokkos::abort("array reduction length must be <= 32");
+      }
+    } else {
+      ParReduceSpecialize::template execute_array<1>(
+          m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+    }
   }
 
   template <class ViewType>
-  inline ParallelReduce(
+  ParallelReduce(
       const FunctorType& arg_functor, const Policy& arg_policy,
       const ViewType& arg_result,
       typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                   !Kokkos::is_reducer_type<ReducerType>::value,
                               void*>::type = nullptr)
-      : m_functor(arg_functor),
+      : m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_num_elems(arg_result.size()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
@@ -865,9 +1224,14 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                      FunctorTeamShmemSize<FunctorType>::value(
                          arg_functor, arg_policy.team_size())) {}
 
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
-      : m_functor(arg_functor),
+  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
+                 const ReducerType& reducer)
+      : m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_result_ptr_num_elems(reducer.view().size()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()),
@@ -889,11 +1253,11 @@ struct TeamThreadRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
   const iType end;
   const OpenMPTargetExecTeamMember& team;
 
-  inline TeamThreadRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, iType count)
+  TeamThreadRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  iType count)
       : start(0), end(count), team(thread_) {}
-  inline TeamThreadRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, iType begin_, iType end_)
+  TeamThreadRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  iType begin_, iType end_)
       : start(begin_), end(end_), team(thread_) {}
 };
 
@@ -904,12 +1268,11 @@ struct ThreadVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
   const index_type end;
   const OpenMPTargetExecTeamMember& team;
 
-  inline ThreadVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type count)
+  ThreadVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                    index_type count)
       : start(0), end(count), team(thread_) {}
-  inline ThreadVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type begin_,
-      index_type end_)
+  ThreadVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                    index_type begin_, index_type end_)
       : start(begin_), end(end_), team(thread_) {}
 };
 
@@ -920,12 +1283,11 @@ struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
   const index_type end;
   const OpenMPTargetExecTeamMember& team;
 
-  inline TeamVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type count)
+  TeamVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  index_type count)
       : start(0), end(count), team(thread_) {}
-  inline TeamVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type begin_,
-      index_type end_)
+  TeamVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  index_type begin_, index_type end_)
       : start(begin_), end(end_), team(thread_) {}
 };
 
@@ -935,5 +1297,4 @@ struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-#undef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
 #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
index 3dfad2bb85..40d8c45f5d 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
@@ -91,7 +91,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 
 #pragma omp target teams distribute map(to : functor) num_teams(end - begin)
     {
-      for (ptrdiff_t tile_idx = begin; tile_idx < end; tile_idx++) {
+      for (ptrdiff_t tile_idx = begin; tile_idx < end; ++tile_idx) {
 
 #pragma omp parallel
         {
@@ -116,31 +116,6 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 #endif
   }
 
-  template <int Rank>
-  inline typename std::enable_if<Rank == 1>::type execute_tile(
-      typename Policy::point_type offset, const FunctorType& functor,
-      const Policy& policy) const {
-#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
-    (void)offset;
-    const auto begin_0 = policy.m_lower[0];
-
-    const auto end_0 = policy.m_upper[0];
-
-#pragma omp target teams distribute parallel for map(to : functor)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      functor(i0);
-    }
-#else
-    const ptrdiff_t begin_0 = offset[0];
-    ptrdiff_t end_0         = begin_0 + policy.m_tile[0];
-    end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0];
-#pragma omp for
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      functor(i0);
-    }
-#endif
-  }
-
   template <int Rank>
   inline typename std::enable_if<Rank == 2>::type execute_tile(
       typename Policy::point_type offset, const FunctorType& functor,
@@ -154,8 +129,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_1 = policy.m_upper[1];
 
 #pragma omp target teams distribute parallel for collapse(2) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
         if constexpr (std::is_same<typename Policy::work_tag, void>::value)
           functor(i0, i1);
         else
@@ -172,8 +147,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1];
 
 #pragma omp for collapse(2)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) {
         if constexpr (std::is_same<typename Policy::work_tag, void>::value)
           functor(i0, i1);
         else
@@ -197,9 +172,9 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_2 = policy.m_upper[2];
 
 #pragma omp target teams distribute parallel for collapse(3) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
           if constexpr (std::is_same<typename Policy::work_tag, void>::value)
             functor(i0, i1, i2);
           else
@@ -221,9 +196,9 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2];
 
 #pragma omp for collapse(3)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) {
           if constexpr (std::is_same<typename Policy::work_tag, void>::value)
             functor(i0, i1, i2);
           else
@@ -249,10 +224,10 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_3 = policy.m_upper[3];
 
 #pragma omp target teams distribute parallel for collapse(4) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
-          for (auto i3 = begin_3; i3 < end_3; i3++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
+          for (auto i3 = begin_3; i3 < end_3; ++i3) {
             if constexpr (std::is_same<typename Policy::work_tag, void>::value)
               functor(i0, i1, i2, i3);
             else
@@ -279,10 +254,10 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3];
 
 #pragma omp for collapse(4)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2)
+          for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) {
             if constexpr (std::is_same<typename Policy::work_tag, void>::value)
               functor(i0, i1, i2, i3);
             else
@@ -310,11 +285,11 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_4 = policy.m_upper[4];
 
 #pragma omp target teams distribute parallel for collapse(5) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
-          for (auto i3 = begin_3; i3 < end_3; i3++) {
-            for (auto i4 = begin_4; i4 < end_4; i4++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
+          for (auto i3 = begin_3; i3 < end_3; ++i3) {
+            for (auto i4 = begin_4; i4 < end_4; ++i4) {
               if constexpr (std::is_same<typename Policy::work_tag,
                                          void>::value)
                 functor(i0, i1, i2, i3, i4);
@@ -347,11 +322,11 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4];
 
 #pragma omp for collapse(5)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2)
+          for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3)
+            for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) {
               if constexpr (std::is_same<typename Policy::work_tag,
                                          void>::value)
                 functor(i0, i1, i2, i3, i4);
@@ -382,12 +357,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_5 = policy.m_upper[5];
 
 #pragma omp target teams distribute parallel for collapse(6) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
-          for (auto i3 = begin_3; i3 < end_3; i3++) {
-            for (auto i4 = begin_4; i4 < end_4; i4++) {
-              for (auto i5 = begin_5; i5 < end_5; i5++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
+          for (auto i3 = begin_3; i3 < end_3; ++i3) {
+            for (auto i4 = begin_4; i4 < end_4; ++i4) {
+              for (auto i5 = begin_5; i5 < end_5; ++i5) {
                 {
                   if constexpr (std::is_same<typename Policy::work_tag,
                                              void>::value)
@@ -428,12 +403,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5];
 
 #pragma omp for collapse(6)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2)
+          for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3)
+            for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4)
+              for (ptrdiff_t i5 = begin_5; i5 < end_5; ++i5) {
                 if constexpr (std::is_same<typename Policy::work_tag,
                                            void>::value)
                   functor(i0, i1, i2, i3, i4, i5);
@@ -443,195 +418,6 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 #endif
   }
 
-  template <int Rank>
-  inline typename std::enable_if<Rank == 7>::type execute_tile(
-      typename Policy::point_type offset, const FunctorType& functor,
-      const Policy& policy) const {
-#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
-    (void)offset;
-    const int begin_0 = policy.m_lower[0];
-    const int begin_1 = policy.m_lower[1];
-    const int begin_2 = policy.m_lower[2];
-    const int begin_3 = policy.m_lower[3];
-    const int begin_4 = policy.m_lower[4];
-    const int begin_5 = policy.m_lower[5];
-    const int begin_6 = policy.m_lower[6];
-
-    const int end_0 = policy.m_upper[0];
-    const int end_1 = policy.m_upper[1];
-    const int end_2 = policy.m_upper[2];
-    const int end_3 = policy.m_upper[3];
-    const int end_4 = policy.m_upper[4];
-    const int end_5 = policy.m_upper[5];
-    const int end_6 = policy.m_upper[6];
-
-#pragma omp target teams distribute parallel for collapse(7) map(to : functor)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) {
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) {
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) {
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
-                  if constexpr (std::is_same<typename Policy::work_tag,
-                                             void>::value)
-                    functor(i0, i1, i2, i3, i4, i5, i6);
-                  else
-                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
-                            i6);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-#else
-    const ptrdiff_t begin_0 = offset[0];
-    ptrdiff_t end_0         = begin_0 + policy.m_tile[0];
-    end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0];
-
-    const ptrdiff_t begin_1 = offset[1];
-    ptrdiff_t end_1         = begin_1 + policy.m_tile[1];
-    end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1];
-
-    const ptrdiff_t begin_2 = offset[2];
-    ptrdiff_t end_2         = begin_2 + policy.m_tile[2];
-    end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2];
-
-    const ptrdiff_t begin_3 = offset[3];
-    ptrdiff_t end_3         = begin_3 + policy.m_tile[3];
-    end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3];
-
-    const ptrdiff_t begin_4 = offset[4];
-    ptrdiff_t end_4         = begin_4 + policy.m_tile[4];
-    end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4];
-
-    const ptrdiff_t begin_5 = offset[5];
-    ptrdiff_t end_5         = begin_5 + policy.m_tile[5];
-    end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5];
-
-    const ptrdiff_t begin_6 = offset[6];
-    ptrdiff_t end_6         = begin_6 + policy.m_tile[6];
-    end_6 = end_6 < policy.m_upper[6] ? end_6 : policy.m_upper[6];
-
-#pragma omp for collapse(7)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++)
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
-                  if constexpr (std::is_same<typename Policy::work_tag,
-                                             void>::value)
-                    functor(i0, i1, i2, i3, i4, i5, i6);
-                  else
-                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
-                            i6);
-                }
-#endif
-  }
-
-  template <int Rank>
-  inline typename std::enable_if<Rank == 8>::type execute_tile(
-      typename Policy::point_type offset, const FunctorType& functor,
-      const Policy& policy) const {
-#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
-    (void)offset;
-    const int begin_0 = policy.m_lower[0];
-    const int begin_1 = policy.m_lower[1];
-    const int begin_2 = policy.m_lower[2];
-    const int begin_3 = policy.m_lower[3];
-    const int begin_4 = policy.m_lower[4];
-    const int begin_5 = policy.m_lower[5];
-    const int begin_6 = policy.m_lower[6];
-    const int begin_7 = policy.m_lower[7];
-
-    const int end_0 = policy.m_upper[0];
-    const int end_1 = policy.m_upper[1];
-    const int end_2 = policy.m_upper[2];
-    const int end_3 = policy.m_upper[3];
-    const int end_4 = policy.m_upper[4];
-    const int end_5 = policy.m_upper[5];
-    const int end_6 = policy.m_upper[6];
-    const int end_7 = policy.m_upper[7];
-
-#pragma omp target teams distribute parallel for collapse(8) map(to : functor)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) {
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) {
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) {
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
-                  for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++) {
-                    if constexpr (std::is_same<typename Policy::work_tag,
-                                               void>::value)
-                      functor(i0, i1, i2, i3, i4, i5, i6, i7);
-                    else
-                      functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
-                              i5, i6, i7);
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-#else
-    const ptrdiff_t begin_0 = offset[0];
-    ptrdiff_t end_0         = begin_0 + policy.m_tile[0];
-    end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0];
-
-    const ptrdiff_t begin_1 = offset[1];
-    ptrdiff_t end_1         = begin_1 + policy.m_tile[1];
-    end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1];
-
-    const ptrdiff_t begin_2 = offset[2];
-    ptrdiff_t end_2         = begin_2 + policy.m_tile[2];
-    end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2];
-
-    const ptrdiff_t begin_3 = offset[3];
-    ptrdiff_t end_3         = begin_3 + policy.m_tile[3];
-    end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3];
-
-    const ptrdiff_t begin_4 = offset[4];
-    ptrdiff_t end_4         = begin_4 + policy.m_tile[4];
-    end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4];
-
-    const ptrdiff_t begin_5 = offset[5];
-    ptrdiff_t end_5         = begin_5 + policy.m_tile[5];
-    end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5];
-
-    const ptrdiff_t begin_6 = offset[6];
-    ptrdiff_t end_6         = begin_6 + policy.m_tile[6];
-    end_6 = end_6 < policy.m_upper[6] ? end_6 : policy.m_upper[6];
-
-    const ptrdiff_t begin_7 = offset[7];
-    ptrdiff_t end_7         = begin_7 + policy.m_tile[7];
-    end_7 = end_7 < policy.m_upper[7] ? end_7 : policy.m_upper[7];
-
-#pragma omp for collapse(8)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++)
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++)
-                  for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++) {
-                    if constexpr (std::is_same<typename Policy::work_tag,
-                                               void>::value)
-                      functor(i0, i1, i2, i3, i4, i5, i6, i7);
-                    else
-                      functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
-                              i5, i6, i7);
-                  }
-#endif
-  }
-
   inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
   // TODO DZP: based on a conversation with Christian, we're using 256 as a
@@ -652,112 +438,6 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 namespace Kokkos {
 namespace Impl {
 
-template <class FunctorType, class ReducerType, class PointerType,
-          class ValueType, class... PolicyArgs>
-struct ParallelReduceSpecialize<FunctorType,
-                                Kokkos::MDRangePolicy<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, 0, 0> {
-  using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
-  template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const typename PolicyType::member_type begin = p.begin();
-    const typename PolicyType::member_type end   = p.end();
-
-    ValueType result = ValueType();
-#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(+: result)
-    for (int i = begin; i < end; i++) f(i, result);
-
-    *result_ptr = result;
-  }
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const typename PolicyType::member_type begin = p.begin();
-    const typename PolicyType::member_type end   = p.end();
-
-    ValueType result = ValueType();
-#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(+: result)
-    for (int i = begin; i < end; i++) f(TagType(), i, result);
-
-    *result_ptr = result;
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
-  }
-};
-/*
-template<class FunctorType, class PolicyType, class ReducerType, class
-PointerType, class ValueType> struct ParallelReduceSpecialize<FunctorType,
-PolicyType, ReducerType, PointerType, ValueType, 0,1> {
-
-  #pragma omp declare reduction(custom: ValueType : ReducerType::join(omp_out,
-omp_in)) initializer ( ReducerType::init(omp_priv) )
-
-  template< class TagType >
-  inline static
-  typename std::enable_if< std::is_same< TagType , void >::value >::type
-  execute_impl(const FunctorType& f, const PolicyType& p, PointerType
-result_ptr)
-    {
-      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget
-parallel_for");
-      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget
-parallel_for"); const typename PolicyType::member_type begin = p.begin(); const
-typename PolicyType::member_type end = p.end();
-
-      ValueType result = ValueType();
-      #pragma omp target teams distribute parallel for num_teams(512) map(to:f)
-map(tofrom:result) reduction(custom: result) for(int i=begin; i<end; i++)
-        f(i,result);
-
-      *result_ptr=result;
-    }
-
-
-  template< class TagType >
-  inline static
-  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  execute_impl(const FunctorType& f, const PolicyType& p, PointerType
-result_ptr)
-    {
-      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget
-parallel_for");
-      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget
-parallel_for"); const typename PolicyType::member_type begin = p.begin(); const
-typename PolicyType::member_type end = p.end();
-
-      ValueType result = ValueType();
-      #pragma omp target teams distribute parallel for num_teams(512) map(to:f)
-map(tofrom: result) reduction(custom: result) for(int i=begin; i<end; i++)
-        f(TagType(),i,result);
-
-      *result_ptr=result;
-    }
-
-
-    inline static
-    void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
-      execute_impl<typename PolicyType::work_tag>(f,p,ptr);
-    }
-};
-
-
 template <class FunctorType, class ReducerType, class... Traits>
 class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                      Kokkos::Experimental::OpenMPTarget> {
@@ -765,42 +445,38 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   using Policy = Kokkos::MDRangePolicy<Traits...>;
 
   using WorkTag = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member = typename Policy::member_type;
+  using Member  = typename Policy::member_type;
 
   using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
+      std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                       FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                               WorkTag, void>::type;
-
-  // Static Assert WorkTag void if ReducerType not InvalidType
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   using ValueTraits =
       Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+
+  using pointer_type   = typename ValueTraits::pointer_type;
+  using reference_type = typename ValueTraits::reference_type;
 
   enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
   enum { UseReducer = is_reducer_type<ReducerType>::value };
 
-  using pointer_type = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-
-  using ParForSpecialize = ParallelReduceSpecialize<
-      FunctorType, Policy, ReducerType, pointer_type,
-      typename ValueTraits::value_type, HasJoin, UseReducer>;
-
+  const pointer_type m_result_ptr;
   const FunctorType m_functor;
   const Policy m_policy;
   const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
+
+  using ParReduceCommon = ParallelReduceCommon<pointer_type>;
+
+  bool m_result_ptr_on_device;
 
  public:
   inline void execute() const {
-    ParForSpecialize::execute(m_functor, m_policy, m_result_ptr);
+    execute_tile<Policy::rank, typename ValueTraits::value_type>(
+        m_functor, m_policy, m_result_ptr);
   }
 
   template <class ViewType>
@@ -810,35 +486,345 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
       typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                   !Kokkos::is_reducer_type<ReducerType>::value,
                               void*>::type = NULL)
-      : m_functor(arg_functor),
+      : m_result_ptr(arg_result_view.data()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {
-    //static_assert( std::is_same< typename ViewType::memory_space
-    //                                , Kokkos::HostSpace >::value
-    //  , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a
-    //  Kokkos::View in HostSpace" );
-  }
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible) {}
 
   inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                         const ReducerType& reducer)
-      : m_functor(arg_functor),
+      : m_result_ptr(reducer.view().data()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    //static_assert( std::is_same< typename ViewType::memory_space
-    //                                , Kokkos::HostSpace >::value
-    //  , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a
-    //  Kokkos::View in HostSpace" );
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible) {}
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 2>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(2) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            functor(i0, i1, result);
+          else
+            functor(typename Policy::work_tag(), i0, i1, result);
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            functor(i0, i1, result);
+          else
+            functor(typename Policy::work_tag(), i0, i1, result);
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
   }
-  // TODO DZP: based on a conversation with Christian, we're using 256 as a
-heuristic
-  // here. We need something better once we can query these kinds of properties
-  template<typename Policy, typename Functor>
-static int max_tile_size_product(const Policy&, const Functor&) {
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 3>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(3) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+              functor(i0, i1, i2, result);
+            else
+              functor(typename Policy::work_tag(), i0, i1, i2, result);
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+              functor(i0, i1, i2, result);
+            else
+              functor(typename Policy::work_tag(), i0, i1, i2, result);
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 4>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[3];
+    const auto begin_3 = policy.m_lower[2];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+    const auto end_3 = policy.m_upper[3];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(4) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              if constexpr (std::is_same<typename Policy::work_tag,
+                                         void>::value)
+                functor(i0, i1, i2, i3, result);
+              else
+                functor(typename Policy::work_tag(), i0, i1, i2, i3, result);
+            }
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              if constexpr (std::is_same<typename Policy::work_tag,
+                                         void>::value)
+                functor(i0, i1, i2, i3, result);
+              else
+                functor(typename Policy::work_tag(), i0, i1, i2, i3, result);
+            }
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 5>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+    const auto begin_3 = policy.m_lower[3];
+    const auto begin_4 = policy.m_lower[4];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+    const auto end_3 = policy.m_upper[3];
+    const auto end_4 = policy.m_upper[4];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(5) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                if constexpr (std::is_same<typename Policy::work_tag,
+                                           void>::value)
+                  functor(i0, i1, i2, i3, i4, result);
+                else
+                  functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
+                          result);
+              }
+            }
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                if constexpr (std::is_same<typename Policy::work_tag,
+                                           void>::value)
+                  functor(i0, i1, i2, i3, i4, result);
+                else
+                  functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
+                          result);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 6>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+    const auto begin_3 = policy.m_lower[3];
+    const auto begin_4 = policy.m_lower[4];
+    const auto begin_5 = policy.m_lower[5];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+    const auto end_3 = policy.m_upper[3];
+    const auto end_4 = policy.m_upper[4];
+    const auto end_5 = policy.m_upper[5];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(6) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                for (auto i5 = begin_5; i5 < end_5; ++i5) {
+                  if constexpr (std::is_same<typename Policy::work_tag,
+                                             void>::value)
+                    functor(i0, i1, i2, i3, i4, i5, result);
+                  else
+                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
+                            result);
+                }
+              }
+            }
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                for (auto i5 = begin_5; i5 < end_5; ++i5) {
+                  if constexpr (std::is_same<typename Policy::work_tag,
+                                             void>::value)
+                    functor(i0, i1, i2, i3, i4, i5, result);
+                  else
+                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
+                            result);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
     return 256;
   }
-};*/
+};
 
 }  // namespace Impl
 }  // namespace Kokkos
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
index be924ffa61..0e71a239ca 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
@@ -112,35 +112,11 @@ void TaskExec<Kokkos::Experimental::OpenMPTarget>::team_barrier_impl() const {
   // This team member sets one byte within the sync variable
   int8_t volatile *const sync_self = ((int8_t *)sync) + m_team_rank;
 
-#if 0
-fprintf( stdout
-       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
-       , m_group_rank
-       , m_team_rank
-       , m_sync_step
-       , m_sync_value
-       , *sync
-       );
-fflush(stdout);
-#endif
-
   *sync_self = int8_t(m_sync_value & 0x03);  // signal arrival
 
   while (m_sync_value != *sync)
     ;  // wait for team to arrive
 
-#if 0
-fprintf( stdout
-       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
-       , m_group_rank
-       , m_team_rank
-       , m_sync_step
-       , m_sync_value
-       , *sync
-       );
-fflush(stdout);
-#endif
-
   ++m_sync_step;
 
   if (0 == (0x01 & m_sync_step)) {  // Every other step
@@ -222,17 +198,6 @@ void TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget>::execute(
         task = *task_shared;
       }
 
-#if 0
-fprintf( stdout
-       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
-       , team_exec.m_group_rank
-       , team_exec.m_team_rank
-       , uintptr_t(task_shared)
-       , uintptr_t(task)
-       );
-fflush(stdout);
-#endif
-
       if (0 == task) break;  // 0 == m_ready_count
 
       if (end == task) {
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 3a09ee9195..18d33317a2 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -112,14 +112,36 @@ void SYCL::print_configuration(std::ostream& s, const bool detailed) {
 }
 
 void SYCL::fence() const {
-  Impl::SYCLInternal::fence(*m_space_instance->m_queue);
+  fence("Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence");
+}
+void SYCL::fence(const std::string& name) const {
+  Impl::SYCLInternal::fence(*m_space_instance->m_queue, name,
+                            impl_instance_id());
 }
 
 void SYCL::impl_static_fence() {
-  // guard accessing all_queues
-  std::lock_guard<std::mutex> lock(Impl::SYCLInternal::mutex);
-  for (auto& queue : Impl::SYCLInternal::all_queues)
-    Impl::SYCLInternal::fence(**queue);
+  impl_static_fence(
+      "Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence");
+}
+void SYCL::impl_static_fence(const std::string& name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::SYCL>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() {
+        // guard accessing all_queues
+        std::lock_guard<std::mutex> lock(Impl::SYCLInternal::mutex);
+        for (auto& queue : Impl::SYCLInternal::all_queues) {
+          try {
+            (*queue)->wait_and_throw();
+          } catch (sycl::exception const& e) {
+            Kokkos::Impl::throw_runtime_exception(
+                std::string("There was a synchronous SYCL error:\n") +=
+                e.what());
+          }
+        }
+      });
 }
 
 int SYCL::sycl_device() const {
@@ -224,10 +246,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os,
             << device.get_info<device::global_mem_cache_size>()
             << "\nGlobal Mem Size: "
             << device.get_info<device::global_mem_size>()
-            << "\nMax Constant Buffer Size: "
-            << device.get_info<device::max_constant_buffer_size>()
-            << "\nMax Constant Args: "
-            << device.get_info<device::max_constant_args>()
             << "\nLocal Mem Size: " << device.get_info<device::local_mem_size>()
             << "\nError Correction Support: "
             << device.get_info<device::error_correction_support>()
@@ -296,6 +314,9 @@ void SYCLSpaceInitializer::finalize(const bool all_spaces) {
 void SYCLSpaceInitializer::fence() {
   Kokkos::Experimental::SYCL::impl_static_fence();
 }
+void SYCLSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Experimental::SYCL::impl_static_fence(name);
+}
 
 void SYCLSpaceInitializer::print_configuration(std::ostream& msg,
                                                const bool detail) {
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
index aef65ee7ec..3eeab56363 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
@@ -48,181 +48,144 @@
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_SYCL.hpp>
 
+#include <vector>
+
 #ifdef KOKKOS_ENABLE_SYCL
 
 namespace Kokkos {
 namespace Impl {
 
-template <>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCL> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCL> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-                Kokkos::Experimental::SYCL> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src,
-           size_t);
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                   Kokkos::Experimental::SYCLDeviceUSMSpace,
-                   Kokkos::Experimental::SYCL>(dst, src, n);
+template <class DT, class... DP>
+struct ZeroMemset<Kokkos::Experimental::SYCL, DT, DP...> {
+  ZeroMemset(const Kokkos::Experimental::SYCL& exec_space,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    auto event = exec_space.impl_internal_space_instance()->m_queue->memset(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type));
+    exec_space.impl_internal_space_instance()->m_queue->submit_barrier(
+        std::vector<sycl::event>{event});
   }
 
-  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
-    DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-             Kokkos::Experimental::SYCLDeviceUSMSpace,
-             Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src,
-                                         n);
-    Kokkos::Experimental::SYCL().fence();
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    Experimental::Impl::SYCLInternal::singleton().m_queue->memset(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type));
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-                ExecutionSpace> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-                   Kokkos::Experimental::SYCL>(dst, src, n);
-  }
+void DeepCopySYCL(void* dst, const void* src, size_t n);
+void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst,
+                       const void* src, size_t n);
+void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n);
 
-  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
-    DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-             Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src,
-                                         n);
-    Kokkos::Experimental::SYCL().fence();
+template <class MemSpace>
+struct DeepCopy<MemSpace, HostSpace, Kokkos::Experimental::SYCL,
+                std::enable_if_t<is_sycl_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncSYCL(instance, dst, src, n);
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-                ExecutionSpace> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-                   Kokkos::Experimental::SYCL>(dst, src, n);
-  }
-
-  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
-    DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-             Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src,
-                                         n);
-    Kokkos::Experimental::SYCL().fence();
+template <class MemSpace>
+struct DeepCopy<HostSpace, MemSpace, Kokkos::Experimental::SYCL,
+                std::enable_if_t<is_sycl_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncSYCL(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLSharedUSMSpace, Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
+template <class MemSpace1, class MemSpace2>
+struct DeepCopy<MemSpace1, MemSpace2, Kokkos::Experimental::SYCL,
+                std::enable_if_t<is_sycl_type_space<MemSpace1>::value &&
+                                 is_sycl_type_space<MemSpace2>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncSYCL(instance, dst, src, n);
+  }
 };
 
-template <>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace, HostSpace,
-                Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
+template <class MemSpace1, class MemSpace2, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace1, MemSpace2, ExecutionSpace,
+    std::enable_if_t<
+        is_sycl_type_space<MemSpace1>::value &&
+        is_sycl_type_space<MemSpace2>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> {
+  inline DeepCopy(void* dst, const void* src, size_t n) {
+    DeepCopySYCL(dst, src, n);
+  }
+
+  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
+                  size_t n) {
+    exec.fence(fence_string());
+    DeepCopyAsyncSYCL(dst, src, n);
+  }
+
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace1::name() + "Space, " +
+        MemSpace2::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
-template <>
-struct DeepCopy<HostSpace, Experimental::SYCLSharedUSMSpace,
-                Kokkos::Experimental::SYCL>
-    : public DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace, HostSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_sycl_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> {
+  inline DeepCopy(void* dst, const void* src, size_t n) {
+    DeepCopySYCL(dst, src, n);
+  }
+
+  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
+                  size_t n) {
+    exec.fence(fence_string());
+    DeepCopyAsyncSYCL(dst, src, n);
+  }
+
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace::name() +
+        "Space, HostSpace, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
-template <>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLDeviceUSMSpace, Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
-};
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    HostSpace, MemSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_sycl_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> {
+  inline DeepCopy(void* dst, const void* src, size_t n) {
+    DeepCopySYCL(dst, src, n);
+  }
 
-template <>
-struct DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                Experimental::SYCLSharedUSMSpace, Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
-};
+  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
+                  size_t n) {
+    exec.fence(fence_string());
+    DeepCopyAsyncSYCL(dst, src, n);
+  }
 
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                Experimental::SYCLSharedUSMSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLDeviceUSMSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLSharedUSMSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace, HostSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                      ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                 ExecutionSpace>::DeepCopy;
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, Experimental::SYCLSharedUSMSpace, ExecutionSpace>
-    : public DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                      ExecutionSpace> {
-  using DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                 ExecutionSpace>::DeepCopy;
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<HostSpace, ") + MemSpace::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
 }  // namespace Impl
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
index 5a702b5027..31c5bc449a 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
@@ -42,14 +42,7 @@
 //@HEADER
 */
 
-#include <Kokkos_Concepts.hpp>
-#include <SYCL/Kokkos_SYCL_Instance.hpp>
-#include <KokkosCore_Config_DeclareBackend.hpp>
-#include <Kokkos_SYCL.hpp>
-#include <Kokkos_HostSpace.hpp>
-#include <Kokkos_Serial.hpp>
-#include <impl/Kokkos_ConcurrentBitset.hpp>
-#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_Core.hpp>  //kokkos_malloc
 
 namespace Kokkos {
 namespace Experimental {
@@ -95,7 +88,11 @@ void SYCLInternal::initialize(const sycl::device& d) {
       Kokkos::Impl::throw_runtime_exception(
           "There was an asynchronous SYCL error!\n");
   };
-  initialize(sycl::queue{d, exception_handler});
+  // FIXME_SYCL using an in-order queue here should not be necessary since we
+  // are using submit_barrier for managing kernel dependencies but this seems to
+  // be required as a hot fix for now.
+  initialize(
+      sycl::queue{d, exception_handler, sycl::property::queue::in_order()});
 }
 
 // FIXME_SYCL
@@ -122,7 +119,6 @@ void SYCLInternal::initialize(const sycl::queue& q) {
       all_queues.push_back(&m_queue);
     }
     const sycl::device& d = m_queue->get_device();
-    std::cout << SYCL::SYCLDevice(d) << '\n';
 
     m_maxWorkgroupSize =
         d.template get_info<sycl::info::device::max_work_group_size>();
@@ -140,19 +136,22 @@ void SYCLInternal::initialize(const sycl::queue& q) {
           Kokkos::Experimental::SYCLDeviceUSMSpace, void>;
       Record* const r =
           Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                           "Kokkos::SYCL::InternalScratchBitset",
+                           "Kokkos::Experimental::SYCL::InternalScratchBitset",
                            sizeof(uint32_t) * buffer_bound);
       Record::increment(r);
       m_scratchConcurrentBitset = reinterpret_cast<uint32_t*>(r->data());
       auto event                = m_queue->memset(m_scratchConcurrentBitset, 0,
                                    sizeof(uint32_t) * buffer_bound);
-      fence(event);
+      fence(event,
+            "Kokkos::Experimental::SYCLInternal::initialize: fence after "
+            "initializing m_scratchConcurrentBitset",
+            m_instance_id);
     }
 
     m_maxShmemPerBlock =
         d.template get_info<sycl::info::device::local_mem_size>();
-    m_indirectKernelMem.reset(*m_queue);
-    m_indirectReducerMem.reset(*m_queue);
+    m_indirectKernelMem.reset(*m_queue, m_instance_id);
+    m_indirectReducerMem.reset(*m_queue, m_instance_id);
   } else {
     std::ostringstream msg;
     msg << "Kokkos::Experimental::SYCL::initialize(...) FAILED";
@@ -162,10 +161,36 @@ void SYCLInternal::initialize(const sycl::queue& q) {
     }
     Kokkos::Impl::throw_runtime_exception(msg.str());
   }
+
+  m_team_scratch_current_size = 0;
+  m_team_scratch_ptr          = nullptr;
 }
 
+void* SYCLInternal::resize_team_scratch_space(std::int64_t bytes,
+                                              bool force_shrink) {
+  if (m_team_scratch_current_size == 0) {
+    m_team_scratch_current_size = bytes;
+    m_team_scratch_ptr =
+        Kokkos::kokkos_malloc<Experimental::SYCLDeviceUSMSpace>(
+            "Kokkos::Experimental::SYCLDeviceUSMSpace::TeamScratchMemory",
+            m_team_scratch_current_size);
+  }
+  if ((bytes > m_team_scratch_current_size) ||
+      ((bytes < m_team_scratch_current_size) && (force_shrink))) {
+    m_team_scratch_current_size = bytes;
+    m_team_scratch_ptr =
+        Kokkos::kokkos_realloc<Experimental::SYCLDeviceUSMSpace>(
+            m_team_scratch_ptr, m_team_scratch_current_size);
+  }
+  return m_team_scratch_ptr;
+}
+
+uint32_t SYCLInternal::impl_get_instance_id() const { return m_instance_id; }
+
 void SYCLInternal::finalize() {
-  SYCL().fence();
+  SYCLInternal::fence(*m_queue,
+                      "Kokkos::SYCLInternal::finalize: fence on finalization",
+                      m_instance_id);
   was_finalized = true;
 
   using RecordSYCL = Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>;
@@ -182,6 +207,12 @@ void SYCLInternal::finalize() {
   RecordSYCL::decrement(RecordSYCL::get_record(m_scratchConcurrentBitset));
   m_scratchConcurrentBitset = nullptr;
 
+  if (m_team_scratch_current_size > 0)
+    Kokkos::kokkos_free<Kokkos::Experimental::SYCLDeviceUSMSpace>(
+        m_team_scratch_ptr);
+  m_team_scratch_current_size = 0;
+  m_team_scratch_ptr          = nullptr;
+
   m_indirectKernelMem.reset();
   m_indirectReducerMem.reset();
   // guard erasing from all_queues
@@ -208,7 +239,7 @@ void* SYCLInternal::scratch_space(
 
     Record* const r =
         Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                         "Kokkos::SYCL::InternalScratchSpace",
+                         "Kokkos::Experimental::SYCL::InternalScratchSpace",
                          (sizeScratchGrain * m_scratchSpaceCount));
 
     Record::increment(r);
@@ -235,7 +266,7 @@ void* SYCLInternal::scratch_flags(
 
     Record* const r =
         Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                         "Kokkos::SYCL::InternalScratchFlags",
+                         "Kokkos::Experimental::SYCL::InternalScratchFlags",
                          (sizeScratchGrain * m_scratchFlagsCount));
 
     Record::increment(r);
@@ -243,14 +274,38 @@ void* SYCLInternal::scratch_flags(
     m_scratchFlags = reinterpret_cast<size_type*>(r->data());
   }
   m_queue->memset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain);
-  fence(*m_queue);
+  fence(*m_queue,
+        "Kokkos::Experimental::SYCLInternal::scratch_flags fence after "
+        "initializing m_scratchFlags",
+        m_instance_id);
 
   return m_scratchFlags;
 }
 
+template <typename WAT>
+void SYCLInternal::fence_helper(WAT& wat, const std::string& name,
+                                uint32_t instance_id) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::SYCL>(
+      name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{instance_id},
+      [&]() {
+        try {
+          wat.wait_and_throw();
+        } catch (sycl::exception const& e) {
+          Kokkos::Impl::throw_runtime_exception(
+              std::string("There was a synchronous SYCL error:\n") += e.what());
+        }
+      });
+}
+template void SYCLInternal::fence_helper<sycl::queue>(sycl::queue&,
+                                                      const std::string&,
+                                                      uint32_t);
+template void SYCLInternal::fence_helper<sycl::event>(sycl::event&,
+                                                      const std::string&,
+                                                      uint32_t);
+
 template <sycl::usm::alloc Kind>
 size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) {
-  assert(m_size == 0);
   assert(m_q);
 
   if (m_capacity < n) {
@@ -258,8 +313,8 @@ size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) {
     // First free what we have (in case malloc can reuse it)
     if (m_data) Record::decrement(Record::get_record(m_data));
 
-    Record* const r = Record::allocate(AllocationSpace(*m_q),
-                                       "Kokkos::SYCL::USMObjectMem", n);
+    Record* const r = Record::allocate(
+        AllocationSpace(*m_q), "Kokkos::Experimental::SYCL::USMObjectMem", n);
     Record::increment(r);
 
     m_data     = r->data();
@@ -271,9 +326,9 @@ size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) {
 
 template <sycl::usm::alloc Kind>
 void SYCLInternal::USMObjectMem<Kind>::reset() {
-  assert(m_size == 0);
-
   if (m_data) {
+    // This implies a fence since this class is not copyable
+    // and deallocating implies a fence across all registered queues.
     using Record = Kokkos::Impl::SharedAllocationRecord<AllocationSpace, void>;
     Record::decrement(Record::get_record(m_data));
 
@@ -285,6 +340,7 @@ void SYCLInternal::USMObjectMem<Kind>::reset() {
 
 template class SYCLInternal::USMObjectMem<sycl::usm::alloc::shared>;
 template class SYCLInternal::USMObjectMem<sycl::usm::alloc::device>;
+template class SYCLInternal::USMObjectMem<sycl::usm::alloc::host>;
 
 }  // namespace Impl
 }  // namespace Experimental
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
index e797411cd4..bf4d6c5b45 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
@@ -49,7 +49,7 @@
 #include <CL/sycl.hpp>
 
 #include <impl/Kokkos_Error.hpp>
-
+#include <impl/Kokkos_Profiling.hpp>
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
@@ -68,7 +68,10 @@ class SYCLInternal {
 
   void* scratch_space(const size_type size);
   void* scratch_flags(const size_type size);
+  void* resize_team_scratch_space(std::int64_t bytes,
+                                  bool force_shrink = false);
 
+  uint32_t impl_get_instance_id() const;
   int m_syclDev = -1;
 
   size_t m_maxWorkgroupSize   = 0;
@@ -81,6 +84,11 @@ class SYCLInternal {
   size_type m_scratchFlagsCount       = 0;
   size_type* m_scratchFlags           = nullptr;
 
+  int64_t m_team_scratch_current_size = 0;
+  void* m_team_scratch_ptr            = nullptr;
+
+  uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
+      Kokkos::Experimental::SYCL>(reinterpret_cast<uintptr_t>(this));
   std::optional<sycl::queue> m_queue;
 
   // Using std::vector<std::optional<sycl::queue>> reveals a compiler bug when
@@ -94,40 +102,16 @@ class SYCLInternal {
   template <sycl::usm::alloc Kind>
   class USMObjectMem {
    public:
-    class Deleter {
-     public:
-      Deleter() = default;
-      explicit Deleter(USMObjectMem* mem) : m_mem(mem) {}
-
-      template <typename T>
-      void operator()(T* p) const noexcept {
-        assert(m_mem);
-        assert(sizeof(T) == m_mem->size());
-
-        if constexpr (sycl::usm::alloc::device == kind)
-          // Only skipping the dtor on trivially copyable types
-          static_assert(std::is_trivially_copyable_v<T>);
-        else
-          p->~T();
-
-        m_mem->m_size = 0;
-      }
-
-     private:
-      USMObjectMem* m_mem = nullptr;
-    };
-
-    static constexpr sycl::usm::alloc kind = Kind;
-
     void reset();
 
-    void reset(sycl::queue q) {
+    void reset(sycl::queue q, uint32_t instance_id) {
+      m_instance_id = instance_id;
       reset();
       m_q.emplace(std::move(q));
     }
-
     USMObjectMem() = default;
-    explicit USMObjectMem(sycl::queue q) noexcept : m_q(std::move(q)) {}
+    explicit USMObjectMem(sycl::queue q, uint32_t instance_id) noexcept
+        : m_q(std::move(q)), m_instance_id(instance_id) {}
 
     USMObjectMem(USMObjectMem const&) = delete;
     USMObjectMem(USMObjectMem&&)      = delete;
@@ -139,7 +123,6 @@ class SYCLInternal {
     void* data() noexcept { return m_data; }
     const void* data() const noexcept { return m_data; }
 
-    size_t size() const noexcept { return m_size; }
     size_t capacity() const noexcept { return m_capacity; }
 
     // reserve() allocates space for at least n bytes
@@ -147,120 +130,68 @@ class SYCLInternal {
     size_t reserve(size_t n);
 
    private:
-    using AllocationSpace =
-        std::conditional_t<Kind == sycl::usm::alloc::device,
-                           Kokkos::Experimental::SYCLDeviceUSMSpace,
-                           Kokkos::Experimental::SYCLSharedUSMSpace>;
-
-    // This will memcpy an object T into memory held by this object
-    // returns: a T* to that object
-    //
-    // Note:  it is UB to dereference this pointer with an object that is
-    // not an implicit-lifetime nor trivially-copyable type, but presumably much
-    // faster because we can use USM device memory
-    template <typename T>
-    std::unique_ptr<T, Deleter> memcpy_from(const T& t) {
-      reserve(sizeof(T));
-      sycl::event memcopied = m_q->memcpy(m_data, std::addressof(t), sizeof(T));
-      fence(memcopied);
-
-      std::unique_ptr<T, Deleter> ptr(reinterpret_cast<T*>(m_data),
-                                      Deleter(this));
-      m_size = sizeof(T);
-      return ptr;
-    }
-
-    // This will copy-constuct an object T into memory held by this object
-    // returns: a unique_ptr<T, destruct_delete> that will call the
-    // destructor on the type when it goes out of scope.
-    //
-    // Note:  This will not work with USM device memory
-    template <typename T>
-    std::unique_ptr<T, Deleter> copy_construct_from(const T& t) {
-      static_assert(kind != sycl::usm::alloc::device,
-                    "Cannot copy construct into USM device memory");
-
-      reserve(sizeof(T));
-
-      std::unique_ptr<T, Deleter> ptr(new (m_data) T(t), Deleter(this));
-      m_size = sizeof(T);
-      return ptr;
-    }
+    using AllocationSpace = std::conditional_t<
+        Kind == sycl::usm::alloc::device,
+        Kokkos::Experimental::SYCLDeviceUSMSpace,
+        std::conditional_t<Kind == sycl::usm::alloc::shared,
+                           Kokkos::Experimental::SYCLSharedUSMSpace,
+                           Kokkos::Experimental::SYCLHostUSMSpace>>;
 
    public:
-    // Performs either memcpy (for USM device memory) and returns a T*
-    // (but is technically UB when dereferenced on an object that is not
-    // an implicit-lifetime nor trivially-copyable type
-    //
-    // or
-    //
-    // performs copy construction (for other USM memory types) and returns a
-    // unique_ptr<T, ...>
+    // Performs either sycl::memcpy (for USM device memory) or std::memcpy
+    // (otherwise) and returns a reference to the copied object.
     template <typename T>
-    std::unique_ptr<T, Deleter> copy_from(const T& t) {
-      if constexpr (sycl::usm::alloc::device == kind)
-        return memcpy_from(t);
-      else
-        return copy_construct_from(t);
+    T& copy_from(const T& t) {
+      fence();
+      reserve(sizeof(T));
+      if constexpr (sycl::usm::alloc::device == Kind) {
+        sycl::event memcopied =
+            m_q->memcpy(m_data, std::addressof(t), sizeof(T));
+        SYCLInternal::fence(
+            memcopied,
+            "Kokkos::Experimental::SYCLInternal::USMObject fence after copy",
+            m_instance_id);
+      } else
+        std::memcpy(m_data, std::addressof(t), sizeof(T));
+      return *reinterpret_cast<T*>(m_data);
     }
 
-   private:
-    // Returns a reference to t (helpful when debugging)
-    template <typename T>
-    T& memcpy_to(T& t) {
-      assert(sizeof(T) == m_size);
-
-      sycl::event memcopied = m_q->memcpy(std::addressof(t), m_data, sizeof(T));
-      fence(memcopied);
-
-      return t;
+    void fence() {
+      SYCLInternal::fence(
+          m_last_event,
+          "Kokkos::Experimental::SYCLInternal::USMObject fence to wait for "
+          "last event to finish",
+          m_instance_id);
     }
 
-    // Returns a reference to t (helpful when debugging)
-    template <typename T>
-    T& move_assign_to(T& t) {
-      static_assert(kind != sycl::usm::alloc::device,
-                    "Cannot move_assign_to from USM device memory");
-
-      assert(sizeof(T) == m_size);
-
-      t = std::move(*static_cast<T*>(m_data));
-
-      return t;
-    }
-
-   public:
-    // Returns a reference to t (helpful when debugging)
-    template <typename T>
-    T& transfer_to(T& t) {
-      if constexpr (sycl::usm::alloc::device == kind)
-        return memcpy_to(t);
-      else
-        return move_assign_to(t);
+    void register_event(sycl::event event) {
+      assert(m_last_event
+                 .get_info<sycl::info::event::command_execution_status>() ==
+             sycl::info::event_command_status::complete);
+      m_last_event = event;
     }
 
    private:
     // USMObjectMem class invariants
     // All four expressions below must evaluate to true:
     //
-    //  !m_data == !m_capacity
-    //  m_q || !m_data
-    //  m_data || !m_size
-    //  m_size <= m_capacity
+    //  !m_data == (m_capacity == 0)
+    //      m_q || !m_data
     //
     //  The above invariants mean that:
-    //  if m_size != 0 then m_data != 0
-    //  if m_data != 0 then m_capacity != 0 && m_q != nullopt
-    //  if m_data == 0 then m_capacity == 0
+    //  if m_data != nullptr then m_capacity != 0 && m_q != nullopt
+    //  if m_data == nullptr then m_capacity == 0
 
     std::optional<sycl::queue> m_q;
     void* m_data      = nullptr;
-    size_t m_size     = 0;  // sizeof(T) iff m_data points to live T
     size_t m_capacity = 0;
+    sycl::event m_last_event;
+
+    uint32_t m_instance_id;
   };
 
   // An indirect kernel is one where the functor to be executed is explicitly
-  // copied to USM device memory before being executed, to get around the
+  // copied to USM memory before being executed, to get around the
   // trivially copyable limitation of SYCL.
   using IndirectKernelMem = USMObjectMem<sycl::usm::alloc::shared>;
   IndirectKernelMem m_indirectKernelMem;
@@ -286,18 +217,18 @@ class SYCLInternal {
   // fence(...) takes any type with a .wait_and_throw() method
   // (sycl::event and sycl::queue)
   template <typename WAT>
-  static void fence_helper(WAT& wat) {
-    try {
-      wat.wait_and_throw();
-    } catch (sycl::exception const& e) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("There was a synchronous SYCL error:\n") += e.what());
-    }
-  }
+  static void fence_helper(WAT& wat, const std::string& name,
+                           uint32_t instance_id);
 
  public:
-  static void fence(sycl::queue& q) { fence_helper(q); }
-  static void fence(sycl::event& e) { fence_helper(e); }
+  static void fence(sycl::queue& q, const std::string& name,
+                    uint32_t instance_id) {
+    fence_helper(q, name, instance_id);
+  }
+  static void fence(sycl::event& e, const std::string& name,
+                    uint32_t instance_id) {
+    fence_helper(e, name, instance_id);
+  }
 };
 
 template <typename Functor, typename Storage,
@@ -312,20 +243,24 @@ class SYCLFunctionWrapper<Functor, Storage, true> {
   SYCLFunctionWrapper(const Functor& functor, Storage&) : m_functor(functor) {}
 
   const Functor& get_functor() const { return m_functor; }
+
+  static void register_event(Storage&, sycl::event){};
 };
 
 template <typename Functor, typename Storage>
 class SYCLFunctionWrapper<Functor, Storage, false> {
-  std::unique_ptr<Functor,
-                  Experimental::Impl::SYCLInternal::IndirectKernelMem::Deleter>
-      m_kernelFunctorPtr;
+  const Functor& m_kernelFunctor;
 
  public:
   SYCLFunctionWrapper(const Functor& functor, Storage& storage)
-      : m_kernelFunctorPtr(storage.copy_from(functor)) {}
+      : m_kernelFunctor(storage.copy_from(functor)) {}
 
   std::reference_wrapper<const Functor> get_functor() const {
-    return {*m_kernelFunctorPtr};
+    return {m_kernelFunctor};
+  }
+
+  static void register_event(Storage& storage, sycl::event event) {
+    storage.register_event(event);
   }
 };
 
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
index a286169c45..dca73683c3 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
@@ -47,11 +47,13 @@
 
 #include <impl/KokkosExp_IterateTileGPU.hpp>
 
-template <class FunctorType, class ExecPolicy>
-class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
+#include <vector>
+
+template <class FunctorType, class... Traits>
+class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
                                 Kokkos::Experimental::SYCL> {
  public:
-  using Policy = ExecPolicy;
+  using Policy = Kokkos::RangePolicy<Traits...>;
 
  private:
   using Member       = typename Policy::member_type;
@@ -62,16 +64,15 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
   const Policy m_policy;
 
   template <typename Functor>
-  static void sycl_direct_launch(const Policy& policy, const Functor& functor) {
+  static sycl::event sycl_direct_launch(const Policy& policy,
+                                        const Functor& functor) {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    space.fence();
-
-    q.submit([functor, policy](sycl::handler& cgh) {
+    auto parallel_for_event = q.submit([functor, policy](sycl::handler& cgh) {
       sycl::range<1> range(policy.end() - policy.begin());
       const auto begin = policy.begin();
 
@@ -83,8 +84,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
           functor(WorkTag(), id);
       });
     });
+    q.submit_barrier(std::vector<sycl::event>{parallel_for_event});
 
-    space.fence();
+    return parallel_for_event;
   }
 
  public:
@@ -100,7 +102,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
 
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    sycl::event event =
+        sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
   }
 
   ParallelFor(const ParallelFor&) = delete;
@@ -201,41 +205,48 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   template <typename Functor>
-  void sycl_direct_launch(const Functor& functor) const {
+  sycl::event sycl_direct_launch(const Functor& functor) const {
     // Convenience references
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *m_space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    m_space.fence();
-
-    if (m_policy.m_num_tiles == 0) return;
+    if (m_policy.m_num_tiles == 0) return {};
 
     const BarePolicy bare_policy(m_policy);
 
-    q.submit([functor, this, bare_policy](sycl::handler& cgh) {
-      const auto range = compute_ranges();
+    auto parallel_for_event =
+        q.submit([functor, this, bare_policy](sycl::handler& cgh) {
+          const auto range                  = compute_ranges();
+          const sycl::range<3> global_range = range.get_global_range();
+          const sycl::range<3> local_range  = range.get_local_range();
+          const sycl::nd_range sycl_swapped_range{
+              sycl::range<3>{global_range[2], global_range[1], global_range[0]},
+              sycl::range<3>{local_range[2], local_range[1], local_range[0]}};
 
-      cgh.parallel_for(range, [functor, bare_policy](sycl::nd_item<3> item) {
-        const index_type local_x    = item.get_local_id(0);
-        const index_type local_y    = item.get_local_id(1);
-        const index_type local_z    = item.get_local_id(2);
-        const index_type global_x   = item.get_group(0);
-        const index_type global_y   = item.get_group(1);
-        const index_type global_z   = item.get_group(2);
-        const index_type n_global_x = item.get_group_range(0);
-        const index_type n_global_y = item.get_group_range(1);
-        const index_type n_global_z = item.get_group_range(2);
+          cgh.parallel_for(sycl_swapped_range, [functor, bare_policy](
+                                                   sycl::nd_item<3> item) {
+            // swap back for correct index calculations in DeviceIterateTile
+            const index_type local_x    = item.get_local_id(2);
+            const index_type local_y    = item.get_local_id(1);
+            const index_type local_z    = item.get_local_id(0);
+            const index_type global_x   = item.get_group(2);
+            const index_type global_y   = item.get_group(1);
+            const index_type global_z   = item.get_group(0);
+            const index_type n_global_x = item.get_group_range(2);
+            const index_type n_global_y = item.get_group_range(1);
+            const index_type n_global_z = item.get_group_range(0);
 
-        Kokkos::Impl::DeviceIterateTile<Policy::rank, BarePolicy, Functor,
-                                        typename Policy::work_tag>(
-            bare_policy, functor, {n_global_x, n_global_y, n_global_z},
-            {global_x, global_y, global_z}, {local_x, local_y, local_z})
-            .exec_range();
-      });
-    });
+            Kokkos::Impl::DeviceIterateTile<Policy::rank, BarePolicy, Functor,
+                                            typename Policy::work_tag>(
+                bare_policy, functor, {n_global_x, n_global_y, n_global_z},
+                {global_x, global_y, global_z}, {local_x, local_y, local_z})
+                .exec_range();
+          });
+        });
+    q.submit_barrier(std::vector<sycl::event>{parallel_for_event});
 
-    m_space.fence();
+    return parallel_for_event;
   }
 
  public:
@@ -253,7 +264,8 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
-    sycl_direct_launch(functor_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
   }
 
   ParallelFor(const ParallelFor&) = delete;
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
index 03b7753f8e..75237b4c72 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
@@ -46,14 +46,99 @@
 #define KOKKOS_SYCL_PARALLEL_REDUCE_HPP
 
 #include <Kokkos_Macros.hpp>
+
+#include <vector>
 #if defined(KOKKOS_ENABLE_SYCL)
+#include <Kokkos_Parallel_Reduce.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
+namespace SYCLReduction {
+template <class ValueJoin, class ValueOps, typename WorkTag, typename ValueType,
+          typename ReducerType, typename FunctorType, int dim>
+void workgroup_reduction(sycl::nd_item<dim>& item,
+                         sycl::local_ptr<ValueType> local_mem,
+                         ValueType* results_ptr,
+                         ValueType* device_accessible_result_ptr,
+                         const unsigned int value_count,
+                         const ReducerType& selected_reducer,
+                         const FunctorType& functor, bool final) {
+  const auto local_id = item.get_local_linear_id();
+  // FIXME_SYCL should be item.get_group().get_local_linear_range();
+  size_t wgroup_size = 1;
+  for (unsigned int i = 0; i < dim; ++i) wgroup_size *= item.get_local_range(i);
+
+  // Perform the actual workgroup reduction in each subgroup
+  // separately.
+  auto sg                = item.get_sub_group();
+  auto* result           = &local_mem[local_id * value_count];
+  const auto id_in_sg    = sg.get_local_id()[0];
+  const auto local_range = std::min(sg.get_local_range()[0], wgroup_size);
+  for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
+    if (id_in_sg + stride < local_range)
+      ValueJoin::join(selected_reducer, result,
+                      &local_mem[(local_id + stride) * value_count]);
+    sg.barrier();
+  }
+  item.barrier(sycl::access::fence_space::local_space);
+
+  // Copy the subgroup results into the first positions of the
+  // reduction array.
+  if (id_in_sg == 0)
+    ValueOps::copy(functor, &local_mem[sg.get_group_id()[0] * value_count],
+                   result);
+  item.barrier(sycl::access::fence_space::local_space);
+
+  // Do the final reduction only using the first subgroup.
+  if (sg.get_group_id()[0] == 0) {
+    const auto n_subgroups = sg.get_group_range()[0];
+    auto* result_          = &local_mem[id_in_sg * value_count];
+    // In case the number of subgroups is larger than the range of
+    // the first subgroup, we first combine the items with a higher
+    // index.
+    for (unsigned int offset = local_range; offset < n_subgroups;
+         offset += local_range)
+      if (id_in_sg + offset < n_subgroups)
+        ValueJoin::join(selected_reducer, result_,
+                        &local_mem[(id_in_sg + offset) * value_count]);
+    sg.barrier();
+
+    // Then, we proceed as before.
+    for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
+      if (id_in_sg + stride < n_subgroups)
+        ValueJoin::join(selected_reducer, result_,
+                        &local_mem[(id_in_sg + stride) * value_count]);
+      sg.barrier();
+    }
+
+    // Finally, we copy the workgroup results back to global memory
+    // to be used in the next iteration. If this is the last
+    // iteration, i.e., there is only one workgroup also call
+    // final() if necessary.
+    if (id_in_sg == 0) {
+      if (final) {
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, WorkTag>::final(functor, &local_mem[0]);
+        if (device_accessible_result_ptr != nullptr)
+          ValueOps::copy(functor, &device_accessible_result_ptr[0],
+                         &local_mem[0]);
+        else
+          ValueOps::copy(functor, &results_ptr[0], &local_mem[0]);
+      } else
+        ValueOps::copy(functor,
+                       &results_ptr[(item.get_group_linear_id()) * value_count],
+                       &local_mem[0]);
+    }
+  }
+}
+
+}  // namespace SYCLReduction
+
 template <class FunctorType, class ReducerType, class... Traits>
 class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                      Kokkos::Experimental::SYCL> {
@@ -76,19 +161,29 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   ParallelReduce(
       const FunctorType& f, const Policy& p, const V& v,
       typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr)
-      : m_functor(f), m_policy(p), m_result_ptr(v.data()) {}
+      : m_functor(f),
+        m_policy(p),
+        m_result_ptr(v.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename V::memory_space>::accessible) {}
 
   ParallelReduce(const FunctorType& f, const Policy& p,
                  const ReducerType& reducer)
       : m_functor(f),
         m_policy(p),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {}
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible) {}
 
  private:
   template <typename PolicyType, typename Functor, typename Reducer>
-  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
-                          const Reducer& reducer) const {
+  sycl::event sycl_direct_launch(const PolicyType& policy,
+                                 const Functor& functor,
+                                 const Reducer& reducer) const {
     using ReducerConditional =
         Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                            FunctorType, ReducerType>;
@@ -121,18 +216,18 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     const unsigned int value_count =
         FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
             selected_reducer);
-    // FIXME_SYCL only use the first half
     const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
-        sizeof(value_type) * std::max(value_count, 1u) * init_size * 2));
-    // FIXME_SYCL without this we are running into a race condition
-    const auto results_ptr2 =
-        results_ptr + std::max(value_count, 1u) * init_size;
+        sizeof(value_type) * std::max(value_count, 1u) * init_size));
+    value_type* device_accessible_result_ptr =
+        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+    sycl::event last_reduction_event;
 
     // If size<=1 we only call init(), the functor and possibly final once
     // working with the global scratch memory but don't copy back to
     // m_result_ptr yet.
     if (size <= 1) {
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         const auto begin = policy.begin();
         cgh.single_task([=]() {
           const auto& selected_reducer = ReducerConditional::select(
@@ -149,9 +244,13 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
           if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
             FunctorFinal<FunctorType, WorkTag>::final(
                 static_cast<const FunctorType&>(functor), results_ptr);
+          if (device_accessible_result_ptr != nullptr)
+            ValueOps::copy(functor, &device_accessible_result_ptr[0],
+                           &results_ptr[0]);
         });
       });
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
     }
 
     // Otherwise, we perform a reduction on the values in all workgroups
@@ -163,7 +262,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       auto n_wgroups = ((size + values_per_thread - 1) / values_per_thread +
                         wgroup_size - 1) /
                        wgroup_size;
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
@@ -217,49 +316,15 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
               }
               item.barrier(sycl::access::fence_space::local_space);
 
-              // Perform the actual workgroup reduction. To achieve a better
-              // memory access pattern, we use sequential addressing and a
-              // reversed loop. If the workgroup size is 8, the first element
-              // contains all the values with index%4==0, after the second one
-              // the values with index%2==0 and after the third one index%1==0,
-              // i.e., all values.
-              for (unsigned int stride = wgroup_size / 2; stride > 0;
-                   stride >>= 1) {
-                const auto idx = local_id;
-                if (idx < stride) {
-                  ValueJoin::join(selected_reducer,
-                                  &local_mem[idx * value_count],
-                                  &local_mem[(idx + stride) * value_count]);
-                }
-                item.barrier(sycl::access::fence_space::local_space);
-              }
-
-              // Finally, we copy the workgroup results back to global memory to
-              // be used in the next iteration. If this is the last iteration,
-              // i.e., there is only one workgroup also call final() if
-              // necessary.
-              if (local_id == 0) {
-                ValueOps::copy(
-                    functor,
-                    &results_ptr2[(item.get_group_linear_id()) * value_count],
-                    &local_mem[0]);
-                if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-                  if (n_wgroups <= 1)
-                    FunctorFinal<FunctorType, WorkTag>::final(
-                        static_cast<const FunctorType&>(functor),
-                        &results_ptr2[(item.get_group_linear_id()) *
-                                      value_count]);
-              }
+              SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+                  item, local_mem.get_pointer(), results_ptr,
+                  device_accessible_result_ptr, value_count, selected_reducer,
+                  static_cast<const FunctorType&>(functor), n_wgroups <= 1);
             });
       });
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
 
-      // FIXME_SYCL this is likely not necessary, see above
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          space, results_ptr, results_ptr2,
-          sizeof(*m_result_ptr) * value_count * n_wgroups);
-      space.fence();
+      last_reduction_event = parallel_reduce_event;
 
       first_run = false;
       size      = n_wgroups;
@@ -268,13 +333,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     // At this point, the reduced value is written to the entry in results_ptr
     // and all that is left is to copy it back to the given result pointer if
     // necessary.
-    if (m_result_ptr) {
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      space.fence();
+      space.fence(
+          "Kokkos::Impl::ParallelReduce::sycl_direct_launch: fence due to "
+          "inaccessible reducer result location");
     }
+
+    return last_reduction_event;
   }
 
  public:
@@ -291,15 +360,18 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_reducer, indirectReducerMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
-                       reducer_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(
+        m_policy, functor_wrapper.get_functor(), reducer_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
+    reducer_wrapper.register_event(indirectReducerMem, event);
   }
 
  private:
-  FunctorType m_functor;
-  Policy m_policy;
-  ReducerType m_reducer;
-  pointer_type m_result_ptr;
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
 };
 
 template <class FunctorType, class ReducerType, class... Traits>
@@ -347,7 +419,13 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   ParallelReduce(
       const FunctorType& f, const Policy& p, const V& v,
       typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr)
-      : m_functor(f), m_policy(p), m_space(p.space()), m_result_ptr(v.data()) {}
+      : m_functor(f),
+        m_policy(p),
+        m_space(p.space()),
+        m_result_ptr(v.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename V::memory_space>::accessible) {}
 
   ParallelReduce(const FunctorType& f, const Policy& p,
                  const ReducerType& reducer)
@@ -355,12 +433,17 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         m_policy(p),
         m_space(p.space()),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {}
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible) {}
 
  private:
   template <typename PolicyType, typename Functor, typename Reducer>
-  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
-                          const Reducer& reducer) const {
+  sycl::event sycl_direct_launch(const PolicyType& policy,
+                                 const Functor& functor,
+                                 const Reducer& reducer) const {
     using ReducerConditional =
         Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                            FunctorType, ReducerType>;
@@ -379,8 +462,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         *m_space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    const int nwork = m_policy.m_num_tiles;
-    const int block_size =
+    const typename Policy::index_type nwork = m_policy.m_num_tiles;
+    const typename Policy::index_type block_size =
         std::pow(2, std::ceil(std::log2(m_policy.m_prod_tile_dims)));
 
     const sycl::range<1> local_range(block_size);
@@ -402,12 +485,16 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // FIXME_SYCL without this we are running into a race condition
     const auto results_ptr2 =
         results_ptr + std::max(value_count, 1u) * init_size;
+    value_type* device_accessible_result_ptr =
+        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+    sycl::event last_reduction_event;
 
     // If size<=1 we only call init(), the functor and possibly final once
     // working with the global scratch memory but don't copy back to
     // m_result_ptr yet.
     if (size <= 1) {
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         cgh.single_task([=]() {
           const auto& selected_reducer = ReducerConditional::select(
               static_cast<const FunctorType&>(functor),
@@ -424,9 +511,13 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
             FunctorFinal<FunctorType, WorkTag>::final(
                 static_cast<const FunctorType&>(functor), results_ptr);
+          if (device_accessible_result_ptr)
+            ValueOps::copy(functor, &device_accessible_result_ptr[0],
+                           &results_ptr[0]);
         });
       });
-      m_space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
     }
 
     // Otherwise, we perform a reduction on the values in all workgroups
@@ -435,8 +526,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // value.
     bool first_run = true;
     while (size > 1) {
-      auto n_wgroups = (size + wgroup_size - 1) / wgroup_size;
-      q.submit([&](sycl::handler& cgh) {
+      auto n_wgroups             = (size + wgroup_size - 1) / wgroup_size;
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
@@ -498,47 +589,21 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           }
           item.barrier(sycl::access::fence_space::local_space);
 
-          // Perform the actual workgroup reduction. To achieve a better
-          // memory access pattern, we use sequential addressing and a
-          // reversed loop. If the workgroup size is 8, the first element
-          // contains all the values with index%4==0, after the second one
-          // the values with index%2==0 and after the third one index%1==0,
-          // i.e., all values.
-          for (unsigned int stride = wgroup_size / 2; stride > 0;
-               stride >>= 1) {
-            const auto idx = local_id;
-            if (idx < stride) {
-              ValueJoin::join(selected_reducer, &local_mem[idx * value_count],
-                              &local_mem[(idx + stride) * value_count]);
-            }
-            item.barrier(sycl::access::fence_space::local_space);
-          }
-
-          // Finally, we copy the workgroup results back to global memory to
-          // be used in the next iteration. If this is the last iteration,
-          // i.e., there is only one workgroup also call final() if
-          // necessary.
-          if (local_id == 0) {
-            ValueOps::copy(
-                functor,
-                &results_ptr2[(item.get_group_linear_id()) * value_count],
-                &local_mem[0]);
-            if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-              if (n_wgroups <= 1)
-                FunctorFinal<FunctorType, WorkTag>::final(
-                    static_cast<const FunctorType&>(functor),
-                    &results_ptr2[(item.get_group_linear_id()) * value_count]);
-          }
+          SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+              item, local_mem.get_pointer(), results_ptr2,
+              device_accessible_result_ptr, value_count, selected_reducer,
+              static_cast<const FunctorType&>(functor),
+              n_wgroups <= 1 && item.get_group_linear_id() == 0);
         });
       });
-      m_space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
 
       // FIXME_SYCL this is likely not necessary, see above
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          m_space, results_ptr, results_ptr2,
-          sizeof(*m_result_ptr) * value_count * n_wgroups);
-      m_space.fence();
+      auto deep_copy_event =
+          q.memcpy(results_ptr, results_ptr2,
+                   sizeof(*m_result_ptr) * value_count * n_wgroups);
+      q.submit_barrier(std::vector<sycl::event>{deep_copy_event});
+      last_reduction_event = deep_copy_event;
 
       first_run = false;
       size      = n_wgroups;
@@ -547,19 +612,23 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // At this point, the reduced value is written to the entry in results_ptr
     // and all that is left is to copy it back to the given result pointer if
     // necessary.
-    if (m_result_ptr) {
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           m_space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      m_space.fence();
+      m_space.fence(
+          "Kokkos::Impl::ParallelReduce::sycl_direct_launch: fence after deep "
+          "copying results back");
     }
+
+    return last_reduction_event;
   }
 
  public:
   template <typename Policy, typename Functor>
   static int max_tile_size_product(const Policy& policy, const Functor&) {
-    return policy.space().impl_internal_space_instance()->m_maxThreadsPerSM;
+    return policy.space().impl_internal_space_instance()->m_maxWorkgroupSize;
   }
 
   void execute() const {
@@ -575,16 +644,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_reducer, indirectReducerMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
-                       reducer_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(
+        m_policy, functor_wrapper.get_functor(), reducer_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
+    reducer_wrapper.register_event(indirectReducerMem, event);
   }
 
  private:
-  FunctorType m_functor;
-  BarePolicy m_policy;
+  const FunctorType m_functor;
+  const BarePolicy m_policy;
   const Kokkos::Experimental::SYCL& m_space;
-  ReducerType m_reducer;
-  pointer_type m_result_ptr;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
 };
 
 }  // namespace Impl
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
index 5eac6bf9da..d5611c2159 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
@@ -47,6 +47,7 @@
 
 #include <Kokkos_Macros.hpp>
 #include <memory>
+#include <vector>
 #if defined(KOKKOS_ENABLE_SYCL)
 
 namespace Kokkos {
@@ -86,96 +87,99 @@ class ParallelScanSYCLBase {
   void scan_internal(sycl::queue& q, const Functor& functor,
                      pointer_type global_mem, std::size_t size) const {
     // FIXME_SYCL optimize
-    constexpr size_t wgroup_size = 32;
+    constexpr size_t wgroup_size = 128;
     auto n_wgroups               = (size + wgroup_size - 1) / wgroup_size;
+    pointer_type group_results   = global_mem + n_wgroups * wgroup_size;
 
-    // FIXME_SYCL The allocation should be handled by the execution space
-    auto deleter = [&q](value_type* ptr) { sycl::free(ptr, q); };
-    std::unique_ptr<value_type[], decltype(deleter)> group_results_memory(
-        static_cast<pointer_type>(sycl::malloc(sizeof(value_type) * n_wgroups,
-                                               q, sycl::usm::alloc::shared)),
-        deleter);
-    auto group_results = group_results_memory.get();
-
-    q.submit([&](sycl::handler& cgh) {
+    auto local_scans = q.submit([&](sycl::handler& cgh) {
       sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                      sycl::access::target::local>
           local_mem(sycl::range<1>(wgroup_size), cgh);
 
-      // FIXME_SYCL we get wrong results without this, not sure why
-      sycl::stream out(1, 1, cgh);
       cgh.parallel_for(
           sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
           [=](sycl::nd_item<1> item) {
-            const auto local_id  = item.get_local_linear_id();
-            const auto global_id = item.get_global_linear_id();
+            const auto local_id      = item.get_local_linear_id();
+            const auto global_id     = item.get_global_linear_id();
+            const auto global_offset = global_id - local_id;
 
             // Initialize local memory
             if (global_id < size)
-              ValueOps::copy(functor, &local_mem[local_id],
-                             &global_mem[global_id]);
+              local_mem[local_id] = global_mem[global_id];
             else
               ValueInit::init(functor, &local_mem[local_id]);
             item.barrier(sycl::access::fence_space::local_space);
 
-            // Perform workgroup reduction
-            for (size_t stride = 1; 2 * stride < wgroup_size + 1; stride *= 2) {
-              auto idx = 2 * stride * (local_id + 1) - 1;
-              if (idx < wgroup_size)
-                ValueJoin::join(functor, &local_mem[idx],
-                                &local_mem[idx - stride]);
-              item.barrier(sycl::access::fence_space::local_space);
+            // subgroup scans
+            auto sg                = item.get_sub_group();
+            const auto sg_group_id = sg.get_group_id()[0];
+            const int id_in_sg     = sg.get_local_id()[0];
+            for (int stride = wgroup_size / 2; stride > 0; stride >>= 1) {
+              auto tmp = sg.shuffle_up(local_mem[local_id], stride);
+              if (id_in_sg >= stride)
+                ValueJoin::join(functor, &local_mem[local_id], &tmp);
             }
 
-            if (local_id == 0) {
-              if (n_wgroups > 1)
-                ValueOps::copy(functor,
-                               &group_results[item.get_group_linear_id()],
-                               &local_mem[wgroup_size - 1]);
-              else
-                ValueInit::init(functor,
-                                &group_results[item.get_group_linear_id()]);
-              ValueInit::init(functor, &local_mem[wgroup_size - 1]);
-            }
+            const int local_range = sg.get_local_range()[0];
+            if (id_in_sg == local_range - 1)
+              global_mem[sg_group_id + global_offset] = local_mem[local_id];
+            local_mem[local_id] = sg.shuffle_up(local_mem[local_id], 1);
+            if (id_in_sg == 0) ValueInit::init(functor, &local_mem[local_id]);
+            item.barrier(sycl::access::fence_space::local_space);
 
-            // Add results to all items
-            for (size_t stride = wgroup_size / 2; stride > 0; stride /= 2) {
-              auto idx = 2 * stride * (local_id + 1) - 1;
-              if (idx < wgroup_size) {
-                value_type dummy;
-                ValueOps::copy(functor, &dummy, &local_mem[idx - stride]);
-                ValueOps::copy(functor, &local_mem[idx - stride],
-                               &local_mem[idx]);
-                ValueJoin::join(functor, &local_mem[idx], &dummy);
+            // scan subgroup results using the first subgroup
+            if (sg_group_id == 0) {
+              const int n_subgroups = sg.get_group_range()[0];
+              if (local_range < n_subgroups) Kokkos::abort("Not implemented!");
+
+              for (int stride = n_subgroups / 2; stride > 0; stride >>= 1) {
+                auto tmp =
+                    sg.shuffle_up(global_mem[id_in_sg + global_offset], stride);
+                if (id_in_sg >= stride) {
+                  if (id_in_sg < n_subgroups)
+                    ValueJoin::join(
+                        functor, &global_mem[id_in_sg + global_offset], &tmp);
+                  else
+                    global_mem[id_in_sg + global_offset] = tmp;
+                }
               }
-              item.barrier(sycl::access::fence_space::local_space);
             }
+            item.barrier(sycl::access::fence_space::local_space);
+
+            // add results to all subgroups
+            if (sg_group_id > 0)
+              ValueJoin::join(functor, &local_mem[local_id],
+                              &global_mem[sg_group_id - 1 + global_offset]);
+            item.barrier(sycl::access::fence_space::local_space);
+            if (n_wgroups > 1 && local_id == wgroup_size - 1)
+              group_results[item.get_group_linear_id()] =
+                  global_mem[sg_group_id + global_offset];
+            item.barrier(sycl::access::fence_space::local_space);
 
             // Write results to global memory
-            if (global_id < size)
-              ValueOps::copy(functor, &global_mem[global_id],
-                             &local_mem[local_id]);
+            if (global_id < size) global_mem[global_id] = local_mem[local_id];
           });
     });
+    q.submit_barrier(std::vector<sycl::event>{local_scans});
 
-    if (n_wgroups > 1) scan_internal(q, functor, group_results, n_wgroups);
-    m_policy.space().fence();
-
-    q.submit([&](sycl::handler& cgh) {
-      cgh.parallel_for(sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
-                       [=](sycl::nd_item<1> item) {
-                         const auto global_id = item.get_global_linear_id();
-                         if (global_id < size)
-                           ValueJoin::join(
-                               functor, &global_mem[global_id],
-                               &group_results[item.get_group_linear_id()]);
-                       });
-    });
-    m_policy.space().fence();
+    if (n_wgroups > 1) {
+      scan_internal(q, functor, group_results, n_wgroups);
+      auto update_with_group_results = q.submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
+            [=](sycl::nd_item<1> item) {
+              const auto global_id = item.get_global_linear_id();
+              if (global_id < size)
+                ValueJoin::join(functor, &global_mem[global_id],
+                                &group_results[item.get_group_linear_id()]);
+            });
+      });
+      q.submit_barrier(std::vector<sycl::event>{update_with_group_results});
+    }
   }
 
   template <typename Functor>
-  void sycl_direct_launch(const Functor& functor) const {
+  sycl::event sycl_direct_launch(const Functor& functor) const {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = m_policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
@@ -185,7 +189,7 @@ class ParallelScanSYCLBase {
     const std::size_t len = m_policy.end() - m_policy.begin();
 
     // Initialize global memory
-    q.submit([&](sycl::handler& cgh) {
+    auto initialize_global_memory = q.submit([&](sycl::handler& cgh) {
       auto global_mem = m_scratch_space;
       auto begin      = m_policy.begin();
       cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
@@ -197,29 +201,30 @@ class ParallelScanSYCLBase {
           functor(id, update, false);
         else
           functor(WorkTag(), id, update, false);
-        ValueOps::copy(functor, &global_mem[id], &update);
+        global_mem[id] = update;
       });
     });
-    space.fence();
+    q.submit_barrier(std::vector<sycl::event>{initialize_global_memory});
 
-    // Perform the actual exlcusive scan
+    // Perform the actual exclusive scan
     scan_internal(q, functor, m_scratch_space, len);
 
     // Write results to global memory
-    q.submit([&](sycl::handler& cgh) {
+    auto update_global_results = q.submit([&](sycl::handler& cgh) {
       auto global_mem = m_scratch_space;
       cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
-        auto global_id = item.get_id();
+        auto global_id = item.get_id(0);
 
         value_type update = global_mem[global_id];
         if constexpr (std::is_same<WorkTag, void>::value)
           functor(global_id, update, true);
         else
           functor(WorkTag(), global_id, update, true);
-        ValueOps::copy(functor, &global_mem[global_id], &update);
+        global_mem[global_id] = update;
       });
     });
-    space.fence();
+    q.submit_barrier(std::vector<sycl::event>{update_global_results});
+    return update_global_results;
   }
 
  public:
@@ -227,28 +232,39 @@ class ParallelScanSYCLBase {
   void impl_execute(const PostFunctor& post_functor) {
     if (m_policy.begin() == m_policy.end()) return;
 
-    const auto& q = *m_policy.space().impl_internal_space_instance()->m_queue;
+    auto& instance        = *m_policy.space().impl_internal_space_instance();
     const std::size_t len = m_policy.end() - m_policy.begin();
 
-    // FIXME_SYCL The allocation should be handled by the execution space
-    // consider only storing one value per block and recreate initial results in
-    // the end before doing the final pass
-    auto deleter = [&q](value_type* ptr) { sycl::free(ptr, q); };
-    std::unique_ptr<value_type[], decltype(deleter)> result_memory(
-        static_cast<pointer_type>(sycl::malloc(sizeof(value_type) * len, q,
-                                               sycl::usm::alloc::shared)),
-        deleter);
-    m_scratch_space = result_memory.get();
+    // Compute the total amount of memory we will need. We emulate the recursive
+    // structure that is used to do the actual scan. Essentially, we need to
+    // allocate memory for the whole range and then recursively for the reduced
+    // group results until only one group is left.
+    std::size_t total_memory = 0;
+    {
+      size_t wgroup_size   = 128;
+      size_t n_nested_size = len;
+      size_t n_nested_wgroups;
+      do {
+        n_nested_wgroups = (n_nested_size + wgroup_size - 1) / wgroup_size;
+        n_nested_size    = n_nested_wgroups;
+        total_memory += sizeof(value_type) * n_nested_wgroups * wgroup_size;
+      } while (n_nested_wgroups > 1);
+      total_memory += sizeof(value_type) * wgroup_size;
+    }
+
+    // FIXME_SYCL consider only storing one value per block and recreate initial
+    // results in the end before doing the final pass
+    m_scratch_space =
+        static_cast<pointer_type>(instance.scratch_space(total_memory));
 
     Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
-        indirectKernelMem = m_policy.space()
-                                .impl_internal_space_instance()
-                                ->m_indirectKernelMem;
+        indirectKernelMem = instance.m_indirectKernelMem;
 
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
 
-    sycl_direct_launch(functor_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
     post_functor();
   }
 
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 738620926b..9538bf7080 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -47,8 +47,11 @@
 
 #include <Kokkos_Parallel.hpp>
 
+#include <SYCL/Kokkos_SYCL_Parallel_Reduce.hpp>  // workgroup_reduction
 #include <SYCL/Kokkos_SYCL_Team.hpp>
 
+#include <vector>
+
 namespace Kokkos {
 namespace Impl {
 template <typename... Properties>
@@ -63,8 +66,6 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
   friend class TeamPolicyInternal;
 
  private:
-  static int constexpr MAX_WARP = 8;
-
   typename traits::execution_space m_space;
   int m_league_size;
   int m_team_size;
@@ -128,11 +129,18 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
   }
   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
   inline bool impl_auto_team_size() const { return m_tune_team_size; }
+  // FIXME_SYCL This is correct in most cases, but not necessarily in case a
+  // custom sycl::queue is used to initialize the execution space.
   static int vector_length_max() {
-    // FIXME_SYCL provide a reasonable value
-    return 1;
+    std::vector<size_t> sub_group_sizes =
+        execution_space{}
+            .impl_internal_space_instance()
+            ->m_queue->get_device()
+            .template get_info<sycl::info::device::sub_group_sizes>();
+    return *std::max_element(sub_group_sizes.begin(), sub_group_sizes.end());
   }
 
+ private:
   static int verify_requested_vector_length(int requested_vector_length) {
     int test_vector_length =
         std::min(requested_vector_length, vector_length_max());
@@ -140,18 +148,14 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
     // Allow only power-of-two vector_length
     if (!(is_integral_power_of_two(test_vector_length))) {
       int test_pow2 = 1;
-      for (int i = 0; i < 5; i++) {
-        test_pow2 = test_pow2 << 1;
-        if (test_pow2 > test_vector_length) {
-          break;
-        }
-      }
+      while (test_pow2 < test_vector_length) test_pow2 <<= 1;
       test_vector_length = test_pow2 >> 1;
     }
 
     return test_vector_length;
   }
 
+ public:
   static int scratch_size_max(int level) {
     return level == 0 ? 1024 * 32
                       :           // FIXME_SYCL arbitrarily setting this to 32kB
@@ -160,7 +164,9 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
   inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
   inline void impl_set_team_size(size_t size) { m_team_size = size; }
   int impl_vector_length() const { return m_vector_length; }
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }
+#endif
 
   int team_size() const { return m_team_size; }
 
@@ -206,7 +212,21 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
         m_chunk_size(0),
         m_tune_team_size(bool(team_size_request <= 0)),
         m_tune_vector_length(bool(vector_length_request <= 0)) {
-    // FIXME_SYCL check paramters
+    // FIXME_SYCL Check that league size is permissible,
+    // https://github.com/intel/llvm/pull/4064
+
+    // Make sure total block size is permissible
+    if (m_team_size * m_vector_length >
+        static_cast<int>(
+            m_space.impl_internal_space_instance()->m_maxWorkgroupSize)) {
+      Impl::throw_runtime_exception(
+          std::string("Kokkos::TeamPolicy<SYCL> the team size is too large. "
+                      "Team size x vector length is " +
+                      std::to_string(m_team_size * m_vector_length) +
+                      " but must be smaller than ") +
+          std::to_string(
+              m_space.impl_internal_space_instance()->m_maxWorkgroupSize));
+    }
   }
 
   /** \brief  Specify league size, request team size */
@@ -311,8 +331,9 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
          2 * sizeof(double) - m_team_scratch_size[0]) /
         (sizeof(double) + m_thread_scratch_size[0]);
     return std::min<int>(
-        m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
-        max_threads_for_memory);
+               m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
+               max_threads_for_memory) /
+           impl_vector_length();
   }
 
   template <class FunctorType>
@@ -335,8 +356,9 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
         (sizeof(double) + sizeof(value_type) * value_count +
          m_thread_scratch_size[0]);
     return std::min<int>(
-        m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
-        max_threads_for_memory);
+               m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
+               max_threads_for_memory) /
+           impl_vector_length();
   }
 
   template <class FunctorType>
@@ -376,14 +398,15 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   int m_scratch_size[2];
 
   template <typename Functor>
-  void sycl_direct_launch(const Policy& policy, const Functor& functor) const {
+  sycl::event sycl_direct_launch(const Policy& policy,
+                                 const Functor& functor) const {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    q.submit([&](sycl::handler& cgh) {
+    auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
       // FIXME_SYCL accessors seem to need a size greater than zero at least for
       // host queues
       sycl::accessor<char, 1, sycl::access::mode::read_write,
@@ -399,14 +422,22 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
       cgh.parallel_for(
           sycl::nd_range<2>(
-              sycl::range<2>(m_league_size * m_team_size, m_vector_size),
+              sycl::range<2>(m_team_size, m_league_size * m_vector_size),
               sycl::range<2>(m_team_size, m_vector_size)),
           [=](sycl::nd_item<2> item) {
+#ifdef KOKKOS_ENABLE_DEBUG
+            if (item.get_sub_group().get_local_range() %
+                    item.get_local_range(1) !=
+                0)
+              Kokkos::abort(
+                  "The sub_group size is not divisible by the vector_size. "
+                  "Choose a smaller vector_size!");
+#endif
             const member_type team_member(
                 team_scratch_memory_L0.get_pointer(), shmem_begin,
                 scratch_size[0],
                 static_cast<char*>(scratch_ptr[1]) +
-                    item.get_group(0) * scratch_size[1],
+                    item.get_group(1) * scratch_size[1],
                 scratch_size[1], item);
             if constexpr (std::is_same<work_tag, void>::value)
               functor(team_member);
@@ -414,7 +445,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
               functor(work_tag(), team_member);
           });
     });
-    space.fence();
+    q.submit_barrier(std::vector<sycl::event>{parallel_for_event});
+    return parallel_for_event;
   }
 
  public:
@@ -429,7 +461,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    sycl::event event =
+        sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
   }
 
   ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
@@ -451,11 +485,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // FIXME_SYCL so far accessors used instead of these pointers
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
-    const auto& space    = *m_policy.space().impl_internal_space_instance();
-    const sycl::queue& q = *space.m_queue;
-    m_scratch_ptr[0]     = nullptr;
-    m_scratch_ptr[1]     = sycl::malloc_device(
-        sizeof(char) * m_scratch_size[1] * m_league_size, q);
+    auto& space      = *m_policy.space().impl_internal_space_instance();
+    m_scratch_ptr[0] = nullptr;
+    m_scratch_ptr[1] = space.resize_team_scratch_space(
+        static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size);
 
     if (static_cast<int>(space.m_maxShmemPerBlock) <
         m_shmem_size - m_shmem_begin) {
@@ -463,27 +496,17 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
       out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
              "Requested "
           << m_shmem_size - m_shmem_begin << " bytes but maximum is "
-          << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
-          << '\n';
+          << space.m_maxShmemPerBlock << '\n';
       Kokkos::Impl::throw_runtime_exception(out.str());
     }
 
+    const auto max_team_size =
+        m_policy.team_size_max(arg_functor, ParallelForTag{});
     if (m_team_size > m_policy.team_size_max(arg_functor, ParallelForTag{}))
       Kokkos::Impl::throw_runtime_exception(
-          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
-  }
-
-  // FIXME_SYCL remove when managing m_scratch_ptr[1] in the execution space
-  // instance
-  ParallelFor(const ParallelFor&) = delete;
-  ParallelFor& operator=(const ParallelFor&) = delete;
-
-  ~ParallelFor() {
-    const Kokkos::Experimental::SYCL& space = m_policy.space();
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    sycl::queue& q = *instance.m_queue;
-    sycl::free(m_scratch_ptr[1], q);
+          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size. The "
+          "maximal team_size is " +
+          std::to_string(max_team_size) + '!');
   }
 };
 
@@ -516,6 +539,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const Policy m_policy;
   const ReducerType m_reducer;
   const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
   // FIXME_SYCL avoid reallocating memory for reductions
   /*  size_type* m_scratch_space;
     size_type* m_scratch_flags;
@@ -529,8 +553,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const size_type m_vector_size;
 
   template <typename PolicyType, typename Functor, typename Reducer>
-  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
-                          const Reducer& reducer) const {
+  sycl::event sycl_direct_launch(const PolicyType& policy,
+                                 const Functor& functor,
+                                 const Reducer& reducer) const {
     using ReducerConditional =
         Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                            FunctorType, ReducerType>;
@@ -553,25 +578,25 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     sycl::queue& q = *instance.m_queue;
 
     // FIXME_SYCL optimize
-    const size_t wgroup_size = m_team_size;
-    std::size_t size         = m_league_size * m_team_size;
+    const size_t wgroup_size = m_team_size * m_vector_size;
+    std::size_t size         = m_league_size * m_team_size * m_vector_size;
     const auto init_size =
         std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
     const unsigned int value_count =
         FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
             selected_reducer);
-    // FIXME_SYCL only use the first half
     const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
-        sizeof(value_type) * std::max(value_count, 1u) * init_size * 2));
-    // FIXME_SYCL without this we are running into a race condition
-    const auto results_ptr2 =
-        results_ptr + std::max(value_count, 1u) * init_size;
+        sizeof(value_type) * std::max(value_count, 1u) * init_size));
+    value_type* device_accessible_result_ptr =
+        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+    sycl::event last_reduction_event;
 
     // If size<=1 we only call init(), the functor and possibly final once
     // working with the global scratch memory but don't copy back to
     // m_result_ptr yet.
     if (size <= 1) {
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         // FIXME_SYCL accessors seem to need a size greater than zero at least
         // for host queues
         sycl::accessor<char, 1, sycl::access::mode::read_write,
@@ -606,9 +631,13 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
                 FunctorFinal<FunctorType, WorkTag>::final(
                     static_cast<const FunctorType&>(functor), results_ptr);
+              if (device_accessible_result_ptr)
+                ValueOps::copy(functor, device_accessible_result_ptr,
+                               &results_ptr[0]);
             });
       });
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
     }
 
     // Otherwise, we perform a reduction on the values in all workgroups
@@ -617,8 +646,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // value.
     bool first_run = true;
     while (size > 1) {
-      auto n_wgroups = (size + wgroup_size - 1) / wgroup_size;
-      q.submit([&](sycl::handler& cgh) {
+      auto n_wgroups             = (size + wgroup_size - 1) / wgroup_size;
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
@@ -638,9 +667,17 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
         cgh.parallel_for(
             sycl::nd_range<2>(
-                sycl::range<2>(m_league_size * m_team_size, m_vector_size),
+                sycl::range<2>(m_team_size, m_league_size * m_vector_size),
                 sycl::range<2>(m_team_size, m_vector_size)),
             [=](sycl::nd_item<2> item) {
+#ifdef KOKKOS_ENABLE_DEBUG
+              if (first_run && item.get_sub_group().get_local_range() %
+                                       item.get_local_range(1) !=
+                                   0)
+                Kokkos::abort(
+                    "The sub_group size is not divisible by the vector_size. "
+                    "Choose a smaller vector_size!");
+#endif
               const auto local_id = item.get_local_linear_id();
               const auto global_id =
                   wgroup_size * item.get_group_linear_id() + local_id;
@@ -651,9 +688,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               // In the first iteration, we call functor to initialize the local
               // memory. Otherwise, the local memory is initialized with the
               // results from the previous iteration that are stored in global
-              // memory. Note that we load values_per_thread values per thread
-              // and immediately combine them to avoid too many threads being
-              // idle in the actual workgroup reduction.
+              // memory.
               if (first_run) {
                 reference_type update = ValueInit::init(
                     selected_reducer, &local_mem[local_id * value_count]);
@@ -661,7 +696,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                     team_scratch_memory_L0.get_pointer(), shmem_begin,
                     scratch_size[0],
                     static_cast<char*>(scratch_ptr[1]) +
-                        item.get_group(0) * scratch_size[1],
+                        item.get_group(1) * scratch_size[1],
                     scratch_size[1], item);
                 if constexpr (std::is_same<WorkTag, void>::value)
                   functor(team_member, update);
@@ -678,50 +713,18 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               }
               item.barrier(sycl::access::fence_space::local_space);
 
-              // Perform the actual workgroup reduction. To achieve a better
-              // memory access pattern, we use sequential addressing and a
-              // reversed loop. If the workgroup size is 8, the first element
-              // contains all the values with index%4==0, after the second one
-              // the values with index%2==0 and after the third one index%1==0,
-              // i.e., all values.
-              for (unsigned int stride = wgroup_size / 2; stride > 0;
-                   stride >>= 1) {
-                const auto idx = local_id;
-                if (idx < stride) {
-                  ValueJoin::join(selected_reducer,
-                                  &local_mem[idx * value_count],
-                                  &local_mem[(idx + stride) * value_count]);
-                }
-                item.barrier(sycl::access::fence_space::local_space);
-              }
+              SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+                  item, local_mem.get_pointer(), results_ptr,
+                  device_accessible_result_ptr, value_count, selected_reducer,
+                  static_cast<const FunctorType&>(functor),
+                  n_wgroups <= 1 && item.get_group_linear_id() == 0);
 
-              // Finally, we copy the workgroup results back to global memory to
-              // be used in the next iteration. If this is the last iteration,
-              // i.e., there is only one workgroup also call final() if
-              // necessary.
-              if (local_id == 0) {
-                ValueOps::copy(
-                    functor,
-                    &results_ptr2[(item.get_group_linear_id()) * value_count],
-                    &local_mem[0]);
-                if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-                  if (n_wgroups <= 1 && item.get_group_linear_id() == 0) {
-                    FunctorFinal<FunctorType, WorkTag>::final(
-                        static_cast<const FunctorType&>(functor),
-                        &results_ptr2[(item.get_group_linear_id()) *
-                                      value_count]);
-                  }
-              }
+              // FIXME_SYCL not quite sure why this is necessary
+              item.barrier(sycl::access::fence_space::global_space);
             });
       });
-      space.fence();
-
-      // FIXME_SYCL this is likely not necessary, see above
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          space, results_ptr, results_ptr2,
-          sizeof(*m_result_ptr) * value_count * n_wgroups);
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
 
       first_run = false;
       size      = n_wgroups;
@@ -730,13 +733,17 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // At this point, the reduced value is written to the entry in results_ptr
     // and all that is left is to copy it back to the given result pointer if
     // necessary.
-    if (m_result_ptr) {
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      space.fence();
+      space.fence(
+          "Kokkos::Impl::ParallelReduce<TeamPolicy,SYCL>: fence because "
+          "reduction can't access result storage location");
     }
+
+    return last_reduction_event;
   }
 
  public:
@@ -753,8 +760,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_reducer, indirectReducerMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
-                       reducer_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(
+        m_policy, functor_wrapper.get_functor(), reducer_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
+    reducer_wrapper.register_event(indirectReducerMem, event);
   }
 
  private:
@@ -779,11 +788,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // FIXME_SYCL so far accessors used instead of these pointers
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
-    const auto& space    = *m_policy.space().impl_internal_space_instance();
-    const sycl::queue& q = *space.m_queue;
-    m_scratch_ptr[0]     = nullptr;
-    m_scratch_ptr[1]     = sycl::malloc_device(
-        sizeof(char) * m_scratch_size[1] * m_league_size, q);
+    auto& space      = *m_policy.space().impl_internal_space_instance();
+    m_scratch_ptr[0] = nullptr;
+    m_scratch_ptr[1] = space.resize_team_scratch_space(
+        static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size);
 
     if (static_cast<int>(space.m_maxShmemPerBlock) <
         m_shmem_size - m_shmem_begin) {
@@ -791,8 +799,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
              "Requested "
           << m_shmem_size - m_shmem_begin << " bytes but maximum is "
-          << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
-          << '\n';
+          << space.m_maxShmemPerBlock << '\n';
       Kokkos::Impl::throw_runtime_exception(out.str());
     }
 
@@ -811,6 +818,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_policy(arg_policy),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ViewType::memory_space>::accessible),
         m_league_size(arg_policy.league_size()),
         m_team_size(arg_policy.team_size()),
         m_vector_size(arg_policy.impl_vector_length()) {
@@ -823,6 +833,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_policy(arg_policy),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
         m_league_size(arg_policy.league_size()),
         m_team_size(arg_policy.team_size()),
         m_vector_size(arg_policy.impl_vector_length()) {
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
index 75741438e2..6ec6204e71 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
@@ -56,64 +56,22 @@
 /*--------------------------------------------------------------------------*/
 namespace Kokkos {
 namespace Impl {
-namespace {
-auto USM_memcpy(sycl::queue& q, void* dst, const void* src, size_t n) {
-  return q.memcpy(dst, src, n);
+
+void DeepCopySYCL(void* dst, const void* src, size_t n) {
+  Experimental::SYCL().fence("Kokkos::Impl::DeepCopySYCL: fence before memcpy");
+  Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n);
+  Experimental::SYCL().fence("Kokkos::Impl::DeepCopySYCL: fence after memcpy");
 }
 
-void USM_memcpy(Kokkos::Experimental::Impl::SYCLInternal& space, void* dst,
-                const void* src, size_t n) {
-  (void)USM_memcpy(*space.m_queue, dst, src, n);
+void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst,
+                       const void* src, size_t n) {
+  instance.impl_internal_space_instance()->m_queue->memcpy(dst, src, n);
 }
 
-void USM_memcpy(void* dst, const void* src, size_t n) {
-  Experimental::SYCL().fence();
-  auto event = USM_memcpy(
-      *Experimental::Impl::SYCLInternal::singleton().m_queue, dst, src, n);
-  Experimental::Impl::SYCLInternal::fence(event);
-}
-}  // namespace
-
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::Experimental::SYCL>::
-    DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
-             const void* src, size_t n) {
-  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
-}
-
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(dst, src, n);
-}
-
-DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(const Kokkos::Experimental::SYCL&
-                                                   instance,
-                                               void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
-}
-
-DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(dst, src, n);
-}
-
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(const Kokkos::Experimental::SYCL&
-                                                   instance,
-                                               void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
-}
-
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(dst, src, n);
+void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) {
+  Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n);
+  Experimental::SYCL().fence(
+      "Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy");
 }
 
 }  // namespace Impl
@@ -135,6 +93,11 @@ SYCLSharedUSMSpace::SYCLSharedUSMSpace()
 SYCLSharedUSMSpace::SYCLSharedUSMSpace(sycl::queue queue)
     : m_queue(std::move(queue)) {}
 
+SYCLHostUSMSpace::SYCLHostUSMSpace()
+    : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {}
+SYCLHostUSMSpace::SYCLHostUSMSpace(sycl::queue queue)
+    : m_queue(std::move(queue)) {}
+
 void* allocate_sycl(
     const char* arg_label, const size_t arg_alloc_size,
     const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle,
@@ -184,6 +147,19 @@ void* SYCLSharedUSMSpace::allocate(const char* arg_label,
       sycl::usm::alloc::shared, m_queue);
 }
 
+void* SYCLHostUSMSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+void* SYCLHostUSMSpace::allocate(const char* arg_label,
+                                 const size_t arg_alloc_size,
+                                 const size_t arg_logical_size) const {
+  return allocate_sycl(
+      arg_label, arg_alloc_size, arg_logical_size,
+      Kokkos::Tools::make_space_handle(name()),
+      RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocHost,
+      sycl::usm::alloc::host, m_queue);
+}
+
 void sycl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
                      const size_t arg_alloc_size, const size_t arg_logical_size,
                      const Kokkos::Tools::SpaceHandle arg_handle,
@@ -195,6 +171,8 @@ void sycl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
                                       reported_size);
   }
 
+  SYCL::impl_static_fence(
+      "Kokkos::Impl::sycl_deallocate: fence before deallocate");
   sycl::free(arg_alloc_ptr, queue);
 }
 
@@ -223,6 +201,19 @@ void SYCLSharedUSMSpace::deallocate(const char* arg_label,
                   Kokkos::Tools::make_space_handle(name()), m_queue);
 }
 
+void SYCLHostUSMSpace::deallocate(void* const arg_alloc_ptr,
+                                  const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+
+void SYCLHostUSMSpace::deallocate(const char* arg_label,
+                                  void* const arg_alloc_ptr,
+                                  const size_t arg_alloc_size,
+                                  const size_t arg_logical_size) const {
+  sycl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size,
+                  Kokkos::Tools::make_space_handle(name()), m_queue);
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
@@ -235,6 +226,9 @@ SharedAllocationRecord<void, void> SharedAllocationRecord<
 
 SharedAllocationRecord<void, void> SharedAllocationRecord<
     Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record;
+
+SharedAllocationRecord<void, void> SharedAllocationRecord<
+    Kokkos::Experimental::SYCLHostUSMSpace, void>::s_root_record;
 #endif
 
 SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
@@ -282,6 +276,27 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>::
                                                   arg_label);
 }
 
+SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
+        const std::string& arg_label, const size_t arg_alloc_size,
+        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace,
+                                  void>::s_root_record,
+#endif
+          Impl::checked_allocation_with_header(arg_space, arg_label,
+                                               arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
+      m_space(arg_space) {
+
+  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                  arg_label);
+}
+
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -317,6 +332,17 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace,
                      alloc_size, alloc_size - sizeof(SharedAllocationHeader));
 }
 
+SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace,
+                       void>::~SharedAllocationRecord() {
+  const char* label = nullptr;
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    label = RecordBase::m_alloc_ptr->m_label;
+  }
+  const auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
+  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     alloc_size, alloc_size - sizeof(SharedAllocationHeader));
+}
+
 //----------------------------------------------------------------------------
 
 }  // namespace Impl
@@ -339,6 +365,8 @@ template class SharedAllocationRecordCommon<
     Kokkos::Experimental::SYCLDeviceUSMSpace>;
 template class SharedAllocationRecordCommon<
     Kokkos::Experimental::SYCLSharedUSMSpace>;
+template class SharedAllocationRecordCommon<
+    Kokkos::Experimental::SYCLHostUSMSpace>;
 
 }  // namespace Impl
 }  // namespace Kokkos
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
index a30cf2109a..c405ad31a5 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
@@ -92,14 +92,12 @@ class SYCLTeamMember {
     return m_item.get_group_linear_id();
   }
   KOKKOS_INLINE_FUNCTION int league_size() const {
-    // FIXME_SYCL needs to be revised for vector_length>1.
-    return m_item.get_group_range(0);
+    return m_item.get_group_range(1);
   }
   KOKKOS_INLINE_FUNCTION int team_rank() const {
-    return m_item.get_local_linear_id();
+    return m_item.get_local_id(0);
   }
   KOKKOS_INLINE_FUNCTION int team_size() const {
-    // FIXME_SYCL needs to be revised for vector_length>1.
     return m_item.get_local_range(0);
   }
   KOKKOS_INLINE_FUNCTION void team_barrier() const { m_item.barrier(); }
@@ -109,8 +107,17 @@ class SYCLTeamMember {
   //--------------------------------------------------------------------------
 
   template <class ValueType>
-  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& val,
-                                             const int thread_id) const {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_arithmetic_v<ValueType>>
+  team_broadcast(ValueType& val, const int thread_id) const {
+    val = sycl::group_broadcast(m_item.get_group(), val,
+                                sycl::id<2>(thread_id, 0));
+  }
+
+  // FIXME_SYCL remove/adapt this overload once the Intel oneAPI implementation
+  // is conforming to the SYCL2020 standard (allowing trivially-copyable types)
+  template <class ValueType>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<!std::is_arithmetic_v<ValueType>>
+  team_broadcast(ValueType& val, const int thread_id) const {
     // Wait for shared data write until all threads arrive here
     m_item.barrier(sycl::access::fence_space::local_space);
     if (m_item.get_local_id(1) == 0 &&
@@ -119,7 +126,7 @@ class SYCLTeamMember {
     }
     // Wait for shared data read until root thread writes
     m_item.barrier(sycl::access::fence_space::local_space);
-    val = *static_cast<ValueType*>(m_team_reduce);
+    val = *(static_cast<ValueType*>(m_team_reduce));
   }
 
   template <class Closure, class ValueType>
@@ -294,35 +301,43 @@ class SYCLTeamMember {
   //----------------------------------------
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
+  KOKKOS_INLINE_FUNCTION
       typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& reducer) {
+      vector_reduce(ReducerType const& reducer) const {
     vector_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
+  KOKKOS_INLINE_FUNCTION
       typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& /*reducer*/,
-                    typename ReducerType::value_type& /*value*/) {
-    // FIXME_SYCL
-    Kokkos::abort("Not implemented!");
-  }
+      vector_reduce(ReducerType const& reducer,
+                    typename ReducerType::value_type& value) const {
+    const auto tidx1   = m_item.get_local_id(1);
+    const auto grange1 = m_item.get_local_range(1);
 
-  //--------------------------------------------------------------------------
-  /**\brief  Global reduction across all blocks
-   *
-   *  Return !0 if reducer contains the final value
-   */
-  template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
-      global_reduce(ReducerType const& /*reducer*/,
-                    int* const /*global_scratch_flags*/,
-                    void* const /*global_scratch_space*/, void* const /*shmem*/,
-                    int const /*shmem_size*/) {
-    // FIXME_SYCL
-    Kokkos::abort("Not implemented!");
+    const auto sg = m_item.get_sub_group();
+
+    if (grange1 == 1) return;
+
+    // Intra vector lane shuffle reduction:
+    typename ReducerType::value_type tmp(value);
+    typename ReducerType::value_type tmp2 = tmp;
+
+    for (int i = grange1; (i >>= 1);) {
+      tmp2 = sg.shuffle_down(tmp, i);
+      if (static_cast<int>(tidx1) < i) {
+        reducer.join(tmp, tmp2);
+      }
+    }
+
+    // Broadcast from root lane to all other lanes.
+    // Cannot use "butterfly" algorithm to avoid the broadcast
+    // because floating point summation is not associative
+    // and thus different threads could have different results.
+
+    tmp2  = sg.shuffle(tmp, (sg.get_local_id() / grange1) * grange1);
+    value = tmp2;
+    reducer.reference() = tmp2;
   }
 
   //----------------------------------------
@@ -489,7 +504,6 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
         loop_boundaries,
     const Closure& closure) {
-  // FIXME_SYCL Fix for vector_length>1.
   for (iType i = loop_boundaries.start +
                  loop_boundaries.member.item().get_local_id(0);
        i < loop_boundaries.end;
@@ -516,7 +530,6 @@ KOKKOS_INLINE_FUNCTION
   typename ReducerType::value_type value;
   reducer.init(value);
 
-  // FIXME_SYCL Fix for vector_length>1.
   for (iType i = loop_boundaries.start +
                  loop_boundaries.member.item().get_local_id(0);
        i < loop_boundaries.end;
@@ -546,7 +559,6 @@ KOKKOS_INLINE_FUNCTION
 
   reducer.init(reducer.reference());
 
-  // FIXME_SYCL Fix for vector_length>1.
   for (iType i = loop_boundaries.start +
                  loop_boundaries.member.item().get_local_id(0);
        i < loop_boundaries.end;
@@ -609,11 +621,14 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
         loop_boundaries,
     const Closure& closure) {
-  // FIXME_SYCL adapt for vector_length != 1
-  for (iType i = loop_boundaries.start +
-                 loop_boundaries.member.item().get_local_id(0);
-       i < loop_boundaries.end;
-       i += loop_boundaries.member.item().get_local_range(0))
+  const iType tidx0 = loop_boundaries.member.item().get_local_id(0);
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
+
+  const iType grange0 = loop_boundaries.member.item().get_local_range(0);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1;
+       i < loop_boundaries.end; i += grange0 * grange1)
     closure(i);
 }
 
@@ -623,17 +638,20 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember>& loop_boundaries,
                     const Closure& closure, const ReducerType& reducer) {
-  // FIXME_SYCL adapt for vector_length != 1
   typename ReducerType::value_type value;
   reducer.init(value);
 
-  for (iType i = loop_boundaries.start +
-                 loop_boundaries.member.item().get_local_id(0);
-       i < loop_boundaries.end;
-       i += loop_boundaries.member.item().get_local_range(0)) {
-    closure(i, value);
-  }
+  const iType tidx0 = loop_boundaries.member.item().get_local_id(0);
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
 
+  const iType grange0 = loop_boundaries.member.item().get_local_range(0);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1;
+       i < loop_boundaries.end; i += grange0 * grange1)
+    closure(i, value);
+
+  loop_boundaries.member.vector_reduce(reducer, value);
   loop_boundaries.member.team_reduce(reducer, value);
 }
 
@@ -643,20 +661,23 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember>& loop_boundaries,
                     const Closure& closure, ValueType& result) {
-  // FIXME_SYCL adapt for vector_length != 1
   ValueType val;
   Kokkos::Sum<ValueType> reducer(val);
 
   reducer.init(reducer.reference());
 
-  for (iType i = loop_boundaries.start +
-                 loop_boundaries.member.item().get_local_id(0);
-       i < loop_boundaries.end;
-       i += loop_boundaries.member.item().get_local_range(0)) {
-    closure(i, val);
-  }
+  const iType tidx0 = loop_boundaries.member.item().get_local_id(0);
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
 
-  loop_boundaries.member.team_reduce(reducer, val);
+  const iType grange0 = loop_boundaries.member.item().get_local_range(0);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1;
+       i < loop_boundaries.end; i += grange0 * grange1)
+    closure(i, val);
+
+  loop_boundaries.member.vector_reduce(reducer);
+  loop_boundaries.member.team_reduce(reducer);
   result = reducer.reference();
 }
 
@@ -673,9 +694,14 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
         loop_boundaries,
     const Closure& closure) {
-  // FIXME_SYC: adapt for vector_length!=1
-  for (auto i = loop_boundaries.start; i != loop_boundaries.end; ++i)
+  const iType tidx1   = loop_boundaries.member.item().get_local_id(1);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end;
+       i += grange1)
     closure(i);
+
+  loop_boundaries.member.item().get_sub_group().barrier();
 }
 
 //----------------------------------------------------------------------------
@@ -697,12 +723,16 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember> const& loop_boundaries,
                     Closure const& closure, ReducerType const& reducer) {
-  // FIXME_SYCL adapt for vector_length != 1
   reducer.init(reducer.reference());
 
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
+  const iType tidx1   = loop_boundaries.member.item().get_local_id(1);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end;
+       i += grange1)
     closure(i, reducer.reference());
-  }
+
+  loop_boundaries.member.vector_reduce(reducer);
 }
 
 /** \brief  Intra-thread vector parallel_reduce.
@@ -722,12 +752,16 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember> const& loop_boundaries,
                     Closure const& closure, ValueType& result) {
-  // FIXME_SYCL adapt for vector_length != 1
   result = ValueType();
 
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
+  const int grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end;
+       i += grange1)
     closure(i, result);
-  }
+
+  loop_boundaries.member.vector_reduce(Kokkos::Sum<ValueType>(result));
 }
 
 //----------------------------------------------------------------------------
@@ -746,15 +780,59 @@ KOKKOS_INLINE_FUNCTION
     parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
                       iType, Impl::SYCLTeamMember>& loop_boundaries,
                   const Closure& closure, const ReducerType& reducer) {
-  // FIXME_SYCL modify for vector_length!=1
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
       Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
 
   value_type accum;
   reducer.init(accum);
+  const value_type identity = accum;
 
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
-    closure(i, accum, true);
+  // Loop through boundaries by vector-length chunks must scan at each iteration
+
+  // All thread "lanes" must loop the same number of times.
+  // Determine an loop end for all thread "lanes."
+  // Requires:
+  //   grange1 is power of two and thus
+  //     ( end % grange1 ) == ( end & ( grange1 - 1 ) )
+  //   1 <= grange1 <= sub_group size
+
+  const iType tidx1   = loop_boundaries.member.item().get_local_id(1);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  const int mask          = grange1 - 1;
+  const int rem           = loop_boundaries.end & mask;  // == end % grange1
+  const int end           = loop_boundaries.end + (rem ? grange1 - rem : 0);
+  const auto sg           = loop_boundaries.member.item().get_sub_group();
+  const int vector_offset = (sg.get_local_id() / grange1) * grange1;
+
+  for (int i = tidx1; i < end; i += grange1) {
+    value_type val = identity;
+
+    // First acquire per-lane contributions.
+    // This sets i's val to i-1's contribution to make the latter shfl_up an
+    // exclusive scan -- the final accumulation of i's val will be included in
+    // the second closure call later.
+    if (i < loop_boundaries.end && tidx1 > 0) closure(i - 1, val, false);
+
+    // Bottom up exclusive scan in triangular pattern where each SYCL thread is
+    // the root of a reduction tree from the zeroth "lane" to itself.
+    //  [t] += [t-1] if t >= 1
+    //  [t] += [t-2] if t >= 2
+    //  [t] += [t-4] if t >= 4
+    //  ...
+    for (int j = 1; j < static_cast<int>(grange1); j <<= 1) {
+      value_type tmp = sg.shuffle_up(val, j);
+      if (j <= static_cast<int>(tidx1)) {
+        reducer.join(val, tmp);
+      }
+    }
+
+    // Include accumulation
+    reducer.join(val, accum);
+
+    // Update i's contribution into the val and add it to accum for next round
+    if (i < loop_boundaries.end) closure(i, val, true);
+    accum = sg.shuffle(val, mask + vector_offset);
   }
 }
 
@@ -792,21 +870,26 @@ template <class FunctorType>
 KOKKOS_INLINE_FUNCTION void single(
     const Impl::ThreadSingleStruct<Impl::SYCLTeamMember>& single_struct,
     const FunctorType& lambda) {
-  if (single_struct.team_member.team_rank() == 0) lambda();
+  if (single_struct.team_member.item().get_local_linear_id() == 0) lambda();
 }
 
 template <class FunctorType, class ValueType>
 KOKKOS_INLINE_FUNCTION void single(
     const Impl::VectorSingleStruct<Impl::SYCLTeamMember>& single_struct,
     const FunctorType& lambda, ValueType& val) {
-  if (single_struct.team_member.item().get_local_id(1) == 0) lambda(val);
+  const sycl::nd_item<2> item = single_struct.team_member.item();
+  const auto grange1          = item.get_local_range(1);
+  const auto sg               = item.get_sub_group();
+  if (item.get_local_id(1) == 0) lambda(val);
+  val = sg.shuffle(val, (sg.get_local_id() / grange1) * grange1);
 }
 
 template <class FunctorType, class ValueType>
 KOKKOS_INLINE_FUNCTION void single(
     const Impl::ThreadSingleStruct<Impl::SYCLTeamMember>& single_struct,
     const FunctorType& lambda, ValueType& val) {
-  if (single_struct.team_member.team_rank() == 0) lambda(val);
+  if (single_struct.team_member.item().get_local_linear_id() == 0) lambda(val);
+  single_struct.team_member.team_broadcast(val, 0);
 }
 
 }  // namespace Kokkos
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
index 141a692f60..d2820b3b3a 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
@@ -89,7 +89,7 @@ class UniqueToken<SYCL, UniqueTokenScope::Global> {
     const Kokkos::pair<int, int> result =
         Kokkos::Impl::concurrent_bitset::acquire_bounded(
             m_buffer, m_count
-#if defined(KOKKOS_ARCH_INTEL_GEN)
+#ifdef KOKKOS_ARCH_INTEL_GPU
             ,
             Kokkos::Impl::clock_tic() % m_count
 #endif
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
index 92bd671bd5..18ef97ae46 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -288,21 +288,46 @@ int ThreadsExec::in_parallel() {
   return s_current_function && (&s_threads_process != s_current_function_arg) &&
          (s_threads_process.m_pool_base || !is_process());
 }
+void ThreadsExec::fence() { internal_fence(Impl::fence_is_static::yes); }
+void ThreadsExec::fence(const std::string &name) {
+  internal_fence(name, Impl::fence_is_static::yes);
+}
+
+void ThreadsExec::internal_fence(Impl::fence_is_static is_static) {
+  internal_fence((is_static == Impl::fence_is_static::no)
+                     ? "Kokkos::ThreadsExec::fence: Unnamed Instance Fence"
+                     : "Kokkos::ThreadsExec::fence: Unnamed Global Fence",
+                 is_static);
+}
 
 // Wait for root thread to become inactive
-void ThreadsExec::fence() {
-  if (s_thread_pool_size[0]) {
-    // Wait for the root thread to complete:
-    Impl::spinwait_while_equal<int>(s_threads_exec[0]->m_pool_state,
-                                    ThreadsExec::Active);
+void ThreadsExec::internal_fence(const std::string &name,
+                                 Impl::fence_is_static is_static) {
+  const auto &fence_lam = [&]() {
+    if (s_thread_pool_size[0]) {
+      // Wait for the root thread to complete:
+      Impl::spinwait_while_equal<int>(s_threads_exec[0]->m_pool_state,
+                                      ThreadsExec::Active);
+    }
+
+    s_current_function     = nullptr;
+    s_current_function_arg = nullptr;
+
+    // Make sure function and arguments are cleared before
+    // potentially re-activating threads with a subsequent launch.
+    memory_fence();
+  };
+  if (is_static == Impl::fence_is_static::yes) {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Threads>(
+        name,
+        Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+            GlobalDeviceSynchronization,
+        fence_lam);
+  } else {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Threads>(
+        name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1},
+        fence_lam);
   }
-
-  s_current_function     = nullptr;
-  s_current_function_arg = nullptr;
-
-  // Make sure function and arguments are cleared before
-  // potentially re-activating threads with a subsequent launch.
-  memory_fence();
 }
 
 /** \brief  Begin execution of the asynchronous functor */
@@ -769,7 +794,12 @@ void ThreadsExec::finalize() {
 namespace Kokkos {
 
 int Threads::concurrency() { return impl_thread_pool_size(0); }
-void Threads::fence() const { Impl::ThreadsExec::fence(); }
+void Threads::fence() const {
+  Impl::ThreadsExec::internal_fence(Impl::fence_is_static::no);
+}
+void Threads::fence(const std::string &name) const {
+  Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::no);
+}
 
 Threads &Threads::impl_instance(int) {
   static Threads t;
@@ -832,6 +862,9 @@ void ThreadsSpaceInitializer::finalize(const bool all_spaces) {
 }
 
 void ThreadsSpaceInitializer::fence() { Kokkos::Threads::impl_static_fence(); }
+void ThreadsSpaceInitializer::fence(const std::string &name) {
+  Kokkos::Threads::impl_static_fence(name);
+}
 
 void ThreadsSpaceInitializer::print_configuration(std::ostream &msg,
                                                   const bool detail) {
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
index 1c8b3ac5f6..4d9a72a034 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -63,7 +63,6 @@
 
 namespace Kokkos {
 namespace Impl {
-
 class ThreadsExec {
  public:
   // Fan array has log_2(NT) reduction threads plus 2 scan threads
@@ -474,6 +473,12 @@ class ThreadsExec {
 
   static int in_parallel();
   static void fence();
+  static void fence(const std::string &);
+  static void internal_fence(
+      Impl::fence_is_static is_static = Impl::fence_is_static::yes);
+  static void internal_fence(
+      const std::string &,
+      Impl::fence_is_static is_static = Impl::fence_is_static::yes);
   static bool sleep();
   static bool wake();
 
@@ -635,7 +640,12 @@ inline void Threads::print_configuration(std::ostream &s, const bool detail) {
   Impl::ThreadsExec::print_configuration(s, detail);
 }
 
-inline void Threads::impl_static_fence() { Impl::ThreadsExec::fence(); }
+inline void Threads::impl_static_fence() {
+  Impl::ThreadsExec::internal_fence(Impl::fence_is_static::yes);
+}
+inline void Threads::impl_static_fence(const std::string &name) {
+  Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::yes);
+}
 } /* namespace Kokkos */
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
index 40a09ed22a..e4eaeac781 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
@@ -100,8 +100,8 @@ bool ThreadsExec::spawn() {
 
   pthread_attr_t attr;
 
-  if (0 == pthread_attr_init(&attr) ||
-      0 == pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM) ||
+  if (0 == pthread_attr_init(&attr) &&
+      0 == pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM) &&
       0 == pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) {
     pthread_t pt;
 
diff --git a/lib/kokkos/core/src/desul/.clang-format b/lib/kokkos/core/src/desul/.clang-format
new file mode 100644
index 0000000000..9d159247d5
--- /dev/null
+++ b/lib/kokkos/core/src/desul/.clang-format
@@ -0,0 +1,2 @@
+DisableFormat: true
+SortIncludes: false
diff --git a/lib/kokkos/core/src/desul/atomics.hpp b/lib/kokkos/core/src/desul/atomics.hpp
new file mode 100644
index 0000000000..ab3fe25392
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics.hpp
@@ -0,0 +1,19 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_HPP_
+#define DESUL_ATOMICS_HPP_
+
+#include "desul/atomics/Macros.hpp"
+
+#include "desul/atomics/Atomic_Ref.hpp"
+#include "desul/atomics/Compare_Exchange.hpp"
+#include "desul/atomics/Generic.hpp"
+#include "desul/atomics/Lock_Array.hpp"
+
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Atomic_Ref.hpp b/lib/kokkos/core/src/desul/atomics/Atomic_Ref.hpp
new file mode 100644
index 0000000000..73cd01a7e6
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Atomic_Ref.hpp
@@ -0,0 +1,541 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMIC_REF_IMPL_HPP_
+#define DESUL_ATOMIC_REF_IMPL_HPP_
+
+#include <cstddef>
+#include <memory>
+#include <type_traits>
+
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Generic.hpp"
+#include "desul/atomics/Macros.hpp"
+
+namespace desul {
+namespace Impl {
+
+// TODO current implementation is missing the following:
+// * member functions
+//   * wait
+//   * notify_one
+//   * notify_all
+
+template <typename T,
+          typename MemoryOrder,
+          typename MemoryScope,
+          bool = std::is_integral<T>{},
+          bool = std::is_floating_point<T>{}>
+struct basic_atomic_ref;
+
+// base class for non-integral, non-floating-point, non-pointer types
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, false> {
+  static_assert(std::is_trivially_copyable<T>{}, "");
+
+ private:
+  T* _ptr;
+
+  // 1/2/4/8/16-byte types must be aligned to at least their size
+  static constexpr int _min_alignment = (sizeof(T) & (sizeof(T) - 1)) || sizeof(T) > 16
+                                            ? 0
+                                            : sizeof(T);
+
+ public:
+  using value_type = T;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T));
+
+  static constexpr std::size_t required_alignment = _min_alignment > alignof(T)
+                                                        ? _min_alignment
+                                                        : alignof(T);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  explicit basic_atomic_ref(T& obj) : _ptr(std::addressof(obj)) {}
+
+  T operator=(T desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T exchange(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T& expected,
+                                            T desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                          desired,
+                          order,
+                          cmpexch_failure_memory_order<_MemoryOrder>(),
+                          MemoryScope());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected,
+      T desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                            desired,
+                            order,
+                            cmpexch_failure_memory_order<_MemoryOrder>(),
+                            MemoryScope());
+  }
+};
+
+// base class for atomic_ref<integral-type>
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T, MemoryOrder, MemoryScope, true, false> {
+  static_assert(std::is_integral<T>{}, "");
+
+ private:
+  T* _ptr;
+
+ public:
+  using value_type = T;
+  using difference_type = value_type;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T));
+
+  static constexpr std::size_t required_alignment = sizeof(T) > alignof(T) ? sizeof(T)
+                                                                           : alignof(T);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  explicit basic_atomic_ref(T& obj) : _ptr(&obj) {}
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  T operator=(T desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T exchange(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T& expected,
+                                            T desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                          desired,
+                          order,
+                          cmpexch_failure_memory_order<_MemoryOrder>(),
+                          MemoryScope());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected,
+      T desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                            desired,
+                            order,
+                            cmpexch_failure_memory_order<_MemoryOrder>(),
+                            MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_add(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_add(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_sub(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_sub(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_and(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_and(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_or(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_or(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_xor(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_xor(_ptr, arg, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++() const noexcept {
+    return atomic_add_fetch(_ptr, value_type(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++(int) const noexcept { return fetch_add(1); }
+
+  DESUL_FUNCTION value_type operator--() const noexcept {
+    return atomic_sub_fetch(_ptr, value_type(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator--(int) const noexcept { return fetch_sub(1); }
+
+  DESUL_FUNCTION value_type operator+=(value_type arg) const noexcept {
+    atomic_add_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator-=(value_type arg) const noexcept {
+    atomic_sub_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator&=(value_type arg) const noexcept {
+    atomic_and_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator|=(value_type arg) const noexcept {
+    atomic_or_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator^=(value_type arg) const noexcept {
+    atomic_xor_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+};
+
+// base class for atomic_ref<floating-point-type>
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, true> {
+  static_assert(std::is_floating_point<T>{}, "");
+
+ private:
+  T* _ptr;
+
+ public:
+  using value_type = T;
+  using difference_type = value_type;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T));
+
+  static constexpr std::size_t required_alignment = alignof(T);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  explicit basic_atomic_ref(T& obj) : _ptr(&obj) {}
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  T operator=(T desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T exchange(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T& expected,
+                                            T desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                          desired,
+                          order,
+                          cmpexch_failure_memory_order<_MemoryOrder>(),
+                          MemoryScope());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected,
+      T desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                            desired,
+                            order,
+                            cmpexch_failure_memory_order<_MemoryOrder>(),
+                            MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_add(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_add(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_sub(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_sub(_ptr, arg, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator+=(value_type arg) const noexcept {
+    atomic_add_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator-=(value_type arg) const noexcept {
+    atomic_sub_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+};
+
+// base class for atomic_ref<pointer-type>
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T*, MemoryOrder, MemoryScope, false, false> {
+ private:
+  T** _ptr;
+
+ public:
+  using value_type = T*;
+  using difference_type = std::ptrdiff_t;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T));
+
+  static constexpr std::size_t required_alignment = alignof(T*);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  explicit basic_atomic_ref(T*& arg) : _ptr(std::addressof(arg)) {}
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  T* operator=(T* desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T*() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T* desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T* load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T* exchange(T* desired,
+                             _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T*), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T*& expected,
+                                            T* desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                          desired,
+                          order,
+                          cmpexch_failure_memory_order<_MemoryOrder>(),
+                          MemoryScope());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T*& expected,
+      T* desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                            desired,
+                            order,
+                            cmpexch_failure_memory_order<_MemoryOrder>(),
+                            MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_add(difference_type d, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_add(_ptr, _type_size(d), order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_sub(difference_type d, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_sub(_ptr, _type_size(d), order, MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++() const noexcept {
+    return atomic_add_fetch(_ptr, _type_size(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++(int) const noexcept { return fetch_add(1); }
+
+  DESUL_FUNCTION value_type operator--() const noexcept {
+    return atomic_sub_fetch(_ptr, _type_size(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator--(int) const noexcept { return fetch_sub(1); }
+
+  DESUL_FUNCTION value_type operator+=(difference_type d) const noexcept {
+    atomic_add_fetch(_ptr, _type_size(d), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator-=(difference_type d) const noexcept {
+    atomic_sub_fetch(_ptr, _type_size(d), MemoryOrder(), MemoryScope());
+  }
+
+ private:
+  static constexpr std::ptrdiff_t _type_size(std::ptrdiff_t d) noexcept {
+    static_assert(std::is_object<T>{}, "");
+    return d * sizeof(T);
+  }
+};
+
+}  // namespace Impl
+
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct scoped_atomic_ref : Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope> {
+  explicit scoped_atomic_ref(T& obj) noexcept
+      : Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope>(obj) {}
+
+  scoped_atomic_ref& operator=(scoped_atomic_ref const&) = delete;
+
+  scoped_atomic_ref(scoped_atomic_ref const&) = default;
+
+  using Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope>::operator=;
+};
+
+}  // namespace desul
+
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/CUDA.hpp b/lib/kokkos/core/src/desul/atomics/CUDA.hpp
new file mode 100644
index 0000000000..32873a5977
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/CUDA.hpp
@@ -0,0 +1,453 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_CUDA_HPP_
+#define DESUL_ATOMICS_CUDA_HPP_
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+// When building with clang we need to include the device functions always
+// since clang must see a consistent overload set in both device and host compilation
+// but that means we need to know on the host what to make visible, i.e. we need
+// a host side compile knowledge of architecture.
+// We simply can say DESUL proper doesn't support clang CUDA build pre Volta,
+// Kokkos has that knowledge and so I use it here, allowing in Kokkos to use
+// clang with pre Volta as CUDA compiler
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__>=700)) || \
+    (!defined(__NVCC__) && !defined(KOKKOS_ARCH_KEPLER) && !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL))
+#define DESUL_HAVE_CUDA_ATOMICS_ASM
+#include <desul/atomics/cuda/CUDA_asm.hpp>
+#endif
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__<700)) || \
+    (!defined(__NVCC__) && !defined(DESUL_HAVE_CUDA_ATOMICS_ASM))
+namespace desul {
+namespace Impl {
+template<class T>
+struct is_cuda_atomic_integer_type {
+  static constexpr bool value = std::is_same<T,int>::value ||
+                                std::is_same<T,unsigned int>::value ||
+                                std::is_same<T,unsigned long long int>::value;
+};
+
+template<class T>
+struct is_cuda_atomic_add_type {
+  static constexpr bool value = is_cuda_atomic_integer_type<T>::value ||
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
+                                std::is_same<T,double>::value || 
+#endif
+                                std::is_same<T,float>::value;
+};
+
+template<class T>
+struct is_cuda_atomic_sub_type {
+  static constexpr bool value = std::is_same<T,int>::value ||
+                                std::is_same<T,unsigned int>::value;
+};
+} // Impl
+
+// Atomic Add
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAdd(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAdd(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_add(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+
+// Atomic Sub
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicSub(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicSub(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_sub(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic Inc
+__device__ inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicInc(dest,val);
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicInc(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_inc(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic Inc
+__device__ inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicDec(dest,val);
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicDec(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_dec(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+
+// Atomic Max
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMax(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMax(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_max(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic Min
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMin(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMin(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_min(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic And
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAnd(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAnd(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_and(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic XOR
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicXor(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicXor(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_xor(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic OR
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicOr(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicOr(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_or(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+} // desul
+#endif
+
+#if !defined(__NVCC__)
+// Functions defined as device functions in CUDA which don't exist in the GCC overload set
+namespace desul {
+
+#if defined(DESUL_HAVE_CUDA_ATOMICS_ASM)
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(TYPE,ORDER,SCOPE) \
+    inline void atomic_add(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_add(dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(int32_t,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(TYPE,ORDER,SCOPE) \
+    inline void atomic_sub(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_sub(dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(int32_t,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_INC(TYPE,ORDER,SCOPE) \
+    inline void atomic_inc(TYPE* const dest, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_inc(dest, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_INC(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_DEC(TYPE,ORDER,SCOPE) \
+    inline void atomic_dec(TYPE* const dest, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_dec(dest, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_DEC(unsigned,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+#endif // DESUL_HAVE_CUDA_ATOMICS_ASM
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_add(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::AddOper<TYPE, const TYPE>(),dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_sub(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::SubOper<TYPE, const TYPE>(),dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_max(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::MaxOper<TYPE, const TYPE>(), dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned long,MemoryOrderRelaxed,MemoryScopeDevice);
+//  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_min(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::MinOper<TYPE, const TYPE>(), dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned long,MemoryOrderRelaxed,MemoryScopeDevice);
+//  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
+//  inline void atomic_fetch_max(int32_t* const dest, int32_t val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+
+}
+
+// Functions defined int the GCC overload set but not in the device overload set
+namespace desul {
+  __device__ inline
+  unsigned long long atomic_fetch_add(unsigned long long* const dest, unsigned long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AddOper<unsigned long long, const unsigned long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_add(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AddOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_add(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AddOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_sub(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::SubOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_sub(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::SubOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_max(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::MaxOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_min(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::MinOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_or(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::OrOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_or(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::OrOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_xor(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::XorOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_xor(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::XorOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_and(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AndOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_and(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AndOper<long long, const long long>(), dest, val, order, scope);
+  }
+
+
+  __device__ inline
+  unsigned long long atomic_add_fetch(unsigned long long* const dest, unsigned long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AddOper<unsigned long long, const unsigned long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_add_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AddOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_add_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AddOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_sub_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::SubOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_sub_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::SubOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_or_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::OrOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_or_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::OrOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_xor_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::XorOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_xor_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::XorOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_and_fetch(long long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AndOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_and_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AndOper<long, const long>(), dest, val, order, scope);
+  }
+}
+#endif
+#endif  // DESUL_HAVE_CUDA_ATOMICS
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Common.hpp b/lib/kokkos/core/src/desul/atomics/Common.hpp
new file mode 100644
index 0000000000..f1dccc6c52
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Common.hpp
@@ -0,0 +1,199 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMMON_HPP_
+#define DESUL_ATOMICS_COMMON_HPP_
+#include "desul/atomics/Macros.hpp"
+#include <cstdint>
+#include <atomic>
+#include <type_traits>
+
+namespace desul {
+struct alignas(16) Dummy16ByteValue {
+  int64_t value1;
+  int64_t value2;
+  bool operator!=(Dummy16ByteValue v) const {
+    return (value1 != v.value1) || (value2 != v.value2);
+  }
+  bool operator==(Dummy16ByteValue v) const {
+    return (value1 == v.value1) && (value2 == v.value2);
+  }
+};
+}  // namespace desul
+
+// MemoryOrder Tags
+
+namespace desul {
+// Memory order sequential consistent
+struct MemoryOrderSeqCst {};
+// Memory order acquire release
+struct MemoryOrderAcqRel {};
+// Memory order acquire
+struct MemoryOrderAcquire {};
+// Memory order release
+struct MemoryOrderRelease {};
+// Memory order relaxed
+struct MemoryOrderRelaxed {};
+}  // namespace desul
+
+// Memory Scope Tags
+
+namespace desul {
+// Entire machine scope (e.g. for global arrays)
+struct MemoryScopeSystem {};
+// Node level
+struct MemoryScopeNode {};
+// Device or socket scope (i.e. a CPU socket, a single GPU)
+struct MemoryScopeDevice {};
+// Core scoped (i.e. a shared Level 1 cache)
+struct MemoryScopeCore {};
+}  // namespace desul
+
+#ifndef __ATOMIC_RELAXED
+#define __ATOMIC_RELAXED 0
+#define __ATOMIC_CONSUME 1
+#define __ATOMIC_ACQUIRE 2
+#define __ATOMIC_RELEASE 3
+#define __ATOMIC_ACQ_REL 4
+#define __ATOMIC_SEQ_CST 5
+#endif
+
+namespace desul {
+template <class MemoryOrderDesul>
+struct GCCMemoryOrder;
+
+template <>
+struct GCCMemoryOrder<MemoryOrderRelaxed> {
+  static constexpr int value = __ATOMIC_RELAXED;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderAcquire> {
+  static constexpr int value = __ATOMIC_ACQUIRE;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderRelease> {
+  static constexpr int value = __ATOMIC_RELEASE;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderAcqRel> {
+  static constexpr int value = __ATOMIC_ACQ_REL;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderSeqCst> {
+  static constexpr int value = __ATOMIC_SEQ_CST;
+};
+
+template <class MemoryOrderDesul>
+struct CXXMemoryOrder;
+
+template <>
+struct CXXMemoryOrder<MemoryOrderRelaxed> {
+  static constexpr std::memory_order value = std::memory_order_relaxed;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderAcquire> {
+  static constexpr std::memory_order value = std::memory_order_acquire;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderRelease> {
+  static constexpr std::memory_order value = std::memory_order_release;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderAcqRel> {
+  static constexpr std::memory_order value = std::memory_order_acq_rel;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderSeqCst> {
+  static constexpr std::memory_order value = std::memory_order_seq_cst;
+};
+
+namespace Impl {
+template <typename MemoryOrder>
+struct CmpExchFailureOrder {
+  using memory_order = std::conditional_t<
+      std::is_same<MemoryOrder, MemoryOrderAcqRel>{},
+      MemoryOrderAcquire,
+      std::conditional_t<std::is_same<MemoryOrder, MemoryOrderRelease>{},
+                         MemoryOrderRelaxed,
+                         MemoryOrder>>;
+};
+template <typename MemoryOrder>
+using cmpexch_failure_memory_order =
+    typename CmpExchFailureOrder<MemoryOrder>::memory_order;
+}  // namespace Impl
+
+}
+
+// We should in principle use std::numeric_limits, but that requires constexpr function support on device
+// Currently that is still considered experimetal on CUDA and sometimes not reliable.
+namespace desul {
+namespace Impl {
+template<class T>
+struct numeric_limits_max;
+
+template<>
+struct numeric_limits_max<uint32_t> {
+  static constexpr uint32_t value = 0xffffffffu;
+};
+template<>
+struct numeric_limits_max<uint64_t> {
+  static constexpr uint64_t value = 0xfffffffflu;
+};
+
+constexpr bool atomic_always_lock_free(std::size_t size) {
+  return size == 4 || size == 8
+#if defined(DESUL_HAVE_16BYTE_COMPARE_AND_SWAP)
+         || size == 16
+#endif
+      ;
+}
+
+template <std::size_t Size, std::size_t Align>
+DESUL_INLINE_FUNCTION bool atomic_is_lock_free() noexcept {
+  return Size == 4 || Size == 8
+#if defined(DESUL_HAVE_16BYTE_COMPARE_AND_SWAP)
+         || Size == 16
+#endif
+      ;
+}
+
+template<std::size_t N>
+struct atomic_compare_exchange_type;
+
+template<>
+struct atomic_compare_exchange_type<4> {
+  using type = int32_t;
+};
+
+template<>
+struct atomic_compare_exchange_type<8> {
+  using type = int64_t;
+};
+
+template<>
+struct atomic_compare_exchange_type<16> {
+  using type = Dummy16ByteValue;
+};
+
+template<class T>
+struct dont_deduce_this_parameter { using type = T; };
+
+template<class T>
+using dont_deduce_this_parameter_t = typename dont_deduce_this_parameter<T>::type;
+
+}
+}
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Compare_Exchange.hpp b/lib/kokkos/core/src/desul/atomics/Compare_Exchange.hpp
new file mode 100644
index 0000000000..7b8289d75b
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Compare_Exchange.hpp
@@ -0,0 +1,35 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_HPP_
+
+#include "desul/atomics/Macros.hpp"
+
+#ifdef DESUL_HAVE_GCC_ATOMICS
+#include "desul/atomics/Compare_Exchange_GCC.hpp"
+#endif
+#ifdef DESUL_HAVE_MSVC_ATOMICS
+#include "desul/atomics/Compare_Exchange_MSVC.hpp"
+#endif
+#ifdef DESUL_HAVE_SERIAL_ATOMICS
+#include "desul/atomics/Compare_Exchange_Serial.hpp"
+#endif
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+#include "desul/atomics/Compare_Exchange_CUDA.hpp"
+#endif
+#ifdef DESUL_HAVE_HIP_ATOMICS
+#include "desul/atomics/Compare_Exchange_HIP.hpp"
+#endif
+#ifdef DESUL_HAVE_OPENMP_ATOMICS
+#include "desul/atomics/Compare_Exchange_OpenMP.hpp"
+#endif
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+#include "desul/atomics/Compare_Exchange_SYCL.hpp"
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp
new file mode 100644
index 0000000000..aab0d943eb
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp
@@ -0,0 +1,267 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_CUDA_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_CUDA_HPP_
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Lock_Array_Cuda.hpp"
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+namespace desul {
+// Only include if compiling device code, or the CUDA compiler is not NVCC (i.e. Clang)
+// atomic_thread_fence implementation
+#if defined(__CUDA_ARCH__) || !defined(__NVCC__)
+__device__ inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  __threadfence_block();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  __threadfence_block();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  __threadfence_block();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  __threadfence_block();
+}
+#if (__CUDA_ARCH__>=600) || !defined(__NVCC__)
+__device__ inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeNode) {
+  __threadfence_system();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeNode) {
+  __threadfence_system();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeNode) {
+  __threadfence_system();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeNode) {
+  __threadfence_system();
+}
+#endif
+#endif
+}
+
+// Compare Exchange for PRE Volta, not supported with CLANG as CUDA compiler, since we do NOT have a way
+// of having the code included for clang only when the CC is smaller than 700
+// But on Clang the device side symbol list must be independent of __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) || \
+(!defined(__NVCC__) && (defined(KOKKOS_ENABLE_KEPLER) || defined(KOKKOS_ENABLE_MAXWELL) || defined(KOKKOS_ENABLE_PASCAL)))
+namespace desul {
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicCAS(reinterpret_cast<unsigned int*>(dest),
+                                      reinterpret_cast<unsigned int&>(compare),
+                                      reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long  is 64-bit");
+  unsigned long long int return_val =
+      atomicCAS(reinterpret_cast<unsigned long long int*>(dest),
+                reinterpret_cast<unsigned long long int&>(compare),
+                reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicExch(reinterpret_cast<unsigned int*>(dest),
+                                       reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long  is 64-bit");
+  unsigned long long int return_val =
+      atomicExch(reinterpret_cast<unsigned long long int*>(dest),
+                 reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+}  // namespace desul
+#endif
+
+// Including CUDA ptx based exchange atomics
+// When building with clang we need to include the device functions always
+// since clang must see a consistent overload set in both device and host compilation
+// but that means we need to know on the host what to make visible, i.e. we need
+// a host side compile knowledge of architecture.
+// We simply can say DESUL proper doesn't support clang CUDA build pre Volta,
+// Kokkos has that knowledge and so I use it here, allowing in Kokkos to use
+// clang with pre Volta as CUDA compiler
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__>=700)) || \
+     (!defined(__NVCC__) && !defined(KOKKOS_ARCH_KEPLER) && !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL))
+#include <desul/atomics/cuda/CUDA_asm_exchange.hpp>
+#endif
+
+// SeqCst is not directly supported by PTX, need the additional fences:
+
+#if defined(__CUDA_ARCH__) || !defined(__NVCC__)
+namespace desul {
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest,compare,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest,compare,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+}
+#endif
+
+#if defined(__CUDA_ARCH__) || !defined(__NVCC__)
+namespace desul {
+template <typename T, class MemoryOrder, class MemoryScope>
+__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
+  // This is a way to avoid dead lock in a warp or wave front
+  T return_val;
+  int done = 0;
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        if(std::is_same<MemoryOrder,MemoryOrderSeqCst>::value) atomic_thread_fence(MemoryOrderRelease(),scope);
+        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        return_val = *dest;
+        if(return_val == compare) {
+          *dest = value;
+          atomic_thread_fence(MemoryOrderRelease(),scope);
+        }
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrder, MemoryScope scope) {
+  // This is a way to avoid dead lock in a warp or wave front
+  T return_val;
+  int done = 0;
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        if(std::is_same<MemoryOrder,MemoryOrderSeqCst>::value) atomic_thread_fence(MemoryOrderRelease(),scope);
+        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        return_val = *dest;
+        *dest = value;
+        atomic_thread_fence(MemoryOrderRelease(),scope);
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+}
+}
+#endif
+
+
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp
new file mode 100644
index 0000000000..418bea0b8b
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp
@@ -0,0 +1,91 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_GCC_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_GCC_HPP_
+#include "desul/atomics/Common.hpp"
+
+#ifdef DESUL_HAVE_GCC_ATOMICS
+#if !defined(DESUL_HAVE_16BYTE_COMPARE_AND_SWAP) && !defined(__CUDACC__)
+// This doesn't work in WSL??
+//#define DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+#endif
+namespace desul {
+
+namespace Impl {
+template<class T>
+struct atomic_exchange_available_gcc {
+  constexpr static bool value =
+#ifndef DESUL_HAVE_LIBATOMIC
+    ((sizeof(T)==4 && alignof(T)==4) ||
+#ifdef DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+     (sizeof(T)==16 && alignof(T)==16) ||
+#endif
+     (sizeof(T)==8 && alignof(T)==8)) &&
+#endif
+    std::is_trivially_copyable<T>::value;
+};
+} //namespace Impl
+
+#if defined(__clang__) && (__clang_major__>=7) && !defined(__APPLE__)
+// Disable warning for large atomics on clang 7 and up (checked with godbolt)
+// error: large atomic operation may incur significant performance penalty [-Werror,-Watomic-alignment]
+// https://godbolt.org/z/G7YhqhbG6
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Watomic-alignment"
+#endif
+template<class MemoryOrder, class MemoryScope>
+void atomic_thread_fence(MemoryOrder, MemoryScope) {
+  __atomic_thread_fence(GCCMemoryOrder<MemoryOrder>::value);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_exchange(
+    T* dest, T value, MemoryOrder, MemoryScope) {
+  T return_val;
+  __atomic_exchange(
+     dest, &value, &return_val, GCCMemoryOrder<MemoryOrder>::value);
+  return return_val;
+}
+
+// Failure mode for atomic_compare_exchange_n cannot be RELEASE nor ACQREL so
+// Those two get handled separatly.
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrder, MemoryScope) {
+  (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, GCCMemoryOrder<MemoryOrder>::value, GCCMemoryOrder<MemoryOrder>::value);
+  return compare;
+}
+
+template <typename T, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+  (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+  return compare;
+}
+
+template <typename T, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
+  (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+  return compare;
+}
+
+#if defined(__clang__) && (__clang_major__>=7) && !defined(__APPLE__)
+#pragma GCC diagnostic pop
+#endif
+}  // namespace desul
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp
new file mode 100644
index 0000000000..d6bf04a7e6
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp
@@ -0,0 +1,253 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Lock_Array_HIP.hpp"
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+namespace desul {
+#if defined(__HIP_DEVICE_COMPILE__)
+inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4,
+                "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicCAS(reinterpret_cast<unsigned int*>(dest),
+                                      reinterpret_cast<unsigned int&>(compare),
+                                      reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8,
+                "this function assumes an unsigned long long  is 64-bit");
+  unsigned long long int return_val =
+      atomicCAS(reinterpret_cast<unsigned long long int*>(dest),
+                reinterpret_cast<unsigned long long int&>(compare),
+                reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4,
+                "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicExch(reinterpret_cast<unsigned int*>(dest),
+                                       reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8,
+                "this function assumes an unsigned long long  is 64-bit");
+  unsigned long long int return_val =
+      atomicExch(reinterpret_cast<unsigned long long int*>(dest),
+                 reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(
+    T* const dest, T /*compare*/, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
+          atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+            T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+              atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+                return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION __device__
+    typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+    atomic_compare_exchange(
+        T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
+  // This is a way to avoid dead lock in a warp or wave front
+  T return_val;
+  int done = 0;
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        if (return_val == compare) {
+          *dest = value;
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        }
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION __device__
+    typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+    atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) {
+  // This is a way to avoid dead lock in a warp or wave front
+  T return_val;
+  int done = 0;
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        *dest = value;
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+}
+#endif
+}  // namespace desul
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp
new file mode 100644
index 0000000000..c96cb03171
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp
@@ -0,0 +1,201 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_MSVC_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_MSVC_HPP_
+#include "desul/atomics/Common.hpp"
+#include <type_traits>
+#ifdef DESUL_HAVE_MSVC_ATOMICS
+
+#ifndef DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+#define DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+#endif
+
+namespace desul {
+
+template<class T, class MemoryOrder, class MemoryScope>
+T atomic_exchange(T* const, T val, MemoryOrder, MemoryScope) { return val;}
+
+
+template<class MemoryOrder, class MemoryScope>
+void atomic_thread_fence(MemoryOrder, MemoryScope) {
+  std::atomic_thread_fence(CXXMemoryOrder<MemoryOrder>::value);
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  char return_val =
+      _InterlockedExchange8((char*)dest, *((char*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  short return_val =
+      _InterlockedExchange16((short*)dest, *((short*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  long return_val =
+      _InterlockedExchange((long*)dest, *((long*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  __int64 return_val = _InterlockedExchange64(
+      (__int64*)dest, *((__int64*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  char return_val =
+      _InterlockedExchange8((char*)dest, *((char*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  short return_val =
+      _InterlockedExchange16((short*)dest, *((short*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  long return_val =
+      _InterlockedExchange((long*)dest, *((long*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  __int64 return_val = _InterlockedExchange64(
+      (__int64*)dest, *((__int64*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  char return_val =
+      _InterlockedCompareExchange8((char*)dest, *((char*)&val), *((char*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  short return_val =
+      _InterlockedCompareExchange16((short*)dest, *((short*)&val), *((short*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  long return_val =
+      _InterlockedCompareExchange((long*)dest, *((long*)&val), *((long*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  __int64 return_val = _InterlockedCompareExchange64(
+      (__int64*)dest, *((__int64*)&val), *((__int64*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 16, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  Dummy16ByteValue* val16 = reinterpret_cast<Dummy16ByteValue*>(&val);
+  (void)_InterlockedCompareExchange128(reinterpret_cast<__int64*>(dest),
+                                       val16->value2,
+                                       val16->value1,
+                                       (reinterpret_cast<__int64*>(&compare)));
+  return compare;
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  char return_val =
+      _InterlockedCompareExchange8((char*)dest, *((char*)&val), *((char*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  short return_val =
+      _InterlockedCompareExchange16((short*)dest, *((short*)&val), *((short*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  long return_val =
+      _InterlockedCompareExchange((long*)dest, *((long*)&val), *((long*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  __int64 return_val = _InterlockedCompareExchange64(
+      (__int64*)dest, *((__int64*)&val), *((__int64*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 16, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  Dummy16ByteValue* val16 = reinterpret_cast<Dummy16ByteValue*>(&val);
+  (void)_InterlockedCompareExchange128(reinterpret_cast<__int64*>(dest),
+                                       val16->value2,
+                                       val16->value1,
+                                       (reinterpret_cast<__int64*>(&compare)));
+  return compare;
+}
+
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 1 && sizeof(T) != 4 && sizeof(T) != 8 && sizeof(T) != 16), T>::type atomic_compare_exchange(
+     T* const dest, T compare, T val, MemoryOrder, MemoryScope scope) {
+  while (!Impl::lock_address((void*)dest, scope)) {}
+  if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = *dest;
+  if(return_val == compare) {
+    *dest = val;
+    atomic_thread_fence(MemoryOrderRelease(),scope);
+  }
+
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+}
+
+}  // namespace desul
+
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp
new file mode 100644
index 0000000000..a1d1c91249
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp
@@ -0,0 +1,145 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENMP_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENMP_HPP_
+#include "desul/atomics/Common.hpp"
+#include <cstdio>
+#include <omp.h>
+
+namespace desul
+{
+namespace Impl
+{
+static constexpr bool omp_on_host() { return true; }
+
+#pragma omp begin declare variant match(device = {kind(host)})
+static constexpr bool omp_on_host() { return true; }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(nohost)})
+static constexpr bool omp_on_host() { return false; }
+#pragma omp end declare variant
+} // namespace Impl
+} // namespace desul
+
+#ifdef DESUL_HAVE_OPENMP_ATOMICS
+namespace desul {
+
+#if _OPENMP > 201800
+// atomic_thread_fence for Core Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  // There is no seq_cst flush in OpenMP, isn't it the same anyway for fence?
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  #pragma omp flush release
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  #pragma omp flush acquire
+}
+// atomic_thread_fence for Device Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  // There is no seq_cst flush in OpenMP, isn't it the same anyway for fence?
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  #pragma omp flush release
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  #pragma omp flush acquire
+}
+#else
+// atomic_thread_fence for Core Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  #pragma omp flush
+}
+// atomic_thread_fence for Device Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  #pragma omp flush
+}
+#endif
+
+template <typename T, class MemoryOrder, class MemoryScope>
+T atomic_exchange(
+    T* dest, T value, MemoryOrder, MemoryScope) {
+  T return_val;
+  if(!std::is_same<MemoryOrder,MemoryOrderRelaxed>::value)
+    atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T& x = *dest;
+  #pragma omp atomic capture
+  { return_val = x; x = value; }
+  if(!std::is_same<MemoryOrder,MemoryOrderRelaxed>::value)
+    atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+
+// OpenMP doesn't have compare exchange, so we use build-ins and rely on testing that this works
+// Note that means we test this in OpenMPTarget offload regions!
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<Impl::atomic_always_lock_free(sizeof(T)),T> atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrder, MemoryScope) {
+  using cas_t = typename Impl::atomic_compare_exchange_type<sizeof(T)>::type;
+  cas_t retval = __sync_val_compare_and_swap(
+     reinterpret_cast<volatile cas_t*>(dest), 
+     reinterpret_cast<cas_t&>(compare), 
+     reinterpret_cast<cas_t&>(value));
+  return reinterpret_cast<T&>(retval);
+}
+// Make 16 byte cas work on host at least (is_initial_device check, note this requires C++17)
+#if __cplusplus>=201703L
+
+#if defined(__clang__) && (__clang_major__>=7)
+// Disable warning for large atomics on clang 7 and up (checked with godbolt)
+// error: large atomic operation may incur significant performance penalty [-Werror,-Watomic-alignment]
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Watomic-alignment"
+#endif
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<!Impl::atomic_always_lock_free(sizeof(T)) && (sizeof(T)==16),T> atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrder, MemoryScope) {
+  if constexpr (desul::Impl::omp_on_host()) {
+    (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, GCCMemoryOrder<MemoryOrder>::value, GCCMemoryOrder<MemoryOrder>::value);
+    return compare;
+  } else {
+    return value;
+  }
+}
+#if defined(__clang__) && (__clang_major__>=7)
+#pragma GCC diagnostic pop
+#endif
+#endif
+
+}  // namespace desul
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp
new file mode 100644
index 0000000000..a8fd2ebbe2
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp
@@ -0,0 +1,102 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_SYCL_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_SYCL_HPP_
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/SYCLConversions.hpp"
+#include <CL/sycl.hpp>
+
+
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+
+namespace desul {
+
+template<class MemoryOrder, class MemoryScope>
+inline void atomic_thread_fence(MemoryOrder, MemoryScope) {
+  DESUL_SYCL_NAMESPACE::atomic_fence(DesulToSYCLMemoryOrder<MemoryOrder>::value,
+                                     DesulToSYCLMemoryScope<MemoryScope>::value);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned int, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value, 
+    DesulToSYCLMemoryScope<MemoryScope>::value, 
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*reinterpret_cast<unsigned int*>(dest));
+  dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned int*>(&compare), 
+                                   *reinterpret_cast<unsigned int*>(&value));
+  return compare;
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long  is 64-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned long long int, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScope>::value, 
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
+  dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned long long int*>(&compare),
+                                   *reinterpret_cast<unsigned long long int*>(&value));
+  return compare;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned int, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value, 
+    DesulToSYCLMemoryScope<MemoryScope>::value,  
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*reinterpret_cast<unsigned int*>(dest));
+  unsigned int return_val = dest_ref.exchange(*reinterpret_cast<unsigned int*>(&value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long  is 64-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned long long int,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScope>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
+  unsigned long long int return_val =
+      dest_ref.exchange(reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_compare_exchange(
+    T* const /*dest*/, T compare, T /*value*/, MemoryOrder, MemoryScope) {
+  // FIXME_SYCL not implemented
+  assert(false);
+  return compare;  
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_exchange(
+    T* const /*dest*/, T value, MemoryOrder, MemoryScope) {
+  // FIXME_SYCL not implemented
+  assert(false);
+  return value;
+}
+
+}
+
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp
new file mode 100644
index 0000000000..be7b46d5fa
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp
@@ -0,0 +1,45 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_SERIAL_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_SERIAL_HPP_
+
+#ifdef DESUL_HAVE_SERIAL_ATOMICS
+namespace desul {
+template<class MemoryScope>
+void atomic_thread_fence(MemoryOrderAcquire, MemoryScope) {
+}
+
+template<class MemoryScope>
+void atomic_thread_fence(MemoryOrderRelease, MemoryScope) {
+}
+
+template <typename T, class MemoryScope>
+T atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  T old = *dest;
+  if (old == compare) {
+    *dest = value;
+  } else {
+    old = compare;
+  }
+  return compare;
+}
+template <typename T, class MemoryScope>
+T atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  T old = *dest;
+  if (old == compare) {
+    *dest = value;
+  } else {
+    old = compare;
+  }
+  return compare;
+}
+}  // namespace desul
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/GCC.hpp b/lib/kokkos/core/src/desul/atomics/GCC.hpp
new file mode 100644
index 0000000000..cd0c2bea11
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/GCC.hpp
@@ -0,0 +1,131 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_GCC_HPP_
+#define DESUL_ATOMICS_GCC_HPP_
+
+#ifdef DESUL_HAVE_GCC_ATOMICS
+
+#include<type_traits>
+/*
+Built - in Function : type __atomic_add_fetch(type * ptr, type val, int memorder)
+Built - in Function : type __atomic_sub_fetch(type * ptr, type val, int memorder)
+Built - in Function : type __atomic_and_fetch(type * ptr, type val, int memorder)
+Built - in Function : type __atomic_xor_fetch(type * ptr, type val, int memorder)
+Built - in Function : type __atomic_or_fetch(type * ptr, type val, int memorder)
+Built - in Function : type __atomic_nand_fetch(type * ptr, type val, int memorder)
+*/
+
+#define DESUL_GCC_INTEGRAL_OP_ATOMICS(MEMORY_ORDER, MEMORY_SCOPE)                 \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_add(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_add(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_sub(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_sub(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_and(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_and(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_or(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_or(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);   \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_xor(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_xor(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_nand( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_nand(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value); \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_add_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_add_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_sub_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_sub_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_and_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_and_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_or_fetch(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_or_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);   \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_xor_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_xor_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_nand_fetch( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_nand_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value); \
+  }
+
+namespace desul {
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderRelaxed, MemoryScopeNode)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderRelaxed, MemoryScopeDevice)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderRelaxed, MemoryScopeCore)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeNode)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeDevice)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeCore)
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<!Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_exchange(T* const dest,
+                  Impl::dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = *dest;
+  *dest = val;
+  atomic_thread_fence(MemoryOrderRelease(),scope);
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<!Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(T* const dest,
+                  Impl::dont_deduce_this_parameter_t<const T> compare,
+                  Impl::dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = *dest;
+  if(return_val == compare) {
+    *dest = val;
+    atomic_thread_fence(MemoryOrderRelease(),scope);
+  }
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+}
+}  // namespace desul
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Generic.hpp b/lib/kokkos/core/src/desul/atomics/Generic.hpp
new file mode 100644
index 0000000000..9d5e87ece2
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Generic.hpp
@@ -0,0 +1,690 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_GENERIC_HPP_
+#define DESUL_ATOMICS_GENERIC_HPP_
+
+#include <type_traits>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Compare_Exchange.hpp"
+#include "desul/atomics/Lock_Array.hpp"
+#include "desul/atomics/Macros.hpp"
+// Combination operands to be used in an Compare and Exchange based atomic
+// operation
+namespace desul {
+namespace Impl {
+
+template <class Scalar1, class Scalar2>
+struct MaxOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 > val2 ? val1 : val2);
+  }
+  DESUL_FORCEINLINE_FUNCTION
+  static constexpr bool check_early_exit(Scalar1 const& val1, Scalar2 const& val2) {
+    return val1 > val2;
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct MinOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 < val2 ? val1 : val2);
+  }
+  DESUL_FORCEINLINE_FUNCTION
+  static constexpr bool check_early_exit(Scalar1 const& val1, Scalar2 const& val2) {
+    return val1 < val2;
+  }
+};
+
+// This exit early optimization causes weird compiler errors with MSVC 2019
+#ifndef DESUL_HAVE_MSVC_ATOMICS
+template <typename Op, typename Scalar1, typename Scalar2, typename = bool>
+struct may_exit_early : std::false_type {};
+
+template <typename Op, typename Scalar1, typename Scalar2>
+struct may_exit_early<Op,
+                      Scalar1,
+                      Scalar2,
+                      decltype(Op::check_early_exit(std::declval<Scalar1 const&>(),
+                                                    std::declval<Scalar2 const&>()))>
+    : std::true_type {};
+
+template <typename Op, typename Scalar1, typename Scalar2>
+constexpr DESUL_FUNCTION typename std::enable_if<may_exit_early<Op, Scalar1, Scalar2>::value, bool>::type
+check_early_exit(Op const&, Scalar1 const& val1, Scalar2 const& val2) {
+  return Op::check_early_exit(val1, val2);
+}
+
+template <typename Op, typename Scalar1, typename Scalar2>
+constexpr DESUL_FUNCTION typename std::enable_if<!may_exit_early<Op, Scalar1, Scalar2>::value, bool>::type
+check_early_exit(Op const&, Scalar1 const&, Scalar2 const&) {
+  return false;
+}
+#endif
+
+template <class Scalar1, class Scalar2>
+struct AddOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 + val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct SubOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 - val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct MulOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 * val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct DivOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 / val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct ModOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 % val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct AndOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 & val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct OrOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 | val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct XorOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 ^ val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct NandOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return ~(val1 & val2);
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct LShiftOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1 << val2;
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct RShiftOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1 >> val2;
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct StoreOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1&, const Scalar2& val2) { return val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct LoadOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2&) { return val1; }
+};
+
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_oper(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder order,
+                  MemoryScope scope) {
+  using cas_t = typename atomic_compare_exchange_type<sizeof(T)>::type;
+  cas_t oldval = reinterpret_cast<cas_t&>(*dest);
+  cas_t assume = oldval;
+
+  do {
+#ifndef DESUL_HAVE_MSVC_ATOMICS
+    if (Impl::check_early_exit(op, reinterpret_cast<T&>(oldval), val)) return reinterpret_cast<T&>(oldval);
+#endif
+    assume = oldval;
+    T newval = op.apply(reinterpret_cast<T&>(assume), val);
+    oldval = desul::atomic_compare_exchange(
+        reinterpret_cast<cas_t*>(dest), assume, reinterpret_cast<cas_t&>(newval), order, scope);
+  } while (assume != oldval);
+
+  return reinterpret_cast<T&>(oldval);
+}
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_oper_fetch(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder order,
+                  MemoryScope scope) {
+  using cas_t = typename atomic_compare_exchange_type<sizeof(T)>::type;
+  cas_t oldval = reinterpret_cast<cas_t&>(*dest);
+  T newval = val;
+  cas_t assume = oldval;
+  do {
+#ifndef DESUL_HAVE_MSVC_ATOMICS
+    if (Impl::check_early_exit(op, reinterpret_cast<T&>(oldval), val)) return reinterpret_cast<T&>(oldval);
+#endif
+    assume = oldval;
+    newval = op.apply(reinterpret_cast<T&>(assume), val);
+    oldval = desul::atomic_compare_exchange(
+        reinterpret_cast<cas_t*>(dest), assume, reinterpret_cast<cas_t&>(newval), order, scope);
+  } while (assume != oldval);
+
+  return newval;
+}
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires !atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_oper(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+#if defined(DESUL_HAVE_FORWARD_PROGRESS)
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = *dest;
+  *dest = op.apply(return_val, val);
+  atomic_thread_fence(MemoryOrderRelease(),scope);
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+#elif defined(DESUL_HAVE_GPU_LIKE_PROGRESS)
+  // This is a way to avoid dead lock in a warp or wave front
+  T return_val;
+  int done = 0;
+#ifdef __HIPCC__
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        *dest = op.apply(return_val, val);
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+// FIXME_SYCL not implemented
+#elif defined(__SYCL_DEVICE_ONLY__)
+  (void) op;
+  (void) dest;
+  (void) scope;
+  (void) return_val;
+  (void) done;
+
+  assert(false);
+  return val;
+#else
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        return_val = *dest;
+        *dest = op.apply(return_val, val);
+        atomic_thread_fence(MemoryOrderRelease(),scope);
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+#endif
+#else
+  static_assert(false, "Unimplemented lock based attomic\n");
+  return val;
+#endif
+}
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires !atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_oper_fetch(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+#if defined(DESUL_HAVE_FORWARD_PROGRESS)
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = op.apply(*dest, val);
+  *dest = return_val;
+  atomic_thread_fence(MemoryOrderRelease(),scope);
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+#elif defined(DESUL_HAVE_GPU_LIKE_PROGRESS)
+  // This is a way to avoid dead lock in a warp or wave front
+  T return_val;
+  int done = 0;
+#ifdef __HIPCC__
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = op.apply(*dest, val);
+        *dest = return_val;
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+  // FIXME_SYCL not implemented
+#elif defined(__SYCL_DEVICE_ONLY__)
+  (void) op;
+  (void) dest;
+  (void) scope;
+  (void) done;
+
+  assert(false);
+  return val;
+#else
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        return_val = op.apply(*dest, val);
+        *dest = return_val;
+        atomic_thread_fence(MemoryOrderRelease(),scope);
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+#endif
+#else
+  static_assert(false, "Unimplemented lock based atomic\n");
+  return val;
+#endif
+}
+
+}  // namespace Impl
+}  // namespace desul
+
+namespace desul {
+
+// Fetch_Oper atomics: return value before operation
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::AddOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::SubOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::MaxOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::MinOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::MulOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::DivOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::ModOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::AndOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::OrOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::XorOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::NandOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::LShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::RShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+// Oper Fetch atomics: return value after operation
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::AddOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::SubOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::MaxOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::MinOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::MulOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::DivOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::ModOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::AndOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::OrOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::XorOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::NandOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::LShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::RShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+// Other atomics
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_load(const T* const dest,
+                                    MemoryOrder order,
+                                    MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::LoadOper<T, const T>(), const_cast<T*>(dest), T(), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_store(T* const dest,
+                                        const T val,
+                                        MemoryOrder order,
+                                        MemoryScope scope) {
+  (void)Impl::atomic_fetch_oper(Impl::StoreOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_add(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_add(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_sub(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_sub(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_mul(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_mul(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_div(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_div(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_min(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_min(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_max(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_max(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_inc_fetch(T* const dest, MemoryOrder order, MemoryScope scope) {
+  return atomic_add_fetch(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_dec_fetch(T* const dest, MemoryOrder order, MemoryScope scope) {
+  return atomic_sub_fetch(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_fetch_add(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_fetch_sub(dest, T(1), order, scope);
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_inc(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_add(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_dec(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_sub(dest, T(1), order, scope);
+}
+
+// FIXME
+template <typename T,
+          class SuccessMemoryOrder,
+          class FailureMemoryOrder,
+          class MemoryScope>
+DESUL_INLINE_FUNCTION bool atomic_compare_exchange_strong(
+    T* const dest,
+    T& expected,
+    T desired,
+    SuccessMemoryOrder success,
+    FailureMemoryOrder /*failure*/,
+    MemoryScope scope) {
+  T const old = atomic_compare_exchange(dest, expected, desired, success, scope);
+  if (old != expected) {
+    expected = old;
+    return false;
+  } else {
+    return true;
+  }
+}
+
+template <typename T,
+          class SuccessMemoryOrder,
+          class FailureMemoryOrder,
+          class MemoryScope>
+DESUL_INLINE_FUNCTION bool atomic_compare_exchange_weak(T* const dest,
+                                                        T& expected,
+                                                        T desired,
+                                                        SuccessMemoryOrder success,
+                                                        FailureMemoryOrder failure,
+                                                        MemoryScope scope) {
+  return atomic_compare_exchange_strong(
+      dest, expected, desired, success, failure, scope);
+}
+
+}  // namespace desul
+
+#include <desul/atomics/SYCL.hpp>
+#include <desul/atomics/CUDA.hpp>
+#include <desul/atomics/GCC.hpp>
+#include <desul/atomics/HIP.hpp>
+#include <desul/atomics/OpenMP.hpp>
+#pragma GCC diagnostic pop
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/HIP.hpp b/lib/kokkos/core/src/desul/atomics/HIP.hpp
new file mode 100644
index 0000000000..16c1f510b7
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/HIP.hpp
@@ -0,0 +1,338 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_HIP_HPP_
+#define DESUL_ATOMICS_HIP_HPP_
+
+#ifdef __HIP_DEVICE_COMPILE__
+namespace desul {
+namespace Impl {
+template <typename T>
+struct is_hip_atomic_integer_type {
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long int>::value;
+};
+
+template <typename T>
+struct is_hip_atomic_add_type {
+  static constexpr bool value = is_hip_atomic_integer_type<T>::value ||
+                                std::is_same<T, double>::value ||
+                                std::is_same<T, float>::value;
+};
+
+template <typename T>
+struct is_hip_atomic_sub_type {
+  static constexpr bool value =
+      std::is_same<T, int>::value || std::is_same<T, unsigned int>::value;
+};
+}  // namespace Impl
+
+// Atomic Add
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_add_type<T>::value, T>::type
+    atomic_fetch_add(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAdd(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_add_type<T>::value, T>::type
+    atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAdd(dest, val);
+  __threadfence();
+
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_add_type<T>::value, T>::type
+    atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_add(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Sub
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_sub_type<T>::value, T>::type
+    atomic_fetch_sub(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicSub(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_sub_type<T>::value, T>::type
+    atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicSub(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_sub_type<T>::value, T>::type
+    atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_sub(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Inc
+__device__ inline unsigned int atomic_fetch_inc(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrderRelaxed,
+                                                MemoryScopeDevice) {
+  return atomicInc(dest, val);
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_inc(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicInc(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_inc(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeCore) {
+  return atomic_fetch_inc(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Dec
+__device__ inline unsigned int atomic_fetch_dec(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrderRelaxed,
+                                                MemoryScopeDevice) {
+  return atomicDec(dest, val);
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_dec(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicDec(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_dec(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeCore) {
+  return atomic_fetch_dec(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Max
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_max(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMax(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMax(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_max(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Min
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_min(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMin(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMin(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_min(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic And
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_and(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAnd(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAnd(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_and(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic XOR
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_xor(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicXor(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicXor(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_xor(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic OR
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_or(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicOr(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicOr(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_or(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+}
+
+#define DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MEMORY_ORDER, MEMORY_SCOPE)                 \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_add_type<T>::value, T>::type atomic_fetch_add(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::AddOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_sub_type<T>::value, T>::type atomic_fetch_sub(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::SubOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_and(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::AndOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_or(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::OrOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_xor(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::XorOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_nand( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::NandOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_add_type<T>::value, T>::type atomic_add_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::AddOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_sub_type<T>::value, T>::type atomic_sub_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::SubOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_and_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::AndOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_or_fetch(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::OrOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_xor_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::XorOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_nand_fetch( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::NandOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }
+namespace desul {
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderRelaxed, MemoryScopeNode)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderRelaxed, MemoryScopeDevice)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderRelaxed, MemoryScopeCore)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderSeqCst, MemoryScopeNode)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderSeqCst, MemoryScopeDevice)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderSeqCst, MemoryScopeCore)
+}  // namespace desul
+
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Lock_Array.hpp b/lib/kokkos/core/src/desul/atomics/Lock_Array.hpp
new file mode 100644
index 0000000000..8fd0e8bbd7
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Lock_Array.hpp
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_LOCK_ARRAY_HPP_
+#define DESUL_ATOMICS_LOCK_ARRAY_HPP_
+
+#include "desul/atomics/Compare_Exchange.hpp"
+#include "desul/atomics/Lock_Array_Cuda.hpp"
+#include "desul/atomics/Lock_Array_HIP.hpp"
+#include "desul/atomics/Macros.hpp"
+
+namespace desul {
+namespace Impl {
+struct host_locks__ {
+  static constexpr uint32_t HOST_SPACE_ATOMIC_MASK = 0xFFFF;
+  static constexpr uint32_t HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
+  template <typename is_always_void = void>
+  static int32_t* get_host_locks_() {
+    static int32_t HOST_SPACE_ATOMIC_LOCKS_DEVICE[HOST_SPACE_ATOMIC_MASK + 1] = {0};
+    return HOST_SPACE_ATOMIC_LOCKS_DEVICE;
+  }
+  static inline int32_t* get_host_lock_(void* ptr) {
+    return &get_host_locks_()[((uint64_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^
+                              HOST_SPACE_ATOMIC_XOR_MASK];
+  }
+};
+
+inline void init_lock_arrays() {
+  static bool is_initialized = false;
+  if (!is_initialized) {
+    host_locks__::get_host_locks_();
+    is_initialized = true;
+  }
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+  init_lock_arrays_cuda();
+#endif
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+  init_lock_arrays_hip();
+#endif
+}
+
+inline void finalize_lock_arrays() {
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+  finalize_lock_arrays_cuda();
+#endif
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+  finalize_lock_arrays_hip();
+#endif
+}
+template <typename MemoryScope>
+inline bool lock_address(void* ptr, MemoryScope ms) {
+  return 0 == atomic_exchange(host_locks__::get_host_lock_(ptr),
+                                      int32_t(1),
+                                      MemoryOrderSeqCst(),
+                                      ms);
+}
+template <typename MemoryScope>
+void unlock_address(void* ptr, MemoryScope ms) {
+  (void)atomic_exchange(host_locks__::get_host_lock_(ptr),
+                                int32_t(0),
+                                MemoryOrderSeqCst(),
+                                ms);
+}
+}  // namespace Impl
+}  // namespace desul
+
+#endif  // DESUL_ATOMICS_LOCK_ARRAY_HPP_
diff --git a/lib/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp b/lib/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp
new file mode 100644
index 0000000000..de99185349
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp
@@ -0,0 +1,172 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_
+#define DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_
+
+#include "desul/atomics/Macros.hpp"
+#include "desul/atomics/Common.hpp"
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+
+#include <cstdint>
+
+namespace desul {
+namespace Impl {
+
+#ifdef __CUDA_ARCH__
+#define DESUL_IMPL_BALLOT_MASK(m, x) __ballot_sync(m, x)
+#define DESUL_IMPL_ACTIVEMASK __activemask()
+#else
+#define DESUL_IMPL_BALLOT_MASK(m, x) m==0?0:1
+#define DESUL_IMPL_ACTIVEMASK 0
+#endif
+
+/// \brief This global variable in Host space is the central definition
+///        of these arrays.
+extern int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h;
+extern int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE_h;
+
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+///        valid, initialized arrays.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
+///   snapshotted version while also linking against pure Desul
+template<typename /*AlwaysInt*/ = int>
+void init_lock_arrays_cuda();
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+///        all null pointers, and all array memory has been freed.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
+///   snappshotted version while also linking against pure Desul
+template<typename T = int>
+void finalize_lock_arrays_cuda();
+
+}  // namespace Impl
+}  // namespace desul
+
+#if defined(__CUDACC__)
+
+namespace desul {
+namespace Impl {
+
+/// \brief This global variable in CUDA space is what kernels use
+///        to get access to the lock arrays.
+///
+/// When relocatable device code is enabled, there can be one single
+/// instance of this global variable for the entire executable,
+/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
+/// here must then be extern.
+/// This one instance will be initialized by initialize_host_cuda_lock_arrays
+/// and need not be modified afterwards.
+///
+/// When relocatable device code is disabled, an instance of this variable
+/// will be created in every translation unit that sees this header file
+/// (we make this clear by marking it static, meaning no other translation
+///  unit can link to it).
+/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
+/// instances in other translation units, we must update this CUDA global
+/// variable based on the Host global variable prior to running any kernels
+/// that will use it.
+/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
+__device__
+#ifdef __CUDACC_RDC__
+    __constant__ extern
+#endif
+    int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE;
+
+__device__
+#ifdef __CUDACC_RDC__
+    __constant__ extern
+#endif
+    int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE;
+
+#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline bool lock_address_cuda(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE[offset], 1));
+}
+__device__ inline bool lock_address_cuda(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[offset], 1));
+}
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address.
+__device__ inline void unlock_address_cuda(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE[offset], 0);
+}
+__device__ inline void unlock_address_cuda(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[offset], 0);
+}
+
+}  // namespace Impl
+}  // namespace desul
+
+// Make lock_array_copied an explicit translation unit scope thingy
+namespace desul {
+namespace Impl {
+namespace {
+static int lock_array_copied = 0;
+inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
+}  // namespace
+}  // namespace Impl
+}  // namespace desul
+/* It is critical that this code be a macro, so that it will
+   capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE
+   putting this in an inline function will NOT do the right thing! */
+#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                       \
+  {                                                                        \
+    if (::desul::Impl::lock_array_copied == 0) {                           \
+      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE,    \
+                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
+                         sizeof(int32_t*));                                \
+      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE,    \
+                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \
+                         sizeof(int32_t*));                                \
+    }                                                                      \
+    ::desul::Impl::lock_array_copied = 1;                                  \
+  }
+
+
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* defined( KOKKOS_ENABLE_CUDA ) */
+
+#if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
+#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#else
+#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
+#endif
+
+#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */
diff --git a/lib/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp b/lib/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp
new file mode 100644
index 0000000000..9e6f5e5980
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp
@@ -0,0 +1,170 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATORMICS_LOCK_ARRAY_HIP_HPP_
+#define DESUL_ATORMICS_LOCK_ARRAY_HIP_HPP_
+
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Macros.hpp"
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+
+#include <hip/hip_runtime.h>
+
+#include <cstdint>
+
+namespace desul {
+namespace Impl {
+
+#ifdef __HIP_DEVICE_COMPILE__
+#define DESUL_IMPL_BALLOT_MASK(x) __ballot(x)
+#endif
+
+/**
+ * \brief This global variable in Host space is the central definition of these
+ * arrays.
+ */
+extern int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE_h;
+extern int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE_h;
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+///        valid, initialized arrays.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
+///   snappshotted version while also linking against pure Desul
+template<typename T = int>
+void init_lock_arrays_hip();
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+///        all null pointers, and all array memory has been freed.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
+///   snappshotted version while also linking against pure Desul
+template<typename T = int>
+void finalize_lock_arrays_hip();
+}  // namespace Impl
+}  // namespace desul
+
+#ifdef __HIPCC__
+namespace desul {
+namespace Impl {
+
+/**
+ * \brief This global variable in HIP space is what kernels use to get access
+ * to the lock arrays.
+ *
+ * When relocatable device code is enabled, there can be one single instance of
+ * this global variable for the entire executable, whose definition will be in
+ * Kokkos_HIP_Locks.cpp (and whose declaration here must then be extern.  This
+ * one instance will be initialized by initialize_host_hip_lock_arrays and need
+ * not be modified afterwards.
+ *
+ * When relocatable device code is disabled, an instance of this variable will
+ * be created in every translation unit that sees this header file (we make this
+ * clear by marking it static, meaning no other translation unit can link to
+ * it). Since the Kokkos_HIP_Locks.cpp translation unit cannot initialize the
+ * instances in other translation units, we must update this CUDA global
+ * variable based on the Host global variable prior to running any kernels that
+ * will use it.  That is the purpose of the
+ * KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE macro.
+ */
+__device__
+#ifdef DESUL_HIP_RDC
+    __constant__ extern
+#endif
+    int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE;
+
+__device__
+#ifdef DESUL_HIP_RDC
+    __constant__ extern
+#endif
+    int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE;
+
+#define HIP_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline bool lock_address_hip(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE[offset], 1));
+}
+
+__device__ inline bool lock_address_hip(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[offset], 1));
+}
+
+/**
+ * \brief Release lock for the address
+ *
+ * This function releases the lock for the hash value derived from the provided
+ * ptr. This function should only be called after previously successfully
+ * acquiring a lock with lock_address.
+ */
+__device__ inline void unlock_address_hip(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE[offset], 0);
+}
+
+__device__ inline void unlock_address_hip(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[offset], 0);
+}
+#endif
+}  // namespace Impl
+}  // namespace desul
+
+// Make lock_array_copied an explicit translation unit scope thing
+namespace desul {
+namespace Impl {
+namespace {
+static int lock_array_copied = 0;
+inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
+}  // namespace
+}  // namespace Impl
+}  // namespace desul
+
+/* It is critical that this code be a macro, so that it will
+   capture the right address for g_device_hip_lock_arrays!
+   putting this in an inline function will NOT do the right thing! */
+#define DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                               \
+  {                                                                               \
+    if (::desul::Impl::lock_array_copied == 0) {                                  \
+      (void) hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE), \
+                        &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,          \
+                        sizeof(int32_t*));                                        \
+      (void) hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE),   \
+                        &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE_h,            \
+                        sizeof(int32_t*));                                        \
+    }                                                                             \
+    ::desul::Impl::lock_array_copied = 1;                                         \
+  }
+
+#endif
+
+#if defined(DESUL_HIP_RDC) || (!defined(__HIPCC__))
+#define DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+#else
+#define DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
+  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()
+#endif
+
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/Macros.hpp b/lib/kokkos/core/src/desul/atomics/Macros.hpp
new file mode 100644
index 0000000000..db9962e03b
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/Macros.hpp
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_MACROS_HPP_
+#define DESUL_ATOMICS_MACROS_HPP_
+
+// Macros
+
+#if defined(__GNUC__) && \
+    (!defined(__CUDA_ARCH__) || !defined(__NVCC__)) && \
+    (!defined(__HIP_DEVICE_COMPILE) || !defined(__HIP_PLATFORM_HCC__)) && \
+    !defined(__SYCL_DEVICE_ONLY__) && \
+    !defined(DESUL_HAVE_OPENMP_ATOMICS) && \
+    !defined(DESUL_HAVE_SERIAL_ATOMICS)
+#define DESUL_HAVE_GCC_ATOMICS
+#endif
+
+#ifdef _MSC_VER
+#define DESUL_HAVE_MSVC_ATOMICS
+#endif
+
+#ifdef __CUDACC__
+#define DESUL_HAVE_CUDA_ATOMICS
+#endif
+
+#ifdef __HIPCC__
+#define DESUL_HAVE_HIP_ATOMICS
+#endif
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define DESUL_HAVE_SYCL_ATOMICS
+#ifdef __clang__
+#define DESUL_SYCL_NAMESPACE sycl::ONEAPI
+#else
+#define DESUL_SYCL_NAMESPACE sycl
+#endif
+#endif
+
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
+#define DESUL_HAVE_GPU_LIKE_PROGRESS
+#endif
+
+#if defined(DESUL_HAVE_CUDA_ATOMICS) || defined(DESUL_HAVE_HIP_ATOMICS)
+#define DESUL_FORCEINLINE_FUNCTION inline __host__ __device__
+#define DESUL_INLINE_FUNCTION inline __host__ __device__
+#define DESUL_FUNCTION __host__ __device__
+#else
+#define DESUL_FORCEINLINE_FUNCTION inline
+#define DESUL_INLINE_FUNCTION inline
+#define DESUL_FUNCTION
+#endif
+
+#if !defined(DESUL_HAVE_GPU_LIKE_PROGRESS)
+#define DESUL_HAVE_FORWARD_PROGRESS
+#endif
+
+#endif  // DESUL_ATOMICS_MACROS_HPP_
diff --git a/lib/kokkos/core/src/desul/atomics/OpenMP.hpp b/lib/kokkos/core/src/desul/atomics/OpenMP.hpp
new file mode 100644
index 0000000000..3fa22c36ac
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/OpenMP.hpp
@@ -0,0 +1,15 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_OPENMP_HPP_
+#define DESUL_ATOMICS_OPENMP_HPP_
+
+#ifdef DESUL_HAVE_OPENMP_ATOMICS
+
+#include<desul/atomics/openmp/OpenMP_40.hpp>
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/SYCL.hpp b/lib/kokkos/core/src/desul/atomics/SYCL.hpp
new file mode 100644
index 0000000000..44e2dc0ec4
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/SYCL.hpp
@@ -0,0 +1,143 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_SYCL_HPP_
+#define DESUL_ATOMICS_SYCL_HPP_
+
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+#include "desul/atomics/Common.hpp"
+
+namespace desul {
+namespace Impl {
+template<class T>
+struct is_sycl_atomic_type {
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+				std::is_same<T, long>::value ||
+				std::is_same<T, unsigned long>::value ||
+				std::is_same<T, long long>::value ||
+                                std::is_same<T, unsigned long long int>::value ||
+				std::is_same<T, float>::value ||
+				std::is_same<T, double>::value;
+};
+} // Impl
+
+// Atomic Add
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value, 
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,  
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*dest);
+  return dest_ref.fetch_add(val);
+}
+
+// Atomic Sub 
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_sub(val);
+}
+
+// Atomic Inc
+template<class MemoryOrder/*, class MemoryScope*/>
+inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrder memory_order, MemoryScopeDevice memory_scope) {
+  return atomic_fetch_add(dest, val, memory_order, memory_scope);
+}
+
+// Atomic Dec
+template<class MemoryOrder/*, class MemoryScope*/>
+inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrder memory_order, MemoryScopeDevice memory_scope) {
+  return atomic_fetch_sub(dest, val, memory_order, memory_scope);
+}
+
+// Atomic Max
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_max(val);
+}
+
+// Atomic Min
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_min(val);
+}
+
+// Atomic And
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_and(val);
+}
+
+// Atomic XOR
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_xor(val);
+}
+
+// Atomic OR
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_or(val);
+}
+
+} // desul
+#endif  // DESUL_HAVE_SYCL_ATOMICS
+#endif  // DESUL_ATOMICS_SYCL_HPP_
diff --git a/lib/kokkos/core/src/desul/atomics/SYCLConversions.hpp b/lib/kokkos/core/src/desul/atomics/SYCLConversions.hpp
new file mode 100644
index 0000000000..a66e5cf051
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/SYCLConversions.hpp
@@ -0,0 +1,58 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_SYCL_CONVERSIONS_HPP_
+#define DESUL_ATOMICS_SYCL_CONVERSIONS_HPP_
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+#include "desul/atomics/Common.hpp"
+#include <CL/sycl.hpp>
+
+namespace desul {
+
+template<class MemoryOrder>
+struct DesulToSYCLMemoryOrder;
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderSeqCst> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::seq_cst;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderAcquire> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::acquire;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderRelease> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::release;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderAcqRel> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::acq_rel;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderRelaxed> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::relaxed;
+};
+
+template<class MemoryScope>
+struct DesulToSYCLMemoryScope;
+template<>
+struct DesulToSYCLMemoryScope<MemoryScopeCore> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_scope value = DESUL_SYCL_NAMESPACE::memory_scope::work_group;
+};
+template<>
+struct DesulToSYCLMemoryScope<MemoryScopeDevice> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_scope value = DESUL_SYCL_NAMESPACE::memory_scope::device;
+};
+template<>
+struct DesulToSYCLMemoryScope<MemoryScopeSystem> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_scope value = DESUL_SYCL_NAMESPACE::memory_scope::system;
+};
+
+}
+
+#endif
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp b/lib/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp
new file mode 100644
index 0000000000..461d3e0928
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp
@@ -0,0 +1,18 @@
+#include<limits>
+namespace desul {
+#if defined(__CUDA_ARCH__)  || (defined(__clang__) && !defined(__NVCC__))
+// Choose the variant of atomics we are using later
+#if !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC) && \
+    !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE) && \
+    !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL) && \
+    !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL)
+#if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__==11) && (__CUDACC_VER_MINOR__>1))
+#define DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
+#else
+#define DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE
+#endif
+#endif
+#include<desul/atomics/cuda/cuda_cc7_asm.inc>
+
+#endif
+}
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp b/lib/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp
new file mode 100644
index 0000000000..0ab95e6a00
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp
@@ -0,0 +1,8 @@
+#include<limits>
+namespace desul {
+#if defined(__CUDA_ARCH__)  || (defined(__clang__) && !defined(__NVCC__))
+
+#include<desul/atomics/cuda/cuda_cc7_asm_exchange.inc>
+
+#endif
+}
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc
new file mode 100644
index 0000000000..2bc64a74b2
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc
@@ -0,0 +1,20 @@
+
+// Non returning atomic operation (ptx red instruction) only exists for relaxed and release memorder
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeDevice
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".gpu"
+#include "desul/atomics/cuda/cuda_cc7_asm_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeNode
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".sys"
+#include "desul/atomics/cuda/cuda_cc7_asm_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeCore
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".cta"
+#include "desul/atomics/cuda/cuda_cc7_asm_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc
new file mode 100644
index 0000000000..6de590a952
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc
@@ -0,0 +1,18 @@
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC
+#include "cuda_cc7_asm_atomic_fetch_op.inc_generic"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
+#include "cuda_cc7_asm_atomic_fetch_op.inc_isglobal"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE
+#include "cuda_cc7_asm_atomic_fetch_op.inc_predicate"
+#endif
+
+// This version is not generally safe
+// Only here for performance comparison purposes
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL
+#include "cuda_cc7_asm_atomic_fetch_op.inc_forceglobal"
+#endif
+
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
new file mode 100644
index 0000000000..d00e2223d2
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
@@ -0,0 +1,143 @@
+
+// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
+// Ops:
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  return result; \
+}
+
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
new file mode 100644
index 0000000000..364b6a2e4d
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
@@ -0,0 +1,142 @@
+
+// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
+// Ops: 
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.and" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.and" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.or" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.or" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.xor" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.xor" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.min" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.max" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
new file mode 100644
index 0000000000..2e8e54062d
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
@@ -0,0 +1,190 @@
+
+// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
+// Ops:
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } else { \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  } else { \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } else { \
+  asm volatile("atom.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } else { \
+  asm volatile("atom.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } else { \
+  asm volatile("atom.inc"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } else { \
+  asm volatile("atom.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
new file mode 100644
index 0000000000..5f53279daf
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
@@ -0,0 +1,226 @@
+
+// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
+// Ops:
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "@!p atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "@!p atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "@!p atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "@!p atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "@!p atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "@!p atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.inc.gobal" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.inc"       __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc
new file mode 100644
index 0000000000..ca02410515
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc
@@ -0,0 +1,18 @@
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC
+#include "cuda_cc7_asm_atomic_op.inc_generic"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
+#include "cuda_cc7_asm_atomic_op.inc_isglobal"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE
+#include "cuda_cc7_asm_atomic_op.inc_predicate"
+#endif
+
+// This version is not generally safe
+// Only here for performance comparison purposes
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL
+#include "cuda_cc7_asm_atomic_op.inc_forceglobal"
+#endif
+
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
new file mode 100644
index 0000000000..3767b2ab49
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
@@ -0,0 +1,64 @@
+
+// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
+// Ops:
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
new file mode 100644
index 0000000000..5de36a3e0a
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
@@ -0,0 +1,64 @@
+
+// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
+// Ops:
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  asm volatile("red.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.min" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.max" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
new file mode 100644
index 0000000000..ba89378834
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
@@ -0,0 +1,88 @@
+
+// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
+// Ops:
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } else { \
+  asm volatile("red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+  } else { \
+  asm volatile("red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } else { \
+  asm volatile("red.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } else { \
+  asm volatile("red.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } else { \
+  asm volatile("red.inc"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } else { \
+  asm volatile("red.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
new file mode 100644
index 0000000000..46e0ccf5e7
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
@@ -0,0 +1,106 @@
+
+// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
+// Ops:
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(neg_value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.inc"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc
new file mode 100644
index 0000000000..dfd211249f
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc
@@ -0,0 +1,20 @@
+
+// Non returning atomic operation (ptx red instruction) only exists for relaxed and release memorder
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeDevice
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".gpu"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeNode
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".sys"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeCore
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".cta"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc
new file mode 100644
index 0000000000..7b4f7d094e
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc
@@ -0,0 +1,27 @@
+
+// Non returning atomic operation (ptx red instruction) only exists for relaxed and release memorder
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelaxed
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".relaxed"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelease
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".release"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcquire
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acquire"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcqRel
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acq_rel"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc
new file mode 100644
index 0000000000..51d992087e
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc
@@ -0,0 +1,40 @@
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_EXCHANGE() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_exchange(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.exch" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_exchange(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.exch" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_COMPARE_EXCHANGE() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_compare_exchange(ctype* dest, ctype compare, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_compare = reinterpret_cast<uint32_t&>(compare); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.cas" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2,%3;" : "=r"(asm_result) : "l"(dest),"r"(asm_compare),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_compare_exchange(ctype* dest, ctype compare, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_compare = reinterpret_cast<uint64_t&>(compare); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.cas" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2,%3;" : "=l"(asm_result) : "l"(dest),"l"(asm_compare),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_EXCHANGE()
+__DESUL_IMPL_CUDA_ASM_ATOMIC_COMPARE_EXCHANGE()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_EXCHANGE
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_COMPARE_EXCHANGE
diff --git a/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc
new file mode 100644
index 0000000000..3eb613d8a7
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc
@@ -0,0 +1,29 @@
+
+// Non returning atomic operation (ptx red instruction) only exists for relaxed and release memorder
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelaxed
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".relaxed"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelease
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".release"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcquire
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acquire"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcqRel
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acq_rel"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
diff --git a/lib/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp b/lib/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp
new file mode 100644
index 0000000000..f4f1bbd96e
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp
@@ -0,0 +1,97 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_OPENMP40_HPP_
+#define DESUL_ATOMICS_OPENMP40_HPP_
+#include<type_traits>
+
+namespace desul {
+namespace Impl {
+  template<class MEMORY_ORDER_TMP, class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MEMORY_ORDER_TMP, MEMORY_SCOPE_TMP) {}
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MemoryOrderAcquire, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderAcquire(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MemoryOrderAcqRel, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderAcqRel(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MemoryOrderSeqCst, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderSeqCst(), MEMORY_SCOPE_TMP());
+  }
+
+  template<class MEMORY_ORDER_TMP, class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MEMORY_ORDER_TMP, MEMORY_SCOPE_TMP) {}
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MemoryOrderRelease, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderRelease(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MemoryOrderAcqRel, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderAcqRel(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MemoryOrderSeqCst, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderSeqCst(), MEMORY_SCOPE_TMP());
+  }
+
+  template<class T>
+  struct is_openmp_atomic_type_t {
+    static constexpr bool value = std::is_arithmetic<T>::value;
+  };
+  template<class T>
+  constexpr bool is_openmp_atomic_type_v = is_openmp_atomic_type_t<T>::value;
+}
+}
+
+namespace desul {
+// Can't use a macro approach to get all definitions since the ops include #pragma omp
+// So gonna use multiple inclusion of the same code snippet here.
+
+// Can't do Node level atomics this way with OpenMP Target, but we could 
+// have a define which says whether or not Device level IS node level (e.g. for pure CPU node)
+
+#define MEMORY_ORDER MemoryOrderRelaxed
+// #define MEMORY_SCOPE MemoryScopeNode
+// #include<desul/atomics/openmp/OpenMP_40_op.inc>
+// #undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeDevice
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeCore
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#undef MEMORY_ORDER
+
+#define MEMORY_ORDER MemoryOrderAcqRel
+// #define MEMORY_SCOPE MemoryScopeNode
+// #include<desul/atomics/openmp/OpenMP_40_op.inc>
+// #undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeDevice
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeCore
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#undef MEMORY_ORDER
+
+#define MEMORY_ORDER MemoryOrderSeqCst
+// #define MEMORY_SCOPE MemoryScopeNode
+// #include<desul/atomics/openmp/OpenMP_40_op.inc>
+// #undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeDevice
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeCore
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#undef MEMORY_ORDER
+}  // namespace desul
+#endif
diff --git a/lib/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc b/lib/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc
new file mode 100644
index 0000000000..a65f2a457d
--- /dev/null
+++ b/lib/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc
@@ -0,0 +1,101 @@
+
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_fetch_add(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture                                                    
+    { tmp = *dest;  *dest += value; }                                             
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_fetch_sub(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { tmp = *dest;  *dest -= value; }                                             
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_fetch_and(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    #pragma omp atomic capture                                                    
+    { tmp = *dest;  *dest &= value; }                                             
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_fetch_or(   
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { tmp = *dest;  *dest |= value; }                                             
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_fetch_xor(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { tmp = *dest;  *dest ^= value; }                                             
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_add_fetch(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { *dest += value; tmp = *dest; }                                              
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_sub_fetch(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { *dest -= value; tmp = *dest; }                                              
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_and_fetch(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { *dest &= value; tmp = *dest; }                                              
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_or_fetch(   
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { *dest |= value; tmp = *dest; }                                              
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_xor_fetch(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { *dest ^= value; tmp = *dest; }                                              
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }
diff --git a/lib/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp b/lib/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp
new file mode 100644
index 0000000000..8913f8bc7b
--- /dev/null
+++ b/lib/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp
@@ -0,0 +1,98 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#include <desul/atomics/Lock_Array.hpp>
+#include <cinttypes>
+#include <string>
+#include <sstream>
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+#ifdef __CUDACC_RDC__
+namespace desul {
+namespace Impl {
+__device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE = nullptr;
+__device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE = nullptr;
+}
+}  // namespace desul
+#endif
+
+namespace desul {
+
+namespace {
+
+__global__ void init_lock_arrays_cuda_kernel() {
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < CUDA_SPACE_ATOMIC_MASK + 1) {
+    Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE[i] = 0;
+    Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[i] = 0;
+  }
+}
+
+}  // namespace
+
+namespace Impl {
+
+
+int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+
+// Putting this into anonymous namespace so we don't have multiple defined symbols
+// When linking in more than one copy of the object file
+namespace {
+
+void check_error_and_throw_cuda(cudaError e, const std::string msg) {
+  if(e != cudaSuccess) {
+    std::ostringstream out;
+    out << "Desul::Error: " << msg << " error(" << cudaGetErrorName(e)
+                  << "): " << cudaGetErrorString(e);
+    throw std::runtime_error(out.str());
+  }
+}
+
+}
+
+// define functions
+template<typename T>
+void init_lock_arrays_cuda() {
+  if (CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return;
+  auto error_malloc1 = cudaMalloc(&CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
+                                 sizeof(int32_t) * (CUDA_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_cuda(error_malloc1, "init_lock_arrays_cuda: cudaMalloc device locks");
+
+  auto error_malloc2 = cudaMallocHost(&CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
+                                 sizeof(int32_t) * (CUDA_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_cuda(error_malloc2, "init_lock_arrays_cuda: cudaMalloc host locks");
+
+  auto error_sync1 = cudaDeviceSynchronize();
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
+  init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
+  auto error_sync2 = cudaDeviceSynchronize();
+  check_error_and_throw_cuda(error_sync2, "init_lock_arrays_cuda: post init kernel");
+}
+
+template<typename T>
+void finalize_lock_arrays_cuda() {
+  if (CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return;
+  cudaFree(CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h);
+  cudaFreeHost(CUDA_SPACE_ATOMIC_LOCKS_NODE_h);
+  CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+  CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+#ifdef __CUDACC_RDC__
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+#endif
+}
+
+// Instantiate functions
+template void init_lock_arrays_cuda<int>();
+template void finalize_lock_arrays_cuda<int>();
+
+}  // namespace Impl
+
+}  // namespace desul
+#endif
diff --git a/lib/kokkos/core/src/desul/src/Lock_Array_HIP.cpp b/lib/kokkos/core/src/desul/src/Lock_Array_HIP.cpp
new file mode 100644
index 0000000000..40030df643
--- /dev/null
+++ b/lib/kokkos/core/src/desul/src/Lock_Array_HIP.cpp
@@ -0,0 +1,101 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#include <cinttypes>
+#include <desul/atomics/Lock_Array.hpp>
+#include <string>
+#include <sstream>
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+#ifdef DESUL_HIP_RDC
+namespace desul {
+namespace Impl {
+__device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE = nullptr;
+__device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE = nullptr;
+}  // namespace Impl
+}  // namespace desul
+#endif
+
+namespace desul {
+
+namespace {
+
+__global__ void init_lock_arrays_hip_kernel() {
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < HIP_SPACE_ATOMIC_MASK + 1) {
+    Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE[i] = 0;
+    Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[i] = 0;
+  }
+}
+
+}  // namespace
+
+namespace Impl {
+
+int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+
+// Putting this into anonymous namespace so we don't have multiple defined symbols
+// When linking in more than one copy of the object file
+namespace {
+
+void check_error_and_throw_hip(hipError_t e, const std::string msg) {
+  if(e != hipSuccess) {
+    std::ostringstream out;
+    out << "Desul::Error: " << msg << " error(" << hipGetErrorName(e)
+                  << "): " << hipGetErrorString(e);
+    throw std::runtime_error(out.str());
+  }
+}
+
+}
+
+template<typename T>
+void init_lock_arrays_hip() {
+  if (HIP_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return;
+
+  auto error_malloc1 = hipMalloc(&HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,
+            sizeof(int32_t) * (HIP_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_hip(error_malloc1, "init_lock_arrays_hip: hipMalloc device locks");
+
+  auto error_malloc2 = hipHostMalloc(&HIP_SPACE_ATOMIC_LOCKS_NODE_h,
+                sizeof(int32_t) * (HIP_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_hip(error_malloc2, "init_lock_arrays_hip: hipMallocHost host locks");
+
+  auto error_sync1 = hipDeviceSynchronize();
+  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
+  check_error_and_throw_hip(error_sync1, "init_lock_arrays_hip: post malloc");
+
+  init_lock_arrays_hip_kernel<<<(HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
+
+  auto error_sync2 = hipDeviceSynchronize();
+  check_error_and_throw_hip(error_sync2, "init_lock_arrays_hip: post init");
+}
+
+template<typename T>
+void finalize_lock_arrays_hip() {
+  if (HIP_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return;
+  auto error_free1 = hipFree(HIP_SPACE_ATOMIC_LOCKS_DEVICE_h);
+  check_error_and_throw_hip(error_free1, "finalize_lock_arrays_hip: free device locks");
+  auto error_free2 = hipHostFree(HIP_SPACE_ATOMIC_LOCKS_NODE_h);
+  check_error_and_throw_hip(error_free2, "finalize_lock_arrays_hip: free host locks");
+  HIP_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+  HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+#ifdef DESUL_HIP_RDC
+  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
+#endif
+}
+
+template void init_lock_arrays_hip<int>();
+template void finalize_lock_arrays_hip<int>();
+
+}  // namespace Impl
+
+}  // namespace desul
+#endif
+
diff --git a/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
index 7754daa8a0..0ce680cd69 100644
--- a/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
+++ b/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
@@ -52,6 +52,8 @@ class SYCLDeviceUSMSpace;  ///< Memory space on SYCL device, not accessible from
                            ///< the host
 class SYCLSharedUSMSpace;  ///< Memory space accessible from both the SYCL
                            ///< device and the host
+class SYCLHostUSMSpace;    ///< Memory space accessible from both the SYCL
+                           ///< device and the host (host pinned)
 class SYCL;                ///< Execution space for SYCL
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
index 7f72b3983f..5167c9ed65 100644
--- a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
+++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
@@ -1518,28 +1518,14 @@ struct Tile_Loop_Type<
 };
 // end Structs for calling loops
 
-template <typename T>
-using is_void_type = std::is_same<T, void>;
-
-template <typename T>
-struct is_type_array : std::false_type {
-  using value_type = T;
-};
-
-template <typename T>
-struct is_type_array<T[]> : std::true_type {
-  using value_type = T;
-};
-
 template <typename RP, typename Functor, typename Tag = void,
           typename ValueType = void, typename Enable = void>
 struct HostIterateTile;
 
 // For ParallelFor
 template <typename RP, typename Functor, typename Tag, typename ValueType>
-struct HostIterateTile<
-    RP, Functor, Tag, ValueType,
-    typename std::enable_if<is_void_type<ValueType>::value>::type> {
+struct HostIterateTile<RP, Functor, Tag, ValueType,
+                       std::enable_if_t<std::is_void<ValueType>::value>> {
   using index_type = typename RP::index_type;
   using point_type = typename RP::point_type;
 
@@ -1947,10 +1933,9 @@ struct HostIterateTile<
 // For ParallelReduce
 // ValueType - scalar: For reductions
 template <typename RP, typename Functor, typename Tag, typename ValueType>
-struct HostIterateTile<
-    RP, Functor, Tag, ValueType,
-    typename std::enable_if<!is_void_type<ValueType>::value &&
-                            !is_type_array<ValueType>::value>::type> {
+struct HostIterateTile<RP, Functor, Tag, ValueType,
+                       std::enable_if_t<!std::is_void<ValueType>::value &&
+                                        !std::is_array<ValueType>::value>> {
   using index_type = typename RP::index_type;
   using point_type = typename RP::point_type;
 
@@ -2370,17 +2355,16 @@ struct HostIterateTile<
 // Extra specialization for array reductions
 // ValueType[]: For array reductions
 template <typename RP, typename Functor, typename Tag, typename ValueType>
-struct HostIterateTile<
-    RP, Functor, Tag, ValueType,
-    typename std::enable_if<!is_void_type<ValueType>::value &&
-                            is_type_array<ValueType>::value>::type> {
+struct HostIterateTile<RP, Functor, Tag, ValueType,
+                       std::enable_if_t<!std::is_void<ValueType>::value &&
+                                        std::is_array<ValueType>::value>> {
   using index_type = typename RP::index_type;
   using point_type = typename RP::point_type;
 
   using value_type =
-      typename is_type_array<ValueType>::value_type;  // strip away the
-                                                      // 'array-ness' [], only
-                                                      // underlying type remains
+      std::remove_extent_t<ValueType>;  // strip away the
+                                        // 'array-ness' [], only
+                                        // underlying type remains
 
   inline HostIterateTile(
       RP const& rp, Functor const& func,
diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
index c513817b5b..20fc6268c7 100644
--- a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@@ -63,12 +63,39 @@
 namespace Kokkos {
 namespace Impl {
 
-//------------------------------------------------------------------------------
+//==============================================================================
+// <editor-fold desc="AnalyzePolicyBaseTraits"> {{{1
 
-using execution_policy_trait_specifications =
-    type_list<ExecutionSpaceTrait, GraphKernelTrait, IndexTypeTrait,
-              IterationPatternTrait, LaunchBoundsTrait, OccupancyControlTrait,
-              ScheduleTrait, WorkItemPropertyTrait, WorkTagTrait>;
+// Mix in the defaults (base_traits) for the traits that aren't yet handled
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="MSVC EBO failure workaround"> {{{2
+
+template <class TraitSpecList>
+struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION AnalyzeExecPolicyBaseTraits;
+template <class... TraitSpecifications>
+struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION
+    AnalyzeExecPolicyBaseTraits<type_list<TraitSpecifications...>>
+    : TraitSpecifications::base_traits... {};
+
+// </editor-fold> end AnalyzePolicyBaseTraits }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+
+//------------------------------------------------------------------------------
+// Note: unspecialized, so that the default pathway is to fall back to using
+// the PolicyTraitMatcher. See AnalyzeExecPolicyUseMatcher below
+template <class Enable, class... Traits>
+struct AnalyzeExecPolicy
+    : AnalyzeExecPolicyUseMatcher<void, execution_policy_trait_specifications,
+                                  Traits...> {
+  using base_t =
+      AnalyzeExecPolicyUseMatcher<void, execution_policy_trait_specifications,
+                                  Traits...>;
+  using base_t::base_t;
+};
 
 //------------------------------------------------------------------------------
 // Ignore void for backwards compatibility purposes, though hopefully no one is
@@ -81,15 +108,6 @@ struct AnalyzeExecPolicy<void, void, Traits...>
 };
 
 //------------------------------------------------------------------------------
-// Mix in the defaults (base_traits) for the traits that aren't yet handled
-
-template <class TraitSpecList>
-struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION AnalyzeExecPolicyBaseTraits;
-template <class... TraitSpecifications>
-struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION
-    AnalyzeExecPolicyBaseTraits<type_list<TraitSpecifications...>>
-    : TraitSpecifications::base_traits... {};
-
 template <>
 struct AnalyzeExecPolicy<void>
     : AnalyzeExecPolicyBaseTraits<execution_policy_trait_specifications> {
@@ -108,6 +126,68 @@ struct AnalyzeExecPolicy<void>
   }
 };
 
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicyUseMatcher"> {{{1
+
+// We can avoid having to have policies specialize AnalyzeExecPolicy themselves
+// by piggy-backing off of the PolicyTraitMatcher that we need to have for
+// things like require() anyway. We mixin the effects of the trait using
+// the `mixin_matching_trait` nested alias template in the trait specification
+
+// General PolicyTraitMatcher version
+
+// Matching case
+template <class TraitSpec, class... TraitSpecs, class Trait, class... Traits>
+struct AnalyzeExecPolicyUseMatcher<
+    std::enable_if_t<PolicyTraitMatcher<TraitSpec, Trait>::value>,
+    type_list<TraitSpec, TraitSpecs...>, Trait, Traits...>
+    : TraitSpec::template mixin_matching_trait<
+          Trait, AnalyzeExecPolicy<void, Traits...>> {
+  using base_t = typename TraitSpec::template mixin_matching_trait<
+      Trait, AnalyzeExecPolicy<void, Traits...>>;
+  using base_t::base_t;
+};
+
+// Non-matching case
+template <class TraitSpec, class... TraitSpecs, class Trait, class... Traits>
+struct AnalyzeExecPolicyUseMatcher<
+    std::enable_if_t<!PolicyTraitMatcher<TraitSpec, Trait>::value>,
+    type_list<TraitSpec, TraitSpecs...>, Trait, Traits...>
+    : AnalyzeExecPolicyUseMatcher<void, type_list<TraitSpecs...>, Trait,
+                                  Traits...> {
+  using base_t = AnalyzeExecPolicyUseMatcher<void, type_list<TraitSpecs...>,
+                                             Trait, Traits...>;
+  using base_t::base_t;
+};
+
+// No match found case:
+template <class>
+struct show_name_of_invalid_execution_policy_trait;
+template <class Trait, class... Traits>
+struct AnalyzeExecPolicyUseMatcher<void, type_list<>, Trait, Traits...> {
+  static constexpr auto trigger_error_message =
+      show_name_of_invalid_execution_policy_trait<Trait>{};
+  static_assert(
+      /* always false: */ std::is_void<Trait>::value,
+      "Unknown execution policy trait. Search compiler output for "
+      "'show_name_of_invalid_execution_policy_trait' to see the type of the "
+      "invalid trait.");
+};
+
+// All traits matched case:
+template <>
+struct AnalyzeExecPolicyUseMatcher<void, type_list<>>
+    : AnalyzeExecPolicy<void> {
+  using base_t = AnalyzeExecPolicy<void>;
+  using base_t::base_t;
+};
+
+// </editor-fold> end AnalyzeExecPolicyUseMatcher }}}1
+//==============================================================================
+
 //------------------------------------------------------------------------------
 // Used for defaults that depend on other analysis results
 template <class AnalysisResults>
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
index dd571eb6d7..d481a8dc0f 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@@ -51,10 +51,6 @@
     !defined(KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP)
 #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 #include <impl/Kokkos_Atomic_Memory_Order.hpp>
 #include <impl/Kokkos_Memory_Fence.hpp>
 
@@ -115,13 +111,9 @@ __inline__ __device__ T atomic_compare_exchange(
                             const T>::type& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -134,11 +126,7 @@ __inline__ __device__ T atomic_compare_exchange(
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
index bbea3c99b8..4bb8b4fd52 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
@@ -51,10 +51,6 @@
 #ifndef KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP
 #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
index f2c1c756a9..cd840983d8 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@@ -50,10 +50,6 @@
 #if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_EXCHANGE_HPP)
 #define KOKKOS_ATOMIC_EXCHANGE_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
@@ -122,13 +118,9 @@ atomic_exchange(volatile T* const dest,
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
 #endif
 
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -141,11 +133,7 @@ atomic_exchange(volatile T* const dest,
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
index 5c3f825ed1..9a2b13debc 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@@ -50,10 +50,6 @@
 #if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_ADD_HPP)
 #define KOKKOS_ATOMIC_FETCH_ADD_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
@@ -148,13 +144,9 @@ atomic_fetch_add(volatile T* const dest,
                                          const T>::type& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -169,11 +161,7 @@ atomic_fetch_add(volatile T* const dest,
       }
     }
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
index c3446ae6a3..148ed97442 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
@@ -50,10 +50,6 @@
 #if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_SUB_HPP)
 #define KOKKOS_ATOMIC_FETCH_SUB_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
@@ -143,13 +139,9 @@ atomic_fetch_sub(volatile T* const dest,
                                          const T>::type& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -162,11 +154,7 @@ atomic_fetch_sub(volatile T* const dest,
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
index 28ac7a3bab..f6bdbca729 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@@ -47,10 +47,6 @@
 #define KOKKOS_ATOMIC_GENERIC_HPP
 #include <Kokkos_Macros.hpp>
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 // Combination operands to be used in an Compare and Exchange based atomic
 // operation
 namespace Kokkos {
@@ -301,12 +297,8 @@ KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
   // This is a way to (hopefully) avoid dead lock in a warp
   T return_val;
   int done                 = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask        = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active      = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -319,11 +311,7 @@ KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 #elif defined(__HIP_DEVICE_COMPILE__)
@@ -377,12 +365,8 @@ atomic_oper_fetch(const Oper& op, volatile T* const dest,
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
   int done                 = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask        = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active      = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -395,11 +379,7 @@ atomic_oper_fetch(const Oper& op, volatile T* const dest,
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 #elif defined(__HIP_DEVICE_COMPILE__)
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
index 975318b7dd..f763f8c791 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@@ -339,9 +339,8 @@ class AtomicDataElement {
   }
 
   KOKKOS_INLINE_FUNCTION
-  operator volatile non_const_value_type() volatile const {
-    // return Kokkos::atomic_load(ptr);
-    return *ptr;
+  operator non_const_value_type() volatile const {
+    return Kokkos::Impl::atomic_load(ptr);
   }
 };
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp
index 4e46b8d157..87f18604da 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp
@@ -55,7 +55,7 @@
 // To use OpenCL(TM) built-in intrinsics inside kernels, we have to
 // forward-declare their prototype, also see
 // https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md
-#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GEN) && \
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \
     defined(__SYCL_DEVICE_ONLY__)
 extern SYCL_EXTERNAL unsigned long __attribute__((overloadable))
 intel_get_cycle_counter();
@@ -85,7 +85,7 @@ uint64_t clock_tic() noexcept {
 
   return clock64();
 
-#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GEN) && \
+#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \
     defined(__SYCL_DEVICE_ONLY__)
   return intel_get_cycle_counter();
 #elif defined(KOKKOS_ENABLE_OPENMPTARGET)
diff --git a/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
index 06681a95ae..4ec8513191 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
@@ -76,17 +76,17 @@ struct CombinedReducerValueItemImpl {
       CombinedReducerValueItemImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl(
       CombinedReducerValueItemImpl&&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerValueItemImpl&
-  operator=(CombinedReducerValueItemImpl const&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerValueItemImpl&
-  operator=(CombinedReducerValueItemImpl&&) = default;
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl& operator=(
+      CombinedReducerValueItemImpl const&) = default;
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl& operator=(
+      CombinedReducerValueItemImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
   ~CombinedReducerValueItemImpl() = default;
   explicit KOKKOS_FUNCTION CombinedReducerValueItemImpl(value_type arg_value)
       : m_value(std::move(arg_value)) {}
 
   KOKKOS_FORCEINLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 value_type& ref() & noexcept { return m_value; }
+  constexpr value_type& ref() & noexcept { return m_value; }
   KOKKOS_FORCEINLINE_FUNCTION
   constexpr value_type const& ref() const& noexcept { return m_value; }
   KOKKOS_FORCEINLINE_FUNCTION
@@ -112,11 +112,11 @@ struct CombinedReducerValueImpl<std::integer_sequence<size_t, Idxs...>,
   KOKKOS_DEFAULTED_FUNCTION
   constexpr CombinedReducerValueImpl(CombinedReducerValueImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReducerValueImpl& operator=(
+  constexpr CombinedReducerValueImpl& operator=(
       CombinedReducerValueImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReducerValueImpl& operator=(
-      CombinedReducerValueImpl&&) = default;
+  constexpr CombinedReducerValueImpl& operator=(CombinedReducerValueImpl&&) =
+      default;
   KOKKOS_DEFAULTED_FUNCTION
   ~CombinedReducerValueImpl() = default;
 
@@ -165,20 +165,19 @@ struct CombinedReducerStorageImpl {
   // model Reducer
 
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
-  _init(value_type& val) const {
+  constexpr _fold_comma_emulation_return _init(value_type& val) const {
     m_reducer.init(val);
     return _fold_comma_emulation_return{};
   }
 
-  KOKKOS_INLINE_FUNCTION KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
-  _join(value_type& dest, value_type const& src) const {
+  KOKKOS_INLINE_FUNCTION constexpr _fold_comma_emulation_return _join(
+      value_type& dest, value_type const& src) const {
     m_reducer.join(dest, src);
     return _fold_comma_emulation_return{};
   }
 
-  KOKKOS_INLINE_FUNCTION KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
-  _join(value_type volatile& dest, value_type const volatile& src) const {
+  KOKKOS_INLINE_FUNCTION constexpr _fold_comma_emulation_return _join(
+      value_type volatile& dest, value_type const volatile& src) const {
     m_reducer.join(dest, src);
     return _fold_comma_emulation_return{};
   }
@@ -242,10 +241,10 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
   KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl(
       CombinedReducerImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl(
-      CombinedReducerImpl&&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerImpl& operator=(
+      CombinedReducerImpl&&)                                       = default;
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=(
       CombinedReducerImpl const&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerImpl& operator=(
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=(
       CombinedReducerImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION ~CombinedReducerImpl() = default;
@@ -257,9 +256,8 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
                                                        reducers)...,
         m_value_view(&value) {}
 
-  KOKKOS_FUNCTION KOKKOS_CONSTEXPR_14 void join(value_type& dest,
-                                                value_type const& src) const
-      noexcept {
+  KOKKOS_FUNCTION constexpr void join(value_type& dest,
+                                      value_type const& src) const noexcept {
     emulate_fold_comma_operator(
         this->CombinedReducerStorageImpl<Idxs, Reducers>::_join(
             dest.template get<Idxs, typename Reducers::value_type>(),
@@ -274,8 +272,7 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
             src.template get<Idxs, typename Reducers::value_type>())...);
   }
 
-  KOKKOS_FUNCTION KOKKOS_CONSTEXPR_14 void init(value_type& dest) const
-      noexcept {
+  KOKKOS_FUNCTION constexpr void init(value_type& dest) const noexcept {
     emulate_fold_comma_operator(
         this->CombinedReducerStorageImpl<Idxs, Reducers>::_init(
             dest.template get<Idxs, typename Reducers::value_type>())...);
@@ -298,7 +295,7 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
   }
 
   KOKKOS_FUNCTION
-  KOKKOS_CONSTEXPR_14 static void write_value_back_to_original_references(
+  constexpr static void write_value_back_to_original_references(
       value_type const& value,
       Reducers const&... reducers_that_reference_original_values) noexcept {
     emulate_fold_comma_operator(
@@ -360,10 +357,10 @@ struct CombinedReductionFunctorWrapperImpl<
   constexpr CombinedReductionFunctorWrapperImpl(
       CombinedReductionFunctorWrapperImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReductionFunctorWrapperImpl& operator=(
+  constexpr CombinedReductionFunctorWrapperImpl& operator=(
       CombinedReductionFunctorWrapperImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReductionFunctorWrapperImpl& operator=(
+  constexpr CombinedReductionFunctorWrapperImpl& operator=(
       CombinedReductionFunctorWrapperImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
   ~CombinedReductionFunctorWrapperImpl() = default;
@@ -551,7 +548,7 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy,
                      ReturnType2&& returnType2,
                      ReturnTypes&&... returnTypes) noexcept ->
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type {
+        Kokkos::is_execution_policy<PolicyType>::value>::type {
   //----------------------------------------
   // Since we don't support asynchronous combined reducers yet for various
   // reasons, we actually just want to work with the pointers and references
@@ -581,8 +578,11 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy,
 
   reduce_adaptor_t::execute(label, policy, combined_functor, combined_reducer);
   Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            combined_reducer_type>::fence(policy.space(),
-                                                          combined_reducer);
+                            combined_reducer_type>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          combined_reducer);
   combined_reducer.write_value_back_to_original_references(
       value, Impl::_make_reducer_from_arg<space_type>(returnType1),
       Impl::_make_reducer_from_arg<space_type>(returnType2),
@@ -596,7 +596,7 @@ auto parallel_reduce(PolicyType const& policy, Functor const& functor,
                      ReturnType1&& returnType1, ReturnType2&& returnType2,
                      ReturnTypes&&... returnTypes) noexcept ->
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type {
+        Kokkos::is_execution_policy<PolicyType>::value>::type {
   //----------------------------------------
   Kokkos::parallel_reduce("", policy, functor,
                           std::forward<ReturnType1>(returnType1),
diff --git a/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
index c02f4acdda..dafe57f8da 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
@@ -138,15 +138,15 @@ struct concurrent_bitset {
     // when is full at the atomic_fetch_add(+1)
     // then a release occurs before the atomic_fetch_add(-1).
 
-    const uint32_t state =
-        (uint32_t)Kokkos::atomic_fetch_add((volatile int *)buffer, 1);
+    const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add(
+        reinterpret_cast<volatile int *>(buffer), 1);
 
     const uint32_t state_error = state_header != (state & state_header_mask);
 
     const uint32_t state_bit_used = state & state_used_mask;
 
     if (state_error || (bit_bound <= state_bit_used)) {
-      Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+      Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
       return state_error ? type(-2, -2) : type(-1, -1);
     }
 
@@ -222,15 +222,15 @@ struct concurrent_bitset {
     // when is full at the atomic_fetch_add(+1)
     // then a release occurs before the atomic_fetch_add(-1).
 
-    const uint32_t state =
-        (uint32_t)Kokkos::atomic_fetch_add((volatile int *)buffer, 1);
+    const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add(
+        reinterpret_cast<volatile int *>(buffer), 1);
 
     const uint32_t state_error = state_header != (state & state_header_mask);
 
     const uint32_t state_bit_used = state & state_used_mask;
 
     if (state_error || (bit_bound <= state_bit_used)) {
-      Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+      Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
       return state_error ? type(-2, -2) : type(-1, -1);
     }
 
@@ -300,7 +300,8 @@ struct concurrent_bitset {
     // Do not update count until bit clear is visible
     Kokkos::memory_fence();
 
-    const int count = Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+    const int count =
+        Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
 
     // Flush the store-release
     Kokkos::memory_fence();
@@ -336,7 +337,8 @@ struct concurrent_bitset {
     // Do not update count until bit clear is visible
     Kokkos::memory_fence();
 
-    const int count = Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+    const int count =
+        Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
 
     return (count & state_used_mask) - 1;
   }
diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
index b4769fbeaa..a1f9d33632 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -130,6 +130,11 @@ void ExecSpaceManager::static_fence() {
     to_fence.second->fence();
   }
 }
+void ExecSpaceManager::static_fence(const std::string& name) {
+  for (auto& to_fence : exec_space_factory_list) {
+    to_fence.second->fence(name);
+  }
+}
 void ExecSpaceManager::print_configuration(std::ostream& msg,
                                            const bool detail) {
   for (auto& to_print : exec_space_factory_list) {
@@ -506,11 +511,6 @@ void pre_initialize_internal(const InitArguments& args) {
   declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes");
 #else
   declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no");
-#endif
-#ifdef KOKKOS_ENABLE_MPI
-  declare_configuration_metadata("options", "KOKKOS_ENABLE_MPI", "yes");
-#else
-  declare_configuration_metadata("options", "KOKKOS_ENABLE_MPI", "no");
 #endif
   declare_configuration_metadata("architecture", "Default Device",
                                  typeid(Kokkos::DefaultExecutionSpace).name());
@@ -564,7 +564,9 @@ void finalize_internal(const bool all_spaces = false) {
   g_tune_internals = false;
 }
 
-void fence_internal() { Impl::ExecSpaceManager::get_instance().static_fence(); }
+void fence_internal(const std::string& name) {
+  Impl::ExecSpaceManager::get_instance().static_fence(name);
+}
 
 bool check_arg(char const* arg, char const* expected) {
   std::size_t arg_len = std::strlen(arg);
@@ -1092,7 +1094,8 @@ void finalize_all() {
   Impl::finalize_internal(all_spaces);
 }
 
-void fence() { Impl::fence_internal(); }
+void fence() { Impl::fence_internal("Kokkos::fence: Unnamed Global Fence"); }
+void fence(const std::string& name) { Impl::fence_internal(name); }
 
 void print_helper(std::ostringstream& out,
                   const std::map<std::string, std::string>& print_me) {
diff --git a/lib/kokkos/core/src/impl/Kokkos_EBO.hpp b/lib/kokkos/core/src/impl/Kokkos_EBO.hpp
index a124511c07..dc8e5e4d83 100644
--- a/lib/kokkos/core/src/impl/Kokkos_EBO.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_EBO.hpp
@@ -79,20 +79,6 @@ struct EBOBaseImpl;
 
 template <class T, template <class...> class CtorNotOnDevice>
 struct EBOBaseImpl<T, true, CtorNotOnDevice> {
-  /*
-   * Workaround for constexpr in C++11: we need to still call T(args...), but we
-   * can't do so in the body of a constexpr function (in C++11), and there's no
-   * data member to construct into. But we can construct into an argument
-   * of a delegating constructor...
-   */
-  // TODO @minor DSH the destructor gets called too early with this workaround
-  struct _constexpr_14_workaround_tag {};
-  struct _constexpr_14_workaround_no_device_tag {};
-  KOKKOS_FORCEINLINE_FUNCTION
-  constexpr EBOBaseImpl(_constexpr_14_workaround_tag, T&&) noexcept {}
-  inline constexpr EBOBaseImpl(_constexpr_14_workaround_no_device_tag,
-                               T&&) noexcept {}
-
   template <
       class... Args, class _ignored = void,
       typename std::enable_if<std::is_void<_ignored>::value &&
@@ -100,10 +86,7 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
                                   !CtorNotOnDevice<Args...>::value,
                               int>::type = 0>
   KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl(
-      Args&&... args) noexcept(noexcept(T(std::forward<Args>(args)...)))
-      // still call the constructor
-      : EBOBaseImpl(_constexpr_14_workaround_tag{},
-                    T(std::forward<Args>(args)...)) {}
+      Args&&...) noexcept {}
 
   template <
       class... Args, class _ignored = void,
@@ -111,11 +94,7 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
                                   std::is_constructible<T, Args...>::value &&
                                   CtorNotOnDevice<Args...>::value,
                               long>::type = 0>
-  inline constexpr explicit EBOBaseImpl(Args&&... args) noexcept(
-      noexcept(T(std::forward<Args>(args)...)))
-      // still call the constructor
-      : EBOBaseImpl(_constexpr_14_workaround_no_device_tag{},
-                    T(std::forward<Args>(args)...)) {}
+  inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {}
 
   KOKKOS_DEFAULTED_FUNCTION
   constexpr EBOBaseImpl(EBOBaseImpl const&) = default;
@@ -124,19 +103,16 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
   constexpr EBOBaseImpl(EBOBaseImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
   ~EBOBaseImpl() = default;
 
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T& _ebo_data_member() & { return *reinterpret_cast<T*>(this); }
+  constexpr T& _ebo_data_member() & { return *reinterpret_cast<T*>(this); }
 
   KOKKOS_INLINE_FUNCTION
   constexpr T const& _ebo_data_member() const& {
@@ -154,8 +130,9 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
   }
 
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T&& _ebo_data_member() && { return std::move(*reinterpret_cast<T*>(this)); }
+  constexpr T&& _ebo_data_member() && {
+    return std::move(*reinterpret_cast<T*>(this));
+  }
 };
 
 template <class T, template <class...> class CTorsNotOnDevice>
@@ -191,12 +168,10 @@ struct EBOBaseImpl<T, false, CTorsNotOnDevice> {
   constexpr EBOBaseImpl(EBOBaseImpl&&) noexcept = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
   ~EBOBaseImpl() = default;
@@ -232,8 +207,7 @@ struct StandardLayoutNoUniqueAddressMemberEmulation
   using ebo_base_t::ebo_base_t;
 
   KOKKOS_FORCEINLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T& no_unique_address_data_member() & {
+  constexpr T& no_unique_address_data_member() & {
     return this->ebo_base_t::_ebo_data_member();
   }
 
@@ -253,8 +227,7 @@ struct StandardLayoutNoUniqueAddressMemberEmulation
   }
 
   KOKKOS_FORCEINLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T&& no_unique_address_data_member() && {
+  constexpr T&& no_unique_address_data_member() && {
     return this->ebo_base_t::_ebo_data_member();
   }
 };
diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.cpp b/lib/kokkos/core/src/impl/Kokkos_Error.cpp
index dfb9f3a51c..9c8024cbd0 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Error.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.cpp
@@ -138,6 +138,9 @@ void Experimental::RawMemoryAllocationFailure::print_error_message(
     case AllocationMechanism::SYCLMallocShared:
       o << "sycl::malloc_shared().";
       break;
+    case AllocationMechanism::SYCLMallocHost:
+      o << "sycl::malloc_host().";
+      break;
   }
   append_additional_error_information(o);
   o << ")" << std::endl;
diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.hpp b/lib/kokkos/core/src/impl/Kokkos_Error.hpp
index 5db4597346..dc9bfe2b5a 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Error.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.hpp
@@ -97,7 +97,8 @@ class RawMemoryAllocationFailure : public std::bad_alloc {
     HIPMalloc,
     HIPHostMalloc,
     SYCLMallocDevice,
-    SYCLMallocShared
+    SYCLMallocShared,
+    SYCLMallocHost
   };
 
  private:
@@ -218,31 +219,41 @@ KOKKOS_IMPL_ABORT_NORETURN KOKKOS_INLINE_FUNCTION void abort(
 
 #if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \
     defined(KOKKOS_ENABLE_DEBUG)
-#define KOKKOS_EXPECTS(...)                                               \
-  {                                                                       \
-    if (!bool(__VA_ARGS__)) {                                             \
-      ::Kokkos::abort(                                                    \
-          "Kokkos contract violation:\n  "                                \
-          "  Expected precondition `" #__VA_ARGS__ "` evaluated false."); \
-    }                                                                     \
+#define KOKKOS_EXPECTS(...)                                                    \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Expected precondition `" #__VA_ARGS__                             \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
   }
-#define KOKKOS_ENSURES(...)                                               \
-  {                                                                       \
-    if (!bool(__VA_ARGS__)) {                                             \
-      ::Kokkos::abort(                                                    \
-          "Kokkos contract violation:\n  "                                \
-          "  Ensured postcondition `" #__VA_ARGS__ "` evaluated false."); \
-    }                                                                     \
+#define KOKKOS_ENSURES(...)                                                    \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Ensured postcondition `" #__VA_ARGS__                             \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
   }
-// some projects already define this for themselves, so don't mess them up
+// some projects already define this for themselves, so don't mess
+// them up
 #ifndef KOKKOS_ASSERT
-#define KOKKOS_ASSERT(...)                                             \
-  {                                                                    \
-    if (!bool(__VA_ARGS__)) {                                          \
-      ::Kokkos::abort(                                                 \
-          "Kokkos contract violation:\n  "                             \
-          "  Asserted condition `" #__VA_ARGS__ "` evaluated false."); \
-    }                                                                  \
+#define KOKKOS_ASSERT(...)                                                     \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Asserted condition `" #__VA_ARGS__                                \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
   }
 #endif  // ifndef KOKKOS_ASSERT
 #else   // not debug mode
diff --git a/lib/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp b/lib/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
index a922e7e3f9..1a0b10e40f 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
@@ -55,6 +55,7 @@ class ExecSpaceInitializerBase {
   virtual void initialize(const InitArguments &args)                     = 0;
   virtual void finalize(const bool all_spaces)                           = 0;
   virtual void fence()                                                   = 0;
+  virtual void fence(const std::string &)                                = 0;
   virtual void print_configuration(std::ostream &msg, const bool detail) = 0;
   ExecSpaceInitializerBase()          = default;
   virtual ~ExecSpaceInitializerBase() = default;
diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
index 22e88ebc4f..5de92fc457 100644
--- a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@@ -48,7 +48,6 @@
 #include <cstddef>
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -1335,7 +1334,10 @@ struct FunctorValueTraits<FunctorType, ArgTag,
   using functor_type = FunctorType;
 
   static_assert(
-      IS_VOID || IS_REJECT || 0 == (sizeof(ValueType) % sizeof(int)),
+      IS_VOID || IS_REJECT ||
+          ((sizeof(ValueType) > sizeof(int))
+               ? 0 == sizeof(ValueType) % sizeof(int)
+               : true),
       "Reduction functor's value_type deduced from functor::operator() "
       "requires: 0 == sizeof(value_type) % sizeof(int)");
 
@@ -1902,17 +1904,6 @@ struct FunctorFinalFunction {
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type&));
 
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type volatile & ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & )
-  // const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type volatile & ) ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (             *)( ArgTag         , value_type volatile & )
-  // ); KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)(
-  // ArgTag const & , value_type volatile & ) );
-
   KOKKOS_INLINE_FUNCTION static void enable_if(
       void (FunctorType::*)(ArgTag, value_type const&) const);
   KOKKOS_INLINE_FUNCTION static void enable_if(
@@ -1925,17 +1916,6 @@ struct FunctorFinalFunction {
                                                         value_type const&));
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type const&));
-
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type const volatile & ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type const
-  // volatile & ) const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type const volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type const volatile & ) ); KOKKOS_INLINE_FUNCTION static
-  // void enable_if( void (             *)( ArgTag         , value_type const
-  // volatile & ) ); KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)(
-  // ArgTag const & , value_type const volatile & ) );
 };
 
 // Compatible functions for 'final' function and value_type is an array
@@ -1956,17 +1936,6 @@ struct FunctorFinalFunction<FunctorType, ArgTag, true> {
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type*));
 
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type volatile * ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * )
-  // const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type volatile * ) ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (             *)( ArgTag         , value_type volatile * )
-  // ); KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)(
-  // ArgTag const & , value_type volatile * ) );
-
   KOKKOS_INLINE_FUNCTION static void enable_if(
       void (FunctorType::*)(ArgTag, value_type const*) const);
   KOKKOS_INLINE_FUNCTION static void enable_if(
@@ -1979,17 +1948,6 @@ struct FunctorFinalFunction<FunctorType, ArgTag, true> {
                                                         value_type const*));
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type const*));
-
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type const volatile * ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type const
-  // volatile * ) const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type const volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type const volatile * ) ); KOKKOS_INLINE_FUNCTION static
-  // void enable_if( void (             *)( ArgTag         , value_type const
-  // volatile * ) ); KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)(
-  // ArgTag const & , value_type const volatile * ) );
 };
 
 template <class FunctorType>
@@ -2109,89 +2067,4 @@ struct FunctorFinal<
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class ArgTag,
-          class ReferenceType =
-              typename FunctorValueTraits<FunctorType, ArgTag>::reference_type>
-struct FunctorApplyFunction {
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, ReferenceType) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, ReferenceType) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
-                                                        ReferenceType));
-};
-
-template <class FunctorType, class ReferenceType>
-struct FunctorApplyFunction<FunctorType, void, ReferenceType> {
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ReferenceType) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ReferenceType));
-};
-
-template <class FunctorType>
-struct FunctorApplyFunction<FunctorType, void, void> {
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)() const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)());
-};
-
-template <class FunctorType, class ArgTag, class ReferenceType,
-          class Enable = void>
-struct FunctorApply {
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType&, void*) {}
-};
-
-/* 'apply' function provided for void value */
-template <class FunctorType, class ArgTag>
-struct FunctorApply<
-    FunctorType, ArgTag,
-    void
-    // First  substitution failure when FunctorType::apply does not exist.
-    // Second substitution failure when enable_if( & Functor::apply ) does not
-    // exist
-    ,
-    decltype(FunctorApplyFunction<FunctorType, ArgTag, void>::enable_if(
-        &FunctorType::apply))> {
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(FunctorType& f) { f.apply(); }
-
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType& f) {
-    f.apply();
-  }
-};
-
-/* 'apply' function provided for single value */
-template <class FunctorType, class ArgTag, class T>
-struct FunctorApply<FunctorType, ArgTag,
-                    T&
-                    // First  substitution failure when FunctorType::apply does
-                    // not exist. Second substitution failure when enable_if( &
-                    // Functor::apply ) does not exist
-                    ,
-                    decltype(
-                        FunctorApplyFunction<FunctorType, ArgTag>::enable_if(
-                            &FunctorType::apply))> {
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType& f, void* p) {
-    f.apply(*((T*)p));
-  }
-
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(FunctorType& f, void* p) {
-    f.apply(*((T*)p));
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 #endif /* KOKKOS_FUNCTORADAPTER_HPP */
diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
index a56d19ee72..7140154e0f 100644
--- a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
@@ -48,7 +48,6 @@
 #include <cstddef>
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -722,14 +721,16 @@ struct FunctorAnalysis {
 
     template <bool IsArray>
     KOKKOS_INLINE_FUNCTION constexpr
-        typename std::enable_if<IsArray, FunctorAnalysis::ValueType*>::type
+        typename std::enable_if<IsArray,
+                                typename FunctorAnalysis::ValueType*>::type
         ref() const noexcept {
       return m_result;
     }
 
     template <bool IsArray>
     KOKKOS_INLINE_FUNCTION constexpr
-        typename std::enable_if<!IsArray, FunctorAnalysis::ValueType&>::type
+        typename std::enable_if<!IsArray,
+                                typename FunctorAnalysis::ValueType&>::type
         ref() const noexcept {
       return *m_result;
     }
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp b/lib/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
index 97286dd07f..3b7b194db5 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
@@ -47,6 +47,7 @@
 
 #include <Kokkos_Macros.hpp>
 #include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
 
 #include <functional>
 
@@ -92,6 +93,8 @@ class HostSharedPtr {
     // FIXME_OPENMPTARGET requires something like KOKKOS_IMPL_IF_ON_HOST
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     if (m_control) Kokkos::atomic_add(&(m_control->m_counter), 1);
+#else
+    m_control = nullptr;
 #endif
   }
 
@@ -115,6 +118,8 @@ class HostSharedPtr {
       // FIXME_OPENMPTARGET
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
       if (m_control) Kokkos::atomic_add(&(m_control->m_counter), 1);
+#else
+      m_control = nullptr;
 #endif
     }
     return *this;
@@ -154,6 +159,9 @@ class HostSharedPtr {
     // object pointed to by m_counter and m_element_ptr.
     if (m_control) {
       int const count = Kokkos::atomic_fetch_sub(&(m_control->m_counter), 1);
+      // atomic_fetch_sub might have memory order relaxed so we need to force
+      // synchronization to avoid multiple threads doing the cleanup.
+      Kokkos::memory_fence();
       if (count == 1) {
         (m_control->m_deleter)(m_element_ptr);
         m_element_ptr = nullptr;
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
index 2e5587e4a3..a7f4a652be 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
@@ -74,8 +74,8 @@ void HostThreadTeamData::organize_pool(HostThreadTeamData *members[],
     }
 
     {
-      HostThreadTeamData **const pool =
-          (HostThreadTeamData **)(root_scratch + m_pool_members);
+      HostThreadTeamData **const pool = reinterpret_cast<HostThreadTeamData **>(
+          root_scratch + m_pool_members);
 
       // team size == 1, league size == pool_size
 
@@ -136,7 +136,8 @@ int HostThreadTeamData::organize_team(const int team_size) {
     if (team_size == 1) return 1;  // Already organized in teams of one
 
     HostThreadTeamData *const *const pool =
-        (HostThreadTeamData **)(m_pool_scratch + m_pool_members);
+        reinterpret_cast<HostThreadTeamData **>(m_pool_scratch +
+                                                m_pool_members);
 
     // "league_size" in this context is the number of concurrent teams
     // that the pool can accommodate.  Excess threads are idle.
@@ -239,7 +240,8 @@ int HostThreadTeamData::get_work_stealing() noexcept {
 
     if (w.first == -1 && m_steal_rank != m_pool_rank) {
       HostThreadTeamData *const *const pool =
-          (HostThreadTeamData **)(m_pool_scratch + m_pool_members);
+          reinterpret_cast<HostThreadTeamData **>(m_pool_scratch +
+                                                  m_pool_members);
 
       // Attempt from beginning failed, try to steal from end of neighbor
 
@@ -287,23 +289,17 @@ int HostThreadTeamData::get_work_stealing() noexcept {
 
     if (1 < m_team_size) {
       // Must share the work index
-      *((int volatile *)team_reduce()) = w.first;
+      *reinterpret_cast<int volatile *>(team_reduce()) = w.first;
 
       team_rendezvous_release();
     }
   } else if (1 < m_team_size) {
-    w.first = *((int volatile *)team_reduce());
+    w.first = *reinterpret_cast<int volatile *>(team_reduce());
   }
 
   // May exit because successfully stole work and w is good.
   // May exit because no work left to steal and w = (-1,-1).
 
-#if 0
-fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
-       , m_pool_rank , m_pool_size , w.first );
-fflush(stdout);
-#endif
-
   return w.first;
 }
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
index d4cae7f122..0652b55bb7 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
@@ -91,9 +91,18 @@ class HostThreadTeamData {
   //   [ thread_local ]     = [ m_thread_local    .. m_scratch_size )
 
   enum : int { m_pool_members = 0 };
-  enum : int { m_pool_rendezvous = m_pool_members + max_pool_members };
-  enum : int { m_team_rendezvous = m_pool_rendezvous + max_pool_rendezvous };
-  enum : int { m_pool_reduce = m_team_rendezvous + max_team_rendezvous };
+  enum : int {
+    m_pool_rendezvous =
+        static_cast<int>(m_pool_members) + static_cast<int>(max_pool_members)
+  };
+  enum : int {
+    m_team_rendezvous = static_cast<int>(m_pool_rendezvous) +
+                        static_cast<int>(max_pool_rendezvous)
+  };
+  enum : int {
+    m_pool_reduce = static_cast<int>(m_team_rendezvous) +
+                    static_cast<int>(max_team_rendezvous)
+  };
 
   using pair_int_t = Kokkos::pair<int64_t, int64_t>;
 
@@ -120,13 +129,13 @@ class HostThreadTeamData {
   int mutable m_team_rendezvous_step;
 
   HostThreadTeamData* team_member(int r) const noexcept {
-    return ((HostThreadTeamData**)(m_pool_scratch +
-                                   m_pool_members))[m_team_base + r];
+    return (reinterpret_cast<HostThreadTeamData**>(
+        m_pool_scratch + m_pool_members))[m_team_base + r];
   }
 
  public:
   inline bool team_rendezvous() const noexcept {
-    int* ptr = (int*)(m_team_scratch + m_team_rendezvous);
+    int* ptr = reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous);
     HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step);
     if (m_team_rank != 0) {
       HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step);
@@ -138,7 +147,7 @@ class HostThreadTeamData {
   }
 
   inline bool team_rendezvous(const int source_team_rank) const noexcept {
-    int* ptr = (int*)(m_team_scratch + m_team_rendezvous);
+    int* ptr = reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous);
     HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step);
     if (m_team_rank != source_team_rank) {
       HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step);
@@ -150,12 +159,13 @@ class HostThreadTeamData {
   }
 
   inline void team_rendezvous_release() const noexcept {
-    HostBarrier::split_release((int*)(m_team_scratch + m_team_rendezvous),
-                               m_team_size, m_team_rendezvous_step);
+    HostBarrier::split_release(
+        reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous), m_team_size,
+        m_team_rendezvous_step);
   }
 
   inline int pool_rendezvous() const noexcept {
-    int* ptr = (int*)(m_pool_scratch + m_pool_rendezvous);
+    int* ptr = reinterpret_cast<int*>(m_pool_scratch + m_pool_rendezvous);
     HostBarrier::split_arrive(ptr, m_pool_size, m_pool_rendezvous_step);
     if (m_pool_rank != 0) {
       HostBarrier::wait(ptr, m_pool_size, m_pool_rendezvous_step);
@@ -167,8 +177,9 @@ class HostThreadTeamData {
   }
 
   inline void pool_rendezvous_release() const noexcept {
-    HostBarrier::split_release((int*)(m_pool_scratch + m_pool_rendezvous),
-                               m_pool_size, m_pool_rendezvous_step);
+    HostBarrier::split_release(
+        reinterpret_cast<int*>(m_pool_scratch + m_pool_rendezvous), m_pool_size,
+        m_pool_rendezvous_step);
   }
 
   //----------------------------------------
@@ -230,7 +241,8 @@ class HostThreadTeamData {
   constexpr int pool_size() const { return m_pool_size; }
 
   HostThreadTeamData* pool_member(int r) const noexcept {
-    return ((HostThreadTeamData**)(m_pool_scratch + m_pool_members))[r];
+    return (reinterpret_cast<HostThreadTeamData**>(m_pool_scratch +
+                                                   m_pool_members))[r];
   }
 
   //----------------------------------------
@@ -330,24 +342,11 @@ class HostThreadTeamData {
     team_shared_size = align_to_int64(team_shared_size);
     // thread_local_size = align_to_int64( thread_local_size );
 
-    m_scratch      = (int64_t*)alloc_ptr;
+    m_scratch      = static_cast<int64_t*>(alloc_ptr);
     m_team_reduce  = m_pool_reduce + pool_reduce_size;
     m_team_shared  = m_team_reduce + team_reduce_size;
     m_thread_local = m_team_shared + team_shared_size;
     m_scratch_size = align_to_int64(alloc_size);
-
-#if 0
-fprintf(stdout,"HostThreadTeamData::scratch_assign { %d %d %d %d %d %d %d }\n"
-       , int(m_pool_members)
-       , int(m_pool_rendezvous)
-       , int(m_pool_reduce)
-       , int(m_team_reduce)
-       , int(m_team_shared)
-       , int(m_thread_local)
-       , int(m_scratch_size)
-       );
-fflush(stdout);
-#endif
   }
 
   //----------------------------------------
diff --git a/lib/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp b/lib/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
index 79aeca5da0..1ed502db5b 100644
--- a/lib/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
@@ -110,7 +110,7 @@ struct SimpleSinglyLinkedListNode {
   friend struct LinkedListNodeAccess;
 
  public:
-  // KOKKOS_CONSTEXPR_14
+  // constexpr
   KOKKOS_INLINE_FUNCTION
   bool is_enqueued() const noexcept {
     // TODO @tasking @memory_order DSH make this an atomic load with memory
@@ -118,7 +118,7 @@ struct SimpleSinglyLinkedListNode {
     return m_next != reinterpret_cast<pointer_type>(NotEnqueuedValue);
   }
 
-  // KOKKOS_CONSTEXPR_14
+  // constexpr
   KOKKOS_INLINE_FUNCTION
   bool is_enqueued() const volatile noexcept {
     // TODO @tasking @memory_order DSH make this an atomic load with memory
diff --git a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
index 76d5536019..865d1c47fa 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -48,7 +48,7 @@
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
-
+#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 KOKKOS_FORCEINLINE_FUNCTION
 void memory_fence() {
 #if defined(__CUDA_ARCH__)
@@ -75,6 +75,7 @@ void memory_fence() {
 #error "Error: memory_fence() not defined"
 #endif
 }
+#endif
 
 //////////////////////////////////////////////////////
 // store_fence()
diff --git a/lib/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
index fe78cfbacc..1c61b73f02 100644
--- a/lib/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
@@ -58,8 +58,7 @@
 
 #include <impl/Kokkos_TaskQueueMemoryManager.hpp>
 #include <impl/Kokkos_TaskQueueCommon.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -467,7 +466,7 @@ class MultipleTaskQueue final
 
   // TODO @tasking @generalization DSH make this a property-based customization
   // point
-  static /* KOKKOS_CONSTEXPR_14 */ size_t task_queue_allocation_size(
+  static /* constexpr */ size_t task_queue_allocation_size(
       typename base_t::execution_space const& exec_space,
       typename base_t::memory_space const&,
       typename base_t::memory_pool const&) {
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp
index 94ea6e1a2b..8505e8f51a 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp
@@ -53,6 +53,7 @@
 #include <array>
 #include <cstring>
 #include <iostream>
+#include <memory>
 #include <stack>
 #include <unordered_map>
 #include <unordered_set>
@@ -70,7 +71,9 @@ void tool_invoked_fence(const uint32_t /* devID */) {
    * Eventually we want to support fencing only
    * a given stream/resource
    */
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::Tools::Experimental::Impl::tool_invoked_fence: Tool Requested "
+      "Fence");
 }
 }  // namespace Impl
 #ifdef KOKKOS_ENABLE_TUNING
@@ -131,7 +134,8 @@ inline void invoke_kokkosp_callback(
     if (may_require_global_fencing == MayRequireGlobalFencing::Yes &&
         (Kokkos::Tools::Experimental::tool_requirements
              .requires_global_fencing)) {
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Tools::invoke_kokkosp_callback: Kokkos Profile Tool Fence");
     }
     (*callback)(std::forward<Args>(args)...);
   }
@@ -432,18 +436,43 @@ void initialize(const std::string& profileLibrary) {
   if (is_initialized) return;
   is_initialized = 1;
 
+  auto invoke_init_callbacks = []() {
+    Experimental::invoke_kokkosp_callback(
+        Kokkos::Tools::Experimental::MayRequireGlobalFencing::No,
+        Kokkos::Tools::Experimental::current_callbacks.init, 0,
+        (uint64_t)KOKKOSP_INTERFACE_VERSION, (uint32_t)0, nullptr);
+
+    Experimental::tool_requirements.requires_global_fencing = true;
+
+    Experimental::invoke_kokkosp_callback(
+        Experimental::MayRequireGlobalFencing::No,
+        Experimental::current_callbacks.request_tool_settings, 1,
+        &Experimental::tool_requirements);
+
+    Experimental::ToolProgrammingInterface actions;
+    actions.fence = &Experimental::Impl::tool_invoked_fence;
+
+    Experimental::invoke_kokkosp_callback(
+        Experimental::MayRequireGlobalFencing::No,
+        Experimental::current_callbacks.provide_tool_programming_interface, 1,
+        actions);
+  };
+
 #ifdef KOKKOS_ENABLE_LIBDL
   void* firstProfileLibrary = nullptr;
 
-  if (profileLibrary.empty()) return;
+  if (profileLibrary.empty()) {
+    invoke_init_callbacks();
+    return;
+  }
 
   char* envProfileLibrary = const_cast<char*>(profileLibrary.c_str());
 
-  char* envProfileCopy =
-      (char*)malloc(sizeof(char) * (strlen(envProfileLibrary) + 1));
-  sprintf(envProfileCopy, "%s", envProfileLibrary);
+  const auto envProfileCopy =
+      std::make_unique<char[]>(strlen(envProfileLibrary) + 1);
+  sprintf(envProfileCopy.get(), "%s", envProfileLibrary);
 
-  char* profileLibraryName = strtok(envProfileCopy, ";");
+  char* profileLibraryName = strtok(envProfileCopy.get(), ";");
 
   if ((profileLibraryName != nullptr) &&
       (strcmp(profileLibraryName, "") != 0)) {
@@ -574,25 +603,8 @@ void initialize(const std::string& profileLibrary) {
 #else
   (void)profileLibrary;
 #endif  // KOKKOS_ENABLE_LIBDL
-  Experimental::invoke_kokkosp_callback(
-      Kokkos::Tools::Experimental::MayRequireGlobalFencing::No,
-      Kokkos::Tools::Experimental::current_callbacks.init, 0,
-      (uint64_t)KOKKOSP_INTERFACE_VERSION, (uint32_t)0, nullptr);
 
-  Experimental::tool_requirements.requires_global_fencing = true;
-
-  Experimental::invoke_kokkosp_callback(
-      Experimental::MayRequireGlobalFencing::No,
-      Experimental::current_callbacks.request_tool_settings, 1,
-      &Experimental::tool_requirements);
-
-  Experimental::ToolProgrammingInterface actions;
-  actions.fence = &Experimental::Impl::tool_invoked_fence;
-
-  Experimental::invoke_kokkosp_callback(
-      Experimental::MayRequireGlobalFencing::No,
-      Experimental::current_callbacks.provide_tool_programming_interface, 1,
-      actions);
+  invoke_init_callbacks();
 
 #ifdef KOKKOS_ENABLE_TUNING
   Experimental::VariableInfo kernel_name;
@@ -656,9 +668,6 @@ void initialize(const std::string& profileLibrary) {
   Experimental::no_profiling.declare_output_type   = nullptr;
   Experimental::no_profiling.request_output_values = nullptr;
   Experimental::no_profiling.end_tuning_context    = nullptr;
-#ifdef KOKKOS_ENABLE_LIBDL
-  free(envProfileCopy);
-#endif
 }
 
 void finalize() {
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling.hpp
index 1ff6a36c3b..86a4cfa4a8 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling.hpp
@@ -50,9 +50,12 @@
 #include <Kokkos_Macros.hpp>
 #include <Kokkos_Tuners.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
+#include <memory>
+#include <unordered_map>
 #include <map>
 #include <string>
 #include <type_traits>
+#include <mutex>
 namespace Kokkos {
 
 // forward declaration
@@ -135,6 +138,71 @@ Kokkos_Profiling_SpaceHandle make_space_handle(const char* space_name);
 
 namespace Experimental {
 
+namespace Impl {
+struct DirectFenceIDHandle {
+  uint32_t value;
+};
+//
+template <typename Space>
+uint32_t idForInstance(const uintptr_t instance) {
+  static std::mutex instance_mutex;
+  const std::lock_guard<std::mutex> lock(instance_mutex);
+  /** Needed to be a ptr due to initialization order problems*/
+  using map_type = std::map<uintptr_t, uint32_t>;
+
+  static std::shared_ptr<map_type> map;
+  if (map.get() == nullptr) {
+    map = std::make_shared<map_type>(map_type());
+  }
+
+  static uint32_t value = 0;
+  constexpr const uint32_t offset =
+      Kokkos::Tools::Experimental::NumReservedDeviceIDs;
+
+  auto find = map->find(instance);
+  if (find == map->end()) {
+    auto ret         = offset + value++;
+    (*map)[instance] = ret;
+    return ret;
+  }
+
+  return find->second;
+}
+
+template <typename Space, typename FencingFunctor>
+void profile_fence_event(const std::string& name, DirectFenceIDHandle devIDTag,
+                         const FencingFunctor& func) {
+  uint64_t handle = 0;
+  Kokkos::Tools::beginFence(
+      name,
+      Kokkos::Tools::Experimental::device_id_root<Space>() + devIDTag.value,
+      &handle);
+  func();
+  Kokkos::Tools::endFence(handle);
+}
+
+inline uint32_t int_for_synchronization_reason(
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason) {
+  switch (reason) {
+    case GlobalDeviceSynchronization: return 0;
+    case DeepCopyResourceSynchronization: return 0x00ffffff;
+  }
+  return 0;
+}
+
+template <typename Space, typename FencingFunctor>
+void profile_fence_event(
+    const std::string& name,
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
+    const FencingFunctor& func) {
+  uint64_t handle = 0;
+  Kokkos::Tools::beginFence(
+      name, device_id_root<Space>() + int_for_synchronization_reason(reason),
+      &handle);  // TODO: correct ID
+  func();
+  Kokkos::Tools::endFence(handle);
+}
+}  // namespace Impl
 void set_init_callback(initFunction callback);
 void set_finalize_callback(finalizeFunction callback);
 void set_parse_args_callback(parseArgsFunction callback);
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h b/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
index ed8751c50c..2c8d1428fc 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
@@ -54,7 +54,7 @@
 #include <stdbool.h>
 #endif
 
-#define KOKKOSP_INTERFACE_VERSION 20210225
+#define KOKKOSP_INTERFACE_VERSION 20210623
 
 // Profiling
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
index 7809632f78..a7aec2e6fd 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -56,6 +56,14 @@
 namespace Kokkos {
 namespace Tools {
 namespace Experimental {
+
+constexpr const uint32_t NumReservedDeviceIDs = 1;
+
+enum SpecialSynchronizationCases : int {
+  GlobalDeviceSynchronization     = 1,
+  DeepCopyResourceSynchronization = 2,
+};
+
 enum struct DeviceType {
   Serial,
   OpenMP,
@@ -68,15 +76,49 @@ enum struct DeviceType {
   Unknown
 };
 
+struct ExecutionSpaceIdentifier {
+  DeviceType type;
+  uint32_t device_id;
+  uint32_t instance_id;
+};
+inline DeviceType devicetype_from_uint32t(const uint32_t in) {
+  switch (in) {
+    case 0: return DeviceType::Serial;
+    case 1: return DeviceType::OpenMP;
+    case 2: return DeviceType::Cuda;
+    case 3: return DeviceType::HIP;
+    case 4: return DeviceType::OpenMPTarget;
+    case 5: return DeviceType::HPX;
+    case 6: return DeviceType::Threads;
+    case 7: return DeviceType::SYCL;
+    default: return DeviceType::Unknown;  // TODO: error out?
+  }
+}
+
+inline ExecutionSpaceIdentifier identifier_from_devid(const uint32_t in) {
+  // ExecutionSpaceIdentifier out;
+  // out.type = in >> 24;
+  // out.device_id = in >> 17;
+  // out.instance_id = ((uint32_t(-1)) << 17 ) & in;
+  return {devicetype_from_uint32t(in >> 24),
+          (~((uint32_t(-1)) << 24)) & (in >> 17),
+          (~((uint32_t(-1)) << 17)) & in};
+}
+
 template <typename ExecutionSpace>
 struct DeviceTypeTraits;
 
 constexpr const size_t device_type_bits = 8;
 constexpr const size_t instance_bits    = 24;
 template <typename ExecutionSpace>
+constexpr uint32_t device_id_root() {
+  constexpr auto device_id =
+      static_cast<uint32_t>(DeviceTypeTraits<ExecutionSpace>::id);
+  return (device_id << instance_bits);
+}
+template <typename ExecutionSpace>
 inline uint32_t device_id(ExecutionSpace const& space) noexcept {
-  auto device_id = static_cast<uint32_t>(DeviceTypeTraits<ExecutionSpace>::id);
-  return (device_id << instance_bits) + space.impl_instance_id();
+  return device_id_root<ExecutionSpace>() + space.impl_instance_id();
 }
 }  // namespace Experimental
 }  // namespace Tools
diff --git a/lib/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp b/lib/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp
new file mode 100644
index 0000000000..b67cede45b
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp
@@ -0,0 +1,187 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QUAD_PRECISION_MATH_HPP
+#define KOKKOS_QUAD_PRECISION_MATH_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#if defined(KOKKOS_ENABLE_LIBQUADMATH)
+
+#include <Kokkos_NumericTraits.hpp>
+
+#include <quadmath.h>
+
+#if !(defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+#error __float128 not supported on this host
+#endif
+
+//<editor-fold desc="numeric traits __float128 specializations">
+namespace Kokkos {
+namespace Experimental {
+#if defined(KOKKOS_ENABLE_CXX17)
+#define KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(TRAIT, TYPE, VALUE_TYPE, VALUE) \
+  template <>                                                                \
+  struct TRAIT<TYPE> {                                                       \
+    static constexpr VALUE_TYPE value = VALUE;                               \
+  };                                                                         \
+  template <>                                                                \
+  inline constexpr auto TRAIT##_v<TYPE> = TRAIT<TYPE>::value;
+#else
+#define KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(TRAIT, TYPE, VALUE_TYPE, VALUE) \
+  template <>                                                                \
+  struct TRAIT<TYPE> {                                                       \
+    static constexpr VALUE_TYPE value = VALUE;                               \
+  };
+#endif
+
+// clang-format off
+// Numeric distinguished value traits
+// Workaround GCC bug https://godbolt.org/z/qWb5oe4dx
+// error: '__builtin_huge_valq()' is not a constant expression
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 710)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(infinity,       __float128, __float128, HUGE_VALQ)
+#endif
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(finite_min,     __float128, __float128, -FLT128_MAX)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(finite_max,     __float128, __float128, FLT128_MAX)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(epsilon,        __float128, __float128, FLT128_EPSILON)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(round_error,    __float128, __float128, static_cast<__float128>(0.5))
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(norm_min,       __float128, __float128, FLT128_MIN)
+
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(digits,         __float128,        int, FLT128_MANT_DIG)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(digits10,       __float128,        int, FLT128_DIG)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(max_digits10,   __float128,        int, 36)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(radix,          __float128,        int, 2)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(min_exponent,   __float128,        int, FLT128_MIN_EXP)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(max_exponent,   __float128,        int, FLT128_MAX_EXP)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(min_exponent10, __float128,        int, FLT128_MIN_10_EXP)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(max_exponent10, __float128,        int, FLT128_MAX_10_EXP)
+// clang-format on
+
+#undef KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT
+}  // namespace Experimental
+}  // namespace Kokkos
+//</editor-fold>
+
+namespace Kokkos {
+template <>
+struct reduction_identity<__float128> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 sum() {
+    return static_cast<__float128>(0.0);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 prod() {
+    return static_cast<__float128>(1.0);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 max() {
+    return -FLT128_MAX;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 min() {
+    return FLT128_MAX;
+  }
+};
+}  // namespace Kokkos
+
+//<editor-fold desc="Common mathematical functions __float128 overloads">
+namespace Kokkos {
+namespace Experimental {
+// clang-format off
+// Basic operations
+inline __float128 fabs(__float128 x) { return ::fabsq(x); }
+inline __float128 fmod(__float128 x, __float128 y) { return ::fmodq(x, y); }
+inline __float128 remainder(__float128 x, __float128 y) { return ::remainderq(x, y); }
+inline __float128 fmin(__float128 x, __float128 y) { return ::fminq(x, y); }
+inline __float128 fmax(__float128 x, __float128 y) { return ::fmaxq(x, y); }
+inline __float128 fdim(__float128 x, __float128 y) { return ::fdimq(x, y); }
+inline __float128 nanq(char const* arg) { return ::nanq(arg); }
+// Power functions
+inline __float128 pow(__float128 x, __float128 y) { return ::powq(x, y); }
+inline __float128 sqrt(__float128 x) { return ::sqrtq(x); }
+inline __float128 cbrt(__float128 x) { return ::cbrtq(x); }
+inline __float128 hypot(__float128 x, __float128 y) { return ::hypotq(x, y); }
+// Exponential functions
+inline __float128 exp(__float128 x) { return ::expq(x); }
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 910)
+inline __float128 exp2(__float128 x) { return ::exp2q(x); }
+#endif
+inline __float128 expm1(__float128 x) { return ::expm1q(x); }
+inline __float128 log(__float128 x) { return ::logq(x); }
+inline __float128 log10(__float128 x) { return ::log10q(x); }
+inline __float128 log2(__float128 x) { return ::log2q(x); }
+inline __float128 log1p(__float128 x) { return ::log1pq(x); }
+// Trigonometric functions
+inline __float128 sin(__float128 x) { return ::sinq(x); }
+inline __float128 cos(__float128 x) { return ::cosq(x); }
+inline __float128 tan(__float128 x) { return ::tanq(x); }
+inline __float128 asin(__float128 x) { return ::asinq(x); }
+inline __float128 acos(__float128 x) { return ::acosq(x); }
+inline __float128 atan(__float128 x) { return ::atanq(x); }
+inline __float128 atan2(__float128 x, __float128 y) { return ::atan2q(x, y); }
+// Hyperbolic functions
+inline __float128 sinh(__float128 x) { return ::sinhq(x); }
+inline __float128 cosh(__float128 x) { return ::coshq(x); }
+inline __float128 tanh(__float128 x) { return ::tanhq(x); }
+inline __float128 asinh(__float128 x) { return ::asinhq(x); }
+inline __float128 acosh(__float128 x) { return ::acoshq(x); }
+inline __float128 atanh(__float128 x) { return ::atanhq(x); }
+// Error and gamma functions
+inline __float128 erf(__float128 x) { return ::erfq(x); }
+inline __float128 erfc(__float128 x) { return ::erfcq(x); }
+inline __float128 tgamma(__float128 x) { return ::tgammaq(x); }
+inline __float128 lgamma(__float128 x) { return ::lgammaq(x); }
+// Nearest integer floating point operations
+inline __float128 ceil(__float128 x) { return ::ceilq(x); }
+inline __float128 floor(__float128 x) { return ::floorq(x); }
+inline __float128 trunc(__float128 x) { return ::truncq(x); }
+inline __float128 nearbyint(__float128 x) { return ::nearbyintq(x); }
+// Classification and comparison
+inline bool isfinite(__float128 x) { return !::isinfq(x); }  // isfiniteq not provided
+inline bool isinf(__float128 x) { return ::isinfq(x); }
+inline bool isnan(__float128 x) { return ::isnanq(x); }
+}  // namespace Experimental
+}  // namespace Kokkos
+//</editor-fold>
+
+#endif
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
index 4bd0379065..c49e838d8f 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
@@ -58,28 +58,59 @@
 
 namespace Kokkos {
 namespace Impl {
-namespace {
 
-HostThreadTeamData g_serial_thread_team_data;
+bool SerialInternal::is_initialized() { return m_is_initialized; }
 
-bool g_serial_is_initialized = false;
+void SerialInternal::initialize() {
+  if (is_initialized()) return;
 
-}  // namespace
+  Impl::SharedAllocationRecord<void, void>::tracking_enable();
+
+  // Init the array of locks used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+
+  m_is_initialized = true;
+}
+
+void SerialInternal::finalize() {
+  if (m_thread_team_data.scratch_buffer()) {
+    m_thread_team_data.disband_team();
+    m_thread_team_data.disband_pool();
+
+    Kokkos::HostSpace space;
+
+    space.deallocate(m_thread_team_data.scratch_buffer(),
+                     m_thread_team_data.scratch_bytes());
+
+    m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0);
+  }
+
+  Kokkos::Profiling::finalize();
+
+  m_is_initialized = false;
+}
+
+SerialInternal& SerialInternal::singleton() {
+  static SerialInternal* self = nullptr;
+  if (!self) {
+    self = new SerialInternal();
+  }
+  return *self;
+}
 
 // Resize thread team data scratch memory
-void serial_resize_thread_team_data(size_t pool_reduce_bytes,
-                                    size_t team_reduce_bytes,
-                                    size_t team_shared_bytes,
-                                    size_t thread_local_bytes) {
+void SerialInternal::resize_thread_team_data(size_t pool_reduce_bytes,
+                                             size_t team_reduce_bytes,
+                                             size_t team_shared_bytes,
+                                             size_t thread_local_bytes) {
   if (pool_reduce_bytes < 512) pool_reduce_bytes = 512;
   if (team_reduce_bytes < 512) team_reduce_bytes = 512;
 
-  const size_t old_pool_reduce = g_serial_thread_team_data.pool_reduce_bytes();
-  const size_t old_team_reduce = g_serial_thread_team_data.team_reduce_bytes();
-  const size_t old_team_shared = g_serial_thread_team_data.team_shared_bytes();
-  const size_t old_thread_local =
-      g_serial_thread_team_data.thread_local_bytes();
-  const size_t old_alloc_bytes = g_serial_thread_team_data.scratch_bytes();
+  const size_t old_pool_reduce  = m_thread_team_data.pool_reduce_bytes();
+  const size_t old_team_reduce  = m_thread_team_data.team_reduce_bytes();
+  const size_t old_team_shared  = m_thread_team_data.team_shared_bytes();
+  const size_t old_thread_local = m_thread_team_data.thread_local_bytes();
+  const size_t old_alloc_bytes  = m_thread_team_data.scratch_bytes();
 
   // Allocate if any of the old allocation is tool small:
 
@@ -92,12 +123,12 @@ void serial_resize_thread_team_data(size_t pool_reduce_bytes,
     Kokkos::HostSpace space;
 
     if (old_alloc_bytes) {
-      g_serial_thread_team_data.disband_team();
-      g_serial_thread_team_data.disband_pool();
+      m_thread_team_data.disband_team();
+      m_thread_team_data.disband_pool();
 
       space.deallocate("Kokkos::Serial::scratch_mem",
-                       g_serial_thread_team_data.scratch_buffer(),
-                       g_serial_thread_team_data.scratch_bytes());
+                       m_thread_team_data.scratch_buffer(),
+                       m_thread_team_data.scratch_bytes());
     }
 
     if (pool_reduce_bytes < old_pool_reduce) {
@@ -125,56 +156,37 @@ void serial_resize_thread_team_data(size_t pool_reduce_bytes,
       Kokkos::Impl::throw_runtime_exception(failure.get_error_message());
     }
 
-    g_serial_thread_team_data.scratch_assign(
-        ((char*)ptr), alloc_bytes, pool_reduce_bytes, team_reduce_bytes,
-        team_shared_bytes, thread_local_bytes);
+    m_thread_team_data.scratch_assign(static_cast<char*>(ptr), alloc_bytes,
+                                      pool_reduce_bytes, team_reduce_bytes,
+                                      team_shared_bytes, thread_local_bytes);
 
-    HostThreadTeamData* pool[1] = {&g_serial_thread_team_data};
+    HostThreadTeamData* pool[1] = {&m_thread_team_data};
 
-    g_serial_thread_team_data.organize_pool(pool, 1);
-    g_serial_thread_team_data.organize_team(1);
+    m_thread_team_data.organize_pool(pool, 1);
+    m_thread_team_data.organize_team(1);
   }
 }
-
-HostThreadTeamData* serial_get_thread_team_data() {
-  return &g_serial_thread_team_data;
-}
-
 }  // namespace Impl
-}  // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
+Serial::Serial()
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+    : m_space_instance(&Impl::SerialInternal::singleton()) {
+}
+#else
+    : m_space_instance(&Impl::SerialInternal::singleton(),
+                       [](Impl::SerialInternal*) {}) {
+}
+#endif
 
-namespace Kokkos {
-
-bool Serial::impl_is_initialized() { return Impl::g_serial_is_initialized; }
+bool Serial::impl_is_initialized() {
+  return Impl::SerialInternal::singleton().is_initialized();
+}
 
 void Serial::impl_initialize() {
-  Impl::SharedAllocationRecord<void, void>::tracking_enable();
-
-  // Init the array of locks used for arbitrarily sized atomics
-  Impl::init_lock_array_host_space();
-
-  Impl::g_serial_is_initialized = true;
+  Impl::SerialInternal::singleton().initialize();
 }
 
-void Serial::impl_finalize() {
-  if (Impl::g_serial_thread_team_data.scratch_buffer()) {
-    Impl::g_serial_thread_team_data.disband_team();
-    Impl::g_serial_thread_team_data.disband_pool();
-
-    Kokkos::HostSpace space;
-
-    space.deallocate(Impl::g_serial_thread_team_data.scratch_buffer(),
-                     Impl::g_serial_thread_team_data.scratch_bytes());
-
-    Impl::g_serial_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0);
-  }
-
-  Kokkos::Profiling::finalize();
-
-  Impl::g_serial_is_initialized = false;
-}
+void Serial::impl_finalize() { Impl::SerialInternal::singleton().finalize(); }
 
 const char* Serial::name() { return "Serial"; }
 
@@ -198,6 +210,9 @@ void SerialSpaceInitializer::finalize(const bool) {
 }
 
 void SerialSpaceInitializer::fence() { Kokkos::Serial::impl_static_fence(); }
+void SerialSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Serial::impl_static_fence(name);
+}
 
 void SerialSpaceInitializer::print_configuration(std::ostream& msg,
                                                  const bool detail) {
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
index 3ac3899aca..be732f4486 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
@@ -76,14 +76,18 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Serial, QueueType> > {
   static void execute(scheduler_type const& scheduler) {
     using task_base_type = typename scheduler_type::task_base_type;
 
-    // Set default buffers
-    serial_resize_thread_team_data(0,   /* global reduce buffer */
-                                   512, /* team reduce buffer */
-                                   0,   /* team shared buffer */
-                                   0    /* thread local buffer */
-    );
+    auto const& serial_execution_space = scheduler.get_execution_space();
 
-    Impl::HostThreadTeamData& self = *Impl::serial_get_thread_team_data();
+    // Set default buffers
+    serial_execution_space.impl_internal_space_instance()
+        ->resize_thread_team_data(0,   /* global reduce buffer */
+                                  512, /* team reduce buffer */
+                                  0,   /* team shared buffer */
+                                  0    /* thread local buffer */
+        );
+
+    auto& self = serial_execution_space.impl_internal_space_instance()
+                     ->m_thread_team_data;
 
     auto& queue         = scheduler.queue();
     auto team_scheduler = scheduler.get_team_scheduler(0);
@@ -147,9 +151,11 @@ class TaskQueueSpecializationConstrained<
 
     task_base_type* const end = (task_base_type*)task_base_type::EndTag;
 
-    Impl::HostThreadTeamData* const data = Impl::serial_get_thread_team_data();
+    execution_space serial_execution_space;
+    auto& data = serial_execution_space.impl_internal_space_instance()
+                     ->m_thread_team_data;
 
-    member_type exec(scheduler, *data);
+    member_type exec(scheduler, data);
 
     // Loop until no runnable task
 
@@ -181,18 +187,22 @@ class TaskQueueSpecializationConstrained<
 
     task_base_type* const end = (task_base_type*)task_base_type::EndTag;
 
+    execution_space serial_execution_space;
+
     // Set default buffers
-    serial_resize_thread_team_data(0,   /* global reduce buffer */
-                                   512, /* team reduce buffer */
-                                   0,   /* team shared buffer */
-                                   0    /* thread local buffer */
-    );
+    serial_execution_space.impl_internal_space_instance()
+        ->resize_thread_team_data(0,   /* global reduce buffer */
+                                  512, /* team reduce buffer */
+                                  0,   /* team shared buffer */
+                                  0    /* thread local buffer */
+        );
 
     auto* const queue = scheduler.m_queue;
 
-    Impl::HostThreadTeamData* const data = Impl::serial_get_thread_team_data();
+    auto& data = serial_execution_space.impl_internal_space_instance()
+                     ->m_thread_team_data;
 
-    member_type exec(scheduler, *data);
+    member_type exec(scheduler, data);
 
     // Loop until all queues are empty
     while (0 < queue->m_ready_count) {
@@ -210,16 +220,6 @@ class TaskQueueSpecializationConstrained<
 
         (*task->m_apply)(task, &exec);
 
-#if 0
-        printf( "TaskQueue<Serial>::executed: 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
-        , uintptr_t(task)
-        , uintptr_t(task->m_wait)
-        , uintptr_t(task->m_next)
-        , task->m_task_type
-        , task->m_priority
-        , task->m_ref_count );
-#endif
-
         // If a respawn then re-enqueue otherwise the task is complete
         // and all tasks waiting on this task are updated.
         queue->complete(task);
diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
index 917ae72081..3efff98e45 100644
--- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
@@ -259,6 +259,9 @@ SharedAllocationRecord<void, void>* SharedAllocationRecord<
     while ((root_next = Kokkos::atomic_exchange(&arg_record->m_root->m_next,
                                                 zero)) == nullptr)
       ;
+    // We need a memory_fence() here so that the following update
+    // is properly sequenced
+    Kokkos::memory_fence();
 
     arg_record->m_next->m_prev = arg_record->m_prev;
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp b/lib/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
index 0773a0914b..7f222c92ca 100644
--- a/lib/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
@@ -55,7 +55,6 @@
 //----------------------------------------------------------------------------
 
 #include <Kokkos_MemoryPool.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <Kokkos_Future.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
diff --git a/lib/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
index a0eccffb62..0584cd29eb 100644
--- a/lib/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
@@ -58,8 +58,7 @@
 
 #include <impl/Kokkos_TaskQueueMemoryManager.hpp>
 #include <impl/Kokkos_TaskQueueCommon.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
deleted file mode 100644
index eea4c93866..0000000000
--- a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TAGS_HPP
-#define KOKKOS_TAGS_HPP
-
-#include <impl/Kokkos_Traits.hpp>
-#include <Kokkos_Core_fwd.hpp>
-#include <type_traits>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/** KOKKOS_IMPL_HAS_TYPE( Type )
- *
- * defines a meta-function that check if a type expose an internal alias which
- * matches Type
- *
- * e.g.
- *   KOKKOS_IMPL_HAS_TYPE( array_layout );
- *   struct Foo { using array_layout = void; };
- *   have_array_layout<Foo>::value == 1;
- */
-#define KOKKOS_IMPL_HAS_TYPE(TYPE)                                             \
-  template <typename T>                                                        \
-  struct have_##TYPE {                                                         \
-   private:                                                                    \
-    template <typename U, typename = void>                                     \
-    struct X : std::false_type {};                                             \
-    template <typename U>                                                      \
-    struct X<U, typename std::conditional<true, void, typename X::TYPE>::type> \
-        : std::true_type {};                                                   \
-                                                                               \
-   public:                                                                     \
-    using type = typename X<T>::type;                                          \
-    enum : bool { value = type::value };                                       \
-  };
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <typename T>
-using is_void = std::is_same<void, T>;
-
-}
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskBase.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskBase.hpp
index 2d0f62a563..06581052a8 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskBase.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskBase.hpp
@@ -203,14 +203,17 @@ class TaskBase {
 
     // Assign dependence to m_next.  It will be processed in the subsequent
     // call to schedule.  Error if the dependence is reset.
-    if (lock != Kokkos::atomic_exchange(&m_next, dep)) {
+    if (lock != Kokkos::Impl::desul_atomic_exchange(
+                    &m_next, dep, Kokkos::Impl::MemoryOrderSeqCst(),
+                    Kokkos::Impl::MemoryScopeDevice())) {
       Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
     }
-
     if (nullptr != dep) {
       // The future may be destroyed upon returning from this call
       // so increment reference count to track this assignment.
-      Kokkos::atomic_increment(&(dep->m_ref_count));
+      Kokkos::Impl::desul_atomic_inc(&(dep->m_ref_count),
+                                     Kokkos::Impl::MemoryOrderSeqCst(),
+                                     Kokkos::Impl::MemoryScopeDevice());
     }
   }
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskNode.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskNode.hpp
index 42afa93cdc..caf1d0a84b 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskNode.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskNode.hpp
@@ -151,6 +151,7 @@ class ReferenceCountedBase {
   bool decrement_and_check_reference_count() {
     // TODO @tasking @memory_order DSH memory order
     auto old_count = Kokkos::atomic_fetch_add(&m_ref_count, -1);
+    Kokkos::memory_fence();
 
     KOKKOS_ASSERT(old_count > 0 && "reference count greater less than zero!");
 
@@ -158,7 +159,11 @@ class ReferenceCountedBase {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void increment_reference_count() { Kokkos::atomic_increment(&m_ref_count); }
+  void increment_reference_count() {
+    Kokkos::Impl::desul_atomic_inc(&m_ref_count,
+                                   Kokkos::Impl::MemoryOrderSeqCst(),
+                                   Kokkos::Impl::MemoryScopeDevice());
+  }
 };
 
 template <class TaskQueueTraits, class SchedulingInfo>
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
index c0d2eca9c1..e74e84a2e5 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
@@ -58,8 +58,7 @@
 #include <impl/Kokkos_TaskBase.hpp>
 #include <impl/Kokkos_TaskResult.hpp>
 
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -188,25 +187,11 @@ class TaskQueue : public TaskQueueBase {
   // Assign task pointer with reference counting of assigned tasks
   KOKKOS_FUNCTION static void assign(task_root_type** const lhs,
                                      task_root_type* const rhs) {
-#if 0
-  {
-    printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
-          , uintptr_t( lhs ? *lhs : 0 )
-          , uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 )
-          , int( lhs && *lhs ? (*lhs)->m_task_type : 0 )
-          , int( lhs && *lhs ? (*lhs)->m_ref_count : 0 )
-          , uintptr_t(rhs)
-          , uintptr_t( rhs ? rhs->m_next : 0 )
-          , int( rhs ? rhs->m_task_type : 0 )
-          , int( rhs ? rhs->m_ref_count : 0 )
-          );
-    fflush( stdout );
-  }
-#endif
-
     if (*lhs) decrement(*lhs);
     if (rhs) {
-      Kokkos::atomic_increment(&(rhs->m_ref_count));
+      Kokkos::Impl::desul_atomic_inc(&rhs->m_ref_count,
+                                     Kokkos::Impl::MemoryOrderSeqCst(),
+                                     Kokkos::Impl::MemoryScopeDevice());
     }
 
     // Force write of *lhs
@@ -234,13 +219,7 @@ class TaskQueue : public TaskQueueBase {
 
     using task_type = Impl::Task<execution_space, value_type, FunctorType>;
 
-    enum : size_t { align = (1 << 4), align_mask = align - 1 };
-    enum : size_t { task_size = sizeof(task_type) };
-    enum : size_t { result_size = Impl::TaskResult<value_type>::size };
-    enum : size_t {
-      alloc_size = ((task_size + align_mask) & ~align_mask) +
-                   ((result_size + align_mask) & ~align_mask)
-    };
+    constexpr size_t task_size = sizeof(task_type);
 
     return m_memory.allocate_block_size(task_size);
   }
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
index cae06d4ea5..757e5f9886 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
@@ -57,8 +57,7 @@
 #include <impl/Kokkos_TaskResult.hpp>
 
 #include <impl/Kokkos_TaskQueueMemoryManager.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -88,6 +87,7 @@ class TaskQueueCommonMixin {
   // <editor-fold desc="Constructors, destructor, and assignment"> {{{2
 
   TaskQueueCommonMixin() : m_ready_count(0) {
+    Kokkos::memory_fence();
     // TODO @tasking @memory_order DSH figure out if I need this store to be
     // atomic
   }
@@ -158,14 +158,17 @@ class TaskQueueCommonMixin {
   KOKKOS_INLINE_FUNCTION
   void _increment_ready_count() {
     // TODO @tasking @memory_order DSH memory order
-    Kokkos::atomic_increment(&this->m_ready_count);
+    Kokkos::Impl::desul_atomic_inc(&this->m_ready_count,
+                                   Kokkos::Impl::MemoryOrderSeqCst(),
+                                   Kokkos::Impl::MemoryScopeDevice());
   }
 
   KOKKOS_INLINE_FUNCTION
   void _decrement_ready_count() {
     // TODO @tasking @memory_order DSH memory order
-    Kokkos::atomic_decrement(&this->m_ready_count);
-    Kokkos::memory_fence();
+    Kokkos::Impl::desul_atomic_dec(&this->m_ready_count,
+                                   Kokkos::Impl::MemoryOrderSeqCst(),
+                                   Kokkos::Impl::MemoryScopeDevice());
   }
 
  public:
@@ -476,7 +479,7 @@ class TaskQueueCommonMixin {
   }
 
   template <class ExecutionSpace, class MemorySpace, class MemoryPool>
-  static /* KOKKOS_CONSTEXPR_14 */ size_t task_queue_allocation_size(
+  static /* constexpr */ size_t task_queue_allocation_size(
       ExecutionSpace const&, MemorySpace const&, MemoryPool const&)
   // requires Same<ExecutionSpace, typename Derived::execution_space>
   //            && Same<MemorySpace, typename Derived::memory_space>
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
index 6e2481f935..3a71aa17e6 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
@@ -56,8 +56,7 @@
 #include <impl/Kokkos_TaskBase.hpp>
 #include <impl/Kokkos_TaskResult.hpp>
 
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -103,8 +102,9 @@ class TaskQueueMemoryManager : public TaskQueueBase {
     } else {
       void* data = m_pool.allocate(static_cast<size_t>(requested_size));
 
-      // Kokkos::atomic_increment(&m_accum_alloc); // memory_order_relaxed
-      Kokkos::atomic_increment(&m_count_alloc);  // memory_order_relaxed
+      Kokkos::Impl::desul_atomic_inc(
+          &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+          Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
       // TODO @tasking @minor DSH make this thread safe? (otherwise, it's just
       // an approximation, which is probably fine...)
       if (m_max_alloc < m_count_alloc) m_max_alloc = m_count_alloc;
@@ -200,7 +200,9 @@ class TaskQueueMemoryManager : public TaskQueueBase {
   KOKKOS_INLINE_FUNCTION void deallocate(
       PoolAllocatedObjectBase<CountType>&& obj) {
     m_pool.deallocate((void*)&obj, 1);
-    Kokkos::atomic_decrement(&m_count_alloc);  // memory_order_relaxed
+    Kokkos::Impl::desul_atomic_dec(
+        &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
   }
 
   KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
index efee3d051d..5f98e8d85e 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
@@ -59,9 +59,7 @@
 #include <impl/Kokkos_TaskResult.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
 
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
-#include <impl/Kokkos_Atomic_Decrement.hpp>
+#include <Kokkos_Atomic.hpp>
 
 #include <string>
 #include <typeinfo>
@@ -159,8 +157,14 @@ class TaskQueueMultiple : public TaskQueue<ExecSpace, MemorySpace> {
               // task stolen.
               // first increment our ready count, then decrement the ready count
               // on the other queue:
-              Kokkos::atomic_increment(&this->m_ready_count);
-              Kokkos::atomic_decrement(&steal_from.m_ready_count);
+              Kokkos::Impl::desul_atomic_inc(
+                  &this->m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+                  Kokkos::Impl::MemoryScopeDevice());  // TODO?
+                                                       // memory_order_relaxed
+              Kokkos::Impl::desul_atomic_dec(
+                  &steal_from.m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+                  Kokkos::Impl::MemoryScopeDevice());  // TODO?
+                                                       // memory_order_relaxed
               return rv;
             }
           }
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
index a87e5f7272..324227cf5e 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
@@ -105,6 +105,7 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::decrement(
   task_root_type volatile &t = *task;
 
   const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count), -1);
+  Kokkos::memory_fence();
 
 #if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
   if (1 == count) {
@@ -146,8 +147,9 @@ KOKKOS_FUNCTION void *TaskQueue<ExecSpace, MemorySpace>::allocate(size_t n) {
   void *const p = m_memory.allocate(n);
 
   if (p) {
-    // Kokkos::atomic_increment( & m_accum_alloc );
-    Kokkos::atomic_increment(&m_count_alloc);
+    Kokkos::Impl::desul_atomic_inc(
+        &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
 
     // if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
   }
@@ -159,7 +161,9 @@ template <typename ExecSpace, typename MemorySpace>
 KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::deallocate(void *p,
                                                                    size_t n) {
   m_memory.deallocate(p, n);
-  Kokkos::atomic_decrement(&m_count_alloc);
+  Kokkos::Impl::desul_atomic_dec(
+      &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+      Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
 }
 
 //----------------------------------------------------------------------------
@@ -210,7 +214,9 @@ KOKKOS_FUNCTION bool TaskQueue<ExecSpace, MemorySpace>::push_task(
     //     *queue = task;
     //   }
     //   old_head = *queue;
-    old_head = Kokkos::atomic_compare_exchange(queue, old_head, task);
+    old_head = Kokkos::Impl::desul_atomic_compare_exchange(
+        const_cast<task_root_type **>(queue), old_head, task,
+        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
 
     if (old_head_tmp == old_head) return true;
   }
@@ -258,7 +264,10 @@ TaskQueue<ExecSpace, MemorySpace>::pop_ready_task(
 
     task_root_type *const x = task;
 
-    task = Kokkos::atomic_compare_exchange(queue, x, lock);
+    //    task = Kokkos::atomic_compare_exchange(queue, x, lock);
+    task = Kokkos::Impl::desul_atomic_compare_exchange(
+        const_cast<task_root_type **>(queue), x, lock,
+        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
 
     if (x == task) {
       // CAS succeeded and queue is locked
@@ -274,6 +283,8 @@ TaskQueue<ExecSpace, MemorySpace>::pop_ready_task(
       // This thread has exclusive access to
       // the queue and the popped task's m_next.
 
+      Kokkos::memory_fence();
+
       task_root_type *volatile &next = task->m_next;
 
       // This algorithm is not lockfree because a adversarial scheduler could
@@ -400,7 +411,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::schedule_runnable(
     // to track number of ready + executing tasks.
     // The ready count will be decremented when the task is complete.
 
-    Kokkos::atomic_increment(&m_ready_count);
+    Kokkos::Impl::desul_atomic_inc(
+        &m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
 
     task_root_type *volatile *const ready_queue =
         &m_ready[t.m_priority][t.m_task_type];
@@ -553,8 +566,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::reschedule(
 
   task_root_type *const zero = nullptr;
   task_root_type *const lock = (task_root_type *)task_root_type::LockTag;
-
-  if (lock != Kokkos::atomic_exchange(&task->m_next, zero)) {
+  if (lock != Kokkos::Impl::desul_atomic_exchange(
+                  &task->m_next, zero, Kokkos::Impl::MemoryOrderSeqCst(),
+                  Kokkos::Impl::MemoryScopeDevice())) {
     Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
   }
 }
@@ -601,8 +615,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::complete(
 
     // Stop other tasks from adding themselves to this task's wait queue
     // by locking the head of this task's wait queue.
-
-    task_root_type *x = Kokkos::atomic_exchange(&t.m_wait, lock);
+    task_root_type *x = Kokkos::Impl::desul_atomic_exchange(
+        const_cast<task_root_type **>(&t.m_wait), lock,
+        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
 
     if (x != (task_root_type *)lock) {
       // This thread has transitioned this 'task' to complete.
@@ -645,7 +660,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::complete(
     // A runnable task was popped from a ready queue and executed.
     // If respawned into a ready queue then the ready count was incremented
     // so decrement whether respawned or not.
-    Kokkos::atomic_decrement(&m_ready_count);
+    Kokkos::Impl::desul_atomic_dec(
+        &m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
   }
 }
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
index 2faab57949..f53dfe5a96 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
@@ -55,7 +55,6 @@
 //----------------------------------------------------------------------------
 
 #include <Kokkos_MemoryPool.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <Kokkos_Future.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
diff --git a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
index e8004ff852..6edf571d78 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
@@ -45,8 +45,13 @@
 #ifndef KOKKOS_IMPLWALLTIME_HPP
 #define KOKKOS_IMPLWALLTIME_HPP
 
+#include <Kokkos_Macros.hpp>
+
+KOKKOS_IMPL_WARNING("This file is deprecated. Use <Kokkos_Timer.hpp> instead.")
+
 #include <Kokkos_Timer.hpp>
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 namespace Kokkos {
 namespace Impl {
 
@@ -54,10 +59,11 @@ namespace Impl {
  *   Timer promoted from Impl to Kokkos ns
  *   This file included for backwards compatibility
  */
-
-using Kokkos::Timer;
+using Timer KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::Timer instead!") =
+    Kokkos::Timer;
 
 }  // namespace Impl
 }  // namespace Kokkos
+#endif
 
 #endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */
diff --git a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp
index cb8cf281ae..bea7c2c9d1 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp
@@ -65,13 +65,6 @@ struct identity {
 template <typename T>
 using identity_t = typename identity<T>::type;
 
-struct not_a_type {
-  not_a_type()                  = delete;
-  ~not_a_type()                 = delete;
-  not_a_type(not_a_type const&) = delete;
-  void operator=(not_a_type const&) = delete;
-};
-
 #if defined(__cpp_lib_void_t)
 // since C++17
 using std::void_t;
@@ -158,6 +151,112 @@ struct destruct_delete {
 template <class...>
 struct type_list;
 
+//------------------------------------------------------------------------------
+// <editor-fold desc="type_list_remove_first"> {{{2
+
+// Currently linear complexity; if we use this a lot, maybe make it better?
+
+template <class Entry, class InList, class OutList>
+struct _type_list_remove_first_impl;
+
+template <class Entry, class T, class... Ts, class... OutTs>
+struct _type_list_remove_first_impl<Entry, type_list<T, Ts...>,
+                                    type_list<OutTs...>>
+    : _type_list_remove_first_impl<Entry, type_list<Ts...>,
+                                   type_list<OutTs..., T>> {};
+
+template <class Entry, class... Ts, class... OutTs>
+struct _type_list_remove_first_impl<Entry, type_list<Entry, Ts...>,
+                                    type_list<OutTs...>>
+    : _type_list_remove_first_impl<Entry, type_list<>,
+                                   type_list<OutTs..., Ts...>> {};
+
+template <class Entry, class... OutTs>
+struct _type_list_remove_first_impl<Entry, type_list<>, type_list<OutTs...>>
+    : identity<type_list<OutTs...>> {};
+
+template <class Entry, class List>
+struct type_list_remove_first
+    : _type_list_remove_first_impl<Entry, List, type_list<>> {};
+
+// </editor-fold> end type_list_remove_first }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="type_list_any"> {{{2
+
+template <template <class> class UnaryPred, class List>
+struct type_list_any;
+
+#ifdef KOKKOS_ENABLE_CXX17
+template <template <class> class UnaryPred, class... Ts>
+struct type_list_any<UnaryPred, type_list<Ts...>>
+    : std::bool_constant<(UnaryPred<Ts>::value || ...)> {};
+#else
+template <template <class> class UnaryPred, class T, class... Ts>
+struct type_list_any<UnaryPred, type_list<T, Ts...>> {
+  using type = typename std::conditional_t<
+      UnaryPred<T>::value, std::true_type,
+      type_list_any<UnaryPred, type_list<Ts...>>>::type;
+  static constexpr auto value = type::value;
+};
+
+template <template <class> class UnaryPred>
+struct type_list_any<UnaryPred, type_list<>> : std::false_type {};
+
+#endif
+
+// </editor-fold> end type_list_any }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="concat_type_list"> {{{2
+//  concat_type_list combines types in multiple type_lists
+
+// forward declaration
+template <typename... T>
+struct concat_type_list;
+
+// alias
+template <typename... T>
+using concat_type_list_t = typename concat_type_list<T...>::type;
+
+// final instantiation
+template <typename... T>
+struct concat_type_list<type_list<T...>> {
+  using type = type_list<T...>;
+};
+
+// combine consecutive type_lists
+template <typename... T, typename... U, typename... Tail>
+struct concat_type_list<type_list<T...>, type_list<U...>, Tail...>
+    : concat_type_list<type_list<T..., U...>, Tail...> {};
+// </editor-fold> end concat_type_list }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="filter_type_list"> {{{2
+//  filter_type_list generates type-list of types which satisfy
+//  PredicateT<T>::value == ValueT
+
+template <template <typename> class PredicateT, typename TypeListT,
+          bool ValueT = true>
+struct filter_type_list;
+
+template <template <typename> class PredicateT, typename... T, bool ValueT>
+struct filter_type_list<PredicateT, type_list<T...>, ValueT> {
+  using type =
+      concat_type_list_t<std::conditional_t<PredicateT<T>::value == ValueT,
+                                            type_list<T>, type_list<>>...>;
+};
+
+template <template <typename> class PredicateT, typename T, bool ValueT = true>
+using filter_type_list_t =
+    typename filter_type_list<PredicateT, T, ValueT>::type;
+
+// </editor-fold> end filter_type_list }}}2
+//------------------------------------------------------------------------------
+
 // </editor-fold> end type_list }}}1
 //==============================================================================
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp b/lib/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
index 41607a2a8e..ace826dd5a 100644
--- a/lib/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
@@ -130,20 +130,20 @@ struct ObjectWithVLAEmulation {
   // CRTP boilerplate
 
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   Derived* _this() noexcept {
     return VLAEmulationAccess::_cast_to_derived(this);
   }
 
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   Derived const* _this() const noexcept {
     return VLAEmulationAccess::_cast_to_derived(this);
   }
 
   // Note: can't be constexpr because of reinterpret_cast
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   vla_value_type* _vla_pointer() noexcept {
     // The data starts right after the aligned storage of Derived
     return reinterpret_cast<vla_value_type*>(_this() + 1);
@@ -151,7 +151,7 @@ struct ObjectWithVLAEmulation {
 
   // Note: can't be constexpr because of reinterpret_cast
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   vla_value_type const* _vla_pointer() const noexcept {
     // The data starts right after the aligned storage of Derived
     return reinterpret_cast<vla_value_type const*>(_this() + 1);
@@ -159,7 +159,7 @@ struct ObjectWithVLAEmulation {
 
  public:
   KOKKOS_INLINE_FUNCTION
-  static /* KOKKOS_CONSTEXPR_14 */ size_t required_allocation_size(
+  static /* constexpr */ size_t required_allocation_size(
       vla_entry_count_type num_vla_entries) {
     KOKKOS_EXPECTS(num_vla_entries >= 0);
     return sizeof(Derived) + num_vla_entries * sizeof(VLAValueType);
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
index b9e32a04e0..797b3f584b 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
@@ -144,10 +144,10 @@ struct ViewCtorProp<typename std::enable_if<is_view_label<Label>::value>::type,
 };
 
 template <typename Space>
-struct ViewCtorProp<typename std::enable_if<
-                        Kokkos::Impl::is_memory_space<Space>::value ||
-                        Kokkos::Impl::is_execution_space<Space>::value>::type,
-                    Space> {
+struct ViewCtorProp<
+    typename std::enable_if<Kokkos::is_memory_space<Space>::value ||
+                            Kokkos::is_execution_space<Space>::value>::type,
+    Space> {
   ViewCtorProp()                     = default;
   ViewCtorProp(const ViewCtorProp &) = default;
   ViewCtorProp &operator=(const ViewCtorProp &) = default;
@@ -207,10 +207,10 @@ template <typename... P>
 struct ViewCtorProp : public ViewCtorProp<void, P>... {
  private:
   using var_memory_space =
-      Kokkos::Impl::has_condition<void, Kokkos::Impl::is_memory_space, P...>;
+      Kokkos::Impl::has_condition<void, Kokkos::is_memory_space, P...>;
 
   using var_execution_space =
-      Kokkos::Impl::has_condition<void, Kokkos::Impl::is_execution_space, P...>;
+      Kokkos::Impl::has_condition<void, Kokkos::is_execution_space, P...>;
 
   struct VOIDDUMMY {};
 
@@ -270,7 +270,6 @@ struct ViewCtorProp : public ViewCtorProp<void, P>... {
 
 namespace Kokkos {
 
-/* For backward compatibility */
 namespace Impl {
 struct ViewAllocateWithoutInitializingBackwardCompat {};
 
@@ -291,7 +290,6 @@ struct ViewCtorProp<WithoutInitializing_t, std::string,
 };
 } /* namespace Impl */
 
-/*[[deprecated(Use Kokkos::alloc(Kokkos::WithoutInitializing, label) instead]]*/
 using ViewAllocateWithoutInitializing =
     Impl::ViewCtorProp<Impl::WithoutInitializing_t, std::string,
                        Impl::ViewAllocateWithoutInitializingBackwardCompat>;
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
index a380a30693..9523118748 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -49,6 +49,7 @@
 #include <initializer_list>
 
 #include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_DetectionIdiom.hpp>
 #include <Kokkos_Pair.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_Extents.hpp>
@@ -862,7 +863,7 @@ struct ViewDataAnalysis {
 namespace Kokkos {
 namespace Impl {
 
-template <class Dimension, class Layout, typename Enable = void>
+template <class Dimension, class Layout, class Enable = void>
 struct ViewOffset {
   using is_mapping_plugin = std::false_type;
 };
@@ -1389,7 +1390,8 @@ struct ViewOffset<
     KOKKOS_INLINE_FUNCTION
     static constexpr size_t stride(size_t const N) {
       return ((align != 0) &&
-              ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) &&
+              ((static_cast<int>(Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD) *
+                static_cast<int>(align)) < N) &&
               ((N % div_ok) != 0))
                  ? N + align - (N % div_ok)
                  : N;
@@ -2022,7 +2024,8 @@ struct ViewOffset<
     KOKKOS_INLINE_FUNCTION
     static constexpr size_t stride(size_t const N) {
       return ((align != 0) &&
-              ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) &&
+              ((static_cast<int>(Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD) *
+                static_cast<int>(align)) < N) &&
               ((N % div_ok) != 0))
                  ? N + align - (N % div_ok)
                  : N;
@@ -2816,6 +2819,22 @@ struct ViewDataHandle<
 namespace Kokkos {
 namespace Impl {
 
+template <typename T>
+inline bool is_zero_byte(const T& t) {
+  using comparison_type = std::conditional_t<
+      sizeof(T) % sizeof(long long int) == 0, long long int,
+      std::conditional_t<
+          sizeof(T) % sizeof(long int) == 0, long int,
+          std::conditional_t<
+              sizeof(T) % sizeof(int) == 0, int,
+              std::conditional_t<sizeof(T) % sizeof(short int) == 0, short int,
+                                 char>>>>;
+  const auto* const ptr = reinterpret_cast<const comparison_type*>(&t);
+  for (std::size_t i = 0; i < sizeof(T) / sizeof(comparison_type); ++i)
+    if (ptr[i] != 0) return false;
+  return true;
+}
+
 //----------------------------------------------------------------------------
 
 /*
@@ -2826,16 +2845,16 @@ namespace Impl {
  *  called from the shared memory tracking destruction.
  *  Secondarily to have two fewer partial specializations.
  */
-template <class ExecSpace, class ValueType,
+template <class DeviceType, class ValueType,
           bool IsScalar = std::is_scalar<ValueType>::value>
 struct ViewValueFunctor;
 
-template <class ExecSpace, class ValueType>
-struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
+template <class DeviceType, class ValueType>
+struct ViewValueFunctor<DeviceType, ValueType, false /* is_scalar */> {
+  using ExecSpace  = typename DeviceType::execution_space;
   using PolicyType = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>>;
-  using Exec       = typename ExecSpace::execution_space;
 
-  Exec space;
+  ExecSpace space;
   ValueType* ptr;
   size_t n;
   bool destroy;
@@ -2864,11 +2883,50 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
         destroy(false),
         name(std::move(arg_name)) {}
 
-  void execute(bool arg) {
+  template <typename Dummy = ValueType>
+  std::enable_if_t<std::is_trivial<Dummy>::value &&
+                   std::is_trivially_copy_assignable<ValueType>::value>
+  construct_dispatch() {
+    ValueType value{};
+    if (Impl::is_zero_byte(value)) {
+      uint64_t kpID = 0;
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        // We are not really using parallel_for here but using beginParallelFor
+        // instead of begin_parallel_for (and adding "via memset") is the best
+        // we can do to indicate that this is not supposed to be tunable (and
+        // doesn't really execute a parallel_for).
+        Kokkos::Profiling::beginParallelFor(
+            "Kokkos::View::initialization [" + name + "] via memset",
+            Kokkos::Profiling::Experimental::device_id(space), &kpID);
+      }
+
+      (void)ZeroMemset<ExecSpace, ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(
+          space,
+          Kokkos::View<ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(ptr, n),
+          value);
+
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Profiling::endParallelFor(kpID);
+      }
+    } else {
+      parallel_for_implementation(false);
+    }
+  }
+
+  template <typename Dummy = ValueType>
+  std::enable_if_t<!(std::is_trivial<Dummy>::value &&
+                     std::is_trivially_copy_assignable<ValueType>::value)>
+  construct_dispatch() {
+    parallel_for_implementation(false);
+  }
+
+  void parallel_for_implementation(bool arg) {
     destroy = arg;
-    PolicyType policy(0, n);
-    std::string functor_name;
     if (!space.in_parallel()) {
+      PolicyType policy(0, n);
+      std::string functor_name;
       uint64_t kpID = 0;
       if (Kokkos::Profiling::profileLibraryLoaded()) {
         functor_name =
@@ -2877,6 +2935,7 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
         Kokkos::Tools::Impl::begin_parallel_for(policy, *this, functor_name,
                                                 kpID);
       }
+
 #ifdef KOKKOS_ENABLE_CUDA
       if (std::is_same<ExecSpace, Kokkos::Cuda>::value) {
         Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n,
@@ -2886,7 +2945,7 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
       const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure(
           *this, policy);
       closure.execute();
-      space.fence();
+      space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence");
       if (Kokkos::Profiling::profileLibraryLoaded()) {
         Kokkos::Tools::Impl::end_parallel_for(policy, *this, functor_name,
                                               kpID);
@@ -2896,13 +2955,14 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
     }
   }
 
-  void construct_shared_allocation() { execute(false); }
+  void construct_shared_allocation() { construct_dispatch(); }
 
-  void destroy_shared_allocation() { execute(true); }
+  void destroy_shared_allocation() { parallel_for_implementation(true); }
 };
 
-template <class ExecSpace, class ValueType>
-struct ViewValueFunctor<ExecSpace, ValueType, true /* is_scalar */> {
+template <class DeviceType, class ValueType>
+struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
+  using ExecSpace  = typename DeviceType::execution_space;
   using PolicyType = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>>;
 
   ExecSpace space;
@@ -2921,12 +2981,54 @@ struct ViewValueFunctor<ExecSpace, ValueType, true /* is_scalar */> {
                    size_t const arg_n, std::string arg_name)
       : space(arg_space), ptr(arg_ptr), n(arg_n), name(std::move(arg_name)) {}
 
-  void construct_shared_allocation() {
-    if (!space.in_parallel()) {
+  template <typename Dummy = ValueType>
+  std::enable_if_t<std::is_trivial<Dummy>::value &&
+                   std::is_trivially_copy_assignable<Dummy>::value>
+  construct_shared_allocation() {
+    // Shortcut for zero initialization
+    ValueType value{};
+    if (Impl::is_zero_byte(value)) {
       uint64_t kpID = 0;
       if (Kokkos::Profiling::profileLibraryLoaded()) {
+        // We are not really using parallel_for here but using beginParallelFor
+        // instead of begin_parallel_for (and adding "via memset") is the best
+        // we can do to indicate that this is not supposed to be tunable (and
+        // doesn't really execute a parallel_for).
         Kokkos::Profiling::beginParallelFor(
-            "Kokkos::View::initialization [" + name + "]", 0, &kpID);
+            "Kokkos::View::initialization [" + name + "] via memset",
+            Kokkos::Profiling::Experimental::device_id(space), &kpID);
+      }
+
+      (void)ZeroMemset<ExecSpace, ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(
+          space,
+          Kokkos::View<ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(ptr, n),
+          value);
+
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Profiling::endParallelFor(kpID);
+      }
+    } else {
+      parallel_for_implementation();
+    }
+  }
+
+  template <typename Dummy = ValueType>
+  std::enable_if_t<!(std::is_trivial<Dummy>::value &&
+                     std::is_trivially_copy_assignable<Dummy>::value)>
+  construct_shared_allocation() {
+    parallel_for_implementation();
+  }
+
+  void parallel_for_implementation() {
+    if (!space.in_parallel()) {
+      PolicyType policy(0, n);
+      std::string functor_name = "Kokkos::View::initialization [" + name + "]";
+      uint64_t kpID            = 0;
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Tools::Impl::begin_parallel_for(policy, *this, functor_name,
+                                                kpID);
       }
 #ifdef KOKKOS_ENABLE_CUDA
       if (std::is_same<ExecSpace, Kokkos::Cuda>::value) {
@@ -2937,9 +3039,11 @@ struct ViewValueFunctor<ExecSpace, ValueType, true /* is_scalar */> {
       const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure(
           *this, PolicyType(0, n));
       closure.execute();
-      space.fence();
+      space.fence(
+          "Kokkos::Impl::ViewValueFunctor: Fence after setting values in view");
       if (Kokkos::Profiling::profileLibraryLoaded()) {
-        Kokkos::Profiling::endParallelFor(kpID);
+        Kokkos::Tools::Impl::end_parallel_for(policy, *this, functor_name,
+                                              kpID);
       }
     } else {
       for (size_t i = 0; i < n; ++i) operator()(i);
@@ -3232,7 +3336,9 @@ class ViewMapping<
     using execution_space = typename alloc_prop::execution_space;
     using memory_space    = typename Traits::memory_space;
     using value_type      = typename Traits::value_type;
-    using functor_type    = ViewValueFunctor<execution_space, value_type>;
+    using functor_type =
+        ViewValueFunctor<Kokkos::Device<execution_space, memory_space>,
+                         value_type>;
     using record_type =
         Kokkos::Impl::SharedAllocationRecord<memory_space, functor_type>;
 
@@ -3314,17 +3420,10 @@ class ViewMapping<
                            Kokkos::LayoutStride>::value))))>::type> {
  private:
   enum {
-    is_assignable_space =
-#if 1
-        Kokkos::Impl::MemorySpaceAccess<
-            typename DstTraits::memory_space,
-            typename SrcTraits::memory_space>::assignable
+    is_assignable_space = Kokkos::Impl::MemorySpaceAccess<
+        typename DstTraits::memory_space,
+        typename SrcTraits::memory_space>::assignable
   };
-#else
-        std::is_same<typename DstTraits::memory_space,
-                     typename SrcTraits::memory_space>::value
-  };
-#endif
 
   enum {
     is_assignable_value_type =
@@ -3728,7 +3827,7 @@ class ViewMapping<
 
   template <class MemoryTraits>
   struct apply {
-    static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, "");
+    static_assert(Kokkos::is_memory_traits<MemoryTraits>::value, "");
 
     using traits_type =
         Kokkos::ViewTraits<data_type, array_layout,
@@ -3842,24 +3941,21 @@ struct OperatorBoundsErrorOnDevice<MapType, true> {
    this defined by default.
    The existence of this alias indicates the existence of MapType::is_managed
  */
-template <class T, class Enable = void>
-struct has_printable_label_typedef : public std::false_type {};
-
 template <class T>
-struct has_printable_label_typedef<T,
-                                   void_t<typename T::printable_label_typedef>>
-    : public std::true_type {};
+using printable_label_typedef_t = typename T::printable_label_typedef;
 
-template <class MapType>
-KOKKOS_INLINE_FUNCTION void operator_bounds_error_on_device(MapType const&,
-                                                            std::false_type) {
+template <class Map>
+KOKKOS_FUNCTION
+    std::enable_if_t<!is_detected<printable_label_typedef_t, Map>::value>
+    operator_bounds_error_on_device(Map const&) {
   Kokkos::abort("View bounds error");
 }
 
-template <class MapType>
-KOKKOS_INLINE_FUNCTION void operator_bounds_error_on_device(MapType const& map,
-                                                            std::true_type) {
-  OperatorBoundsErrorOnDevice<MapType>::run(map);
+template <class Map>
+KOKKOS_FUNCTION
+    std::enable_if_t<is_detected<printable_label_typedef_t, Map>::value>
+    operator_bounds_error_on_device(Map const& map) {
+  OperatorBoundsErrorOnDevice<Map>::run(map);
 }
 
 #endif  // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
@@ -3885,8 +3981,7 @@ KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds(
        This check should cover the case of Views that don't
        have the Unmanaged trait but were initialized by pointer. */
     if (tracker.m_tracker.has_record()) {
-      operator_bounds_error_on_device<MapType>(
-          map, has_printable_label_typedef<MapType>());
+      operator_bounds_error_on_device(map);
     } else {
       Kokkos::abort("View bounds error");
     }
diff --git a/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
index a5f5406746..d964baa8fb 100644
--- a/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
+++ b/lib/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
@@ -62,10 +62,10 @@ void sink(Args&&... args) {
     Kokkos::ImplSYCL::sink(__VA_ARGS__);   \
   } while (0)
 #else
-#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...)                       \
-  do {                                                                   \
-    static const __attribute__((opencl_constant)) char fmt[] = (format); \
-    sycl::ONEAPI::experimental::printf(fmt, ##__VA_ARGS__);              \
+#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...)                \
+  do {                                                            \
+    const __attribute__((opencl_constant)) char fmt[] = (format); \
+    sycl::ONEAPI::experimental::printf(fmt, ##__VA_ARGS__);       \
   } while (0)
 #endif
 #endif
diff --git a/lib/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
index 4467b2e03c..e12d1f6a49 100644
--- a/lib/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
@@ -56,6 +56,11 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_execution_space_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_execution_space_erroneously_given_to_execution_policy<void> {
+};
 struct ExecutionSpaceTrait : TraitSpecificationBase<ExecutionSpaceTrait> {
   struct base_traits {
     static constexpr auto execution_space_is_defaulted = true;
@@ -63,32 +68,30 @@ struct ExecutionSpaceTrait : TraitSpecificationBase<ExecutionSpaceTrait> {
     using execution_space = Kokkos::DefaultExecutionSpace;
   };
   template <class T>
-  using trait_matches_specification = is_execution_space<T>;
+  using trait_matches_specification = Kokkos::is_execution_space<T>;
+  template <class ExecSpace, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+
+    static constexpr auto show_execution_space_error_in_compilation_message =
+        show_extra_execution_space_erroneously_given_to_execution_policy<
+            std::conditional_t<base_t::execution_space_is_defaulted, void,
+                               typename base_t::execution_space>>{};
+    static_assert(base_t::execution_space_is_defaulted,
+                  "Kokkos Error: More than one execution space given. Search "
+                  "compiler output for 'show_extra_execution_space' to see the "
+                  "type of the errant tag.");
+
+    static constexpr auto execution_space_is_defaulted = false;
+
+    using execution_space = ExecSpace;
+  };
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-template <class ExecutionSpace, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value>,
-    ExecutionSpace, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-
-  static_assert(base_t::execution_space_is_defaulted,
-                "Kokkos Error: More than one execution space given");
-
-  static constexpr bool execution_space_is_defaulted = false;
-
-  using execution_space = ExecutionSpace;
-};
-
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
-//==============================================================================
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/lib/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
index eb649dc088..b57dfbbc07 100644
--- a/lib/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
@@ -61,6 +61,12 @@ struct GraphKernelTrait : TraitSpecificationBase<GraphKernelTrait> {
   struct base_traits {
     using is_graph_kernel = std::false_type;
   };
+  template <class, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using is_graph_kernel = std::true_type;
+  };
   template <class T>
   using trait_matches_specification = std::is_same<T, IsGraphKernelTag>;
 };
@@ -68,19 +74,6 @@ struct GraphKernelTrait : TraitSpecificationBase<GraphKernelTrait> {
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-template <class... Traits>
-struct AnalyzeExecPolicy<void, Impl::IsGraphKernelTag, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  using is_graph_kernel = std::true_type;
-};
-
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
-//==============================================================================
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/lib/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
index e15adc1711..63446375fb 100644
--- a/lib/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
@@ -46,54 +46,71 @@
 #define KOKKOS_KOKKOS_INDEXTYPETRAIT_HPP
 
 #include <Kokkos_Macros.hpp>
-#include <Kokkos_Concepts.hpp>  // IndexType, is_index_type
+#include <Kokkos_Concepts.hpp>  // IndexType
 #include <traits/Kokkos_PolicyTraitAdaptor.hpp>
 #include <traits/Kokkos_Traits_fwd.hpp>
 
 namespace Kokkos {
 namespace Impl {
 
+template <class Trait, class AnalyzeNextTrait>
+struct IndexTypePolicyMixin;
+
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_index_type_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_index_type_erroneously_given_to_execution_policy<void> {};
 struct IndexTypeTrait : TraitSpecificationBase<IndexTypeTrait> {
   struct base_traits {
     static constexpr bool index_type_is_defaulted = true;
     using index_type = dependent_policy_trait_default;
   };
-  template <class T>
-  using trait_matches_specification =
-      std::integral_constant<bool, std::is_integral<T>::value ||
-                                       is_index_type<T>::value>;
+  template <class IdxType, class AnalyzeNextTrait>
+  using mixin_matching_trait = IndexTypePolicyMixin<IdxType, AnalyzeNextTrait>;
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="IndexTypePolicyMixin specializations"> {{{1
 
 // Index type given as IndexType template
-template <class IntegralIndexType, class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::IndexType<IntegralIndexType>, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
+template <class IntegralIndexType, class AnalyzeNextTrait>
+struct IndexTypePolicyMixin<Kokkos::IndexType<IntegralIndexType>,
+                            AnalyzeNextTrait> : AnalyzeNextTrait {
+  using base_t = AnalyzeNextTrait;
   using base_t::base_t;
+  static constexpr auto show_index_type_error_in_compilation_message =
+      show_extra_index_type_erroneously_given_to_execution_policy<
+          std::conditional_t<base_t::index_type_is_defaulted, void,
+                             typename base_t::schedule_type>>{};
   static_assert(base_t::index_type_is_defaulted,
-                "Kokkos Error: More than one index type given");
+                "Kokkos Error: More than one index type given. Search "
+                "compiler output for 'show_extra_index_type' to see the "
+                "type of the errant tag.");
   static constexpr bool index_type_is_defaulted = false;
   using index_type = Kokkos::IndexType<IntegralIndexType>;
 };
 
-// IndexType given as an integral type directly
-template <class IntegralIndexType, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<std::is_integral<IntegralIndexType>::value>,
-    IntegralIndexType, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
+// IndexType given as an integral type directly (the matcher already checks
+// this, so we don't have specialize to re-check it here)
+template <class IntegralIndexType, class AnalyzeNextTrait>
+struct IndexTypePolicyMixin : AnalyzeNextTrait {
+  using base_t = AnalyzeNextTrait;
   using base_t::base_t;
+  static constexpr auto show_index_type_error_in_compilation_message =
+      show_extra_index_type_erroneously_given_to_execution_policy<
+          std::conditional_t<base_t::index_type_is_defaulted, void,
+                             typename base_t::schedule_type>>{};
   static_assert(base_t::index_type_is_defaulted,
-                "Kokkos Error: More than one index type given");
+                "Kokkos Error: More than one index type given. Search "
+                "compiler output for 'show_extra_index_type' to see the "
+                "type of the errant tag.");
+  static_assert(std::is_integral<IntegralIndexType>::value, "");
   static constexpr bool index_type_is_defaulted = false;
   using index_type = Kokkos::IndexType<IntegralIndexType>;
 };
@@ -101,6 +118,22 @@ struct AnalyzeExecPolicy<
 // </editor-fold> end AnalyzeExecPolicy specializations }}}1
 //==============================================================================
 
+//==============================================================================
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
+
+template <class IntegralIndexType>
+struct PolicyTraitMatcher<IndexTypeTrait, IndexType<IntegralIndexType>>
+    : std::true_type {};
+
+template <class IntegralIndexType>
+struct PolicyTraitMatcher<
+    IndexTypeTrait, IntegralIndexType,
+    std::enable_if_t<std::is_integral<IntegralIndexType>::value>>
+    : std::true_type {};
+
+// </editor-fold> end PolicyTraitMatcher specialization"> }}}1
+//==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/lib/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
index 30e07039a4..b05f3b29e9 100644
--- a/lib/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
@@ -45,8 +45,11 @@
 #ifndef KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP
 #define KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP
 
-#include <Kokkos_Concepts.hpp>  // is_iteration_pattern
-#include <type_traits>          // is_void
+#include <Kokkos_Concepts.hpp>                   // is_iteration_pattern
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>  // TraitSpecificationBase
+#include <Kokkos_Rank.hpp>                       // Rank
+#include <Kokkos_Layout.hpp>                     // Iterate
+#include <type_traits>                           // is_void
 
 namespace Kokkos {
 namespace Impl {
@@ -54,32 +57,42 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_iteration_pattern_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_iteration_pattern_erroneously_given_to_execution_policy<
+    void> {};
 struct IterationPatternTrait : TraitSpecificationBase<IterationPatternTrait> {
   struct base_traits {
     using iteration_pattern = void;  // TODO set default iteration pattern
   };
-  template <class T>
-  using trait_matches_specification = is_iteration_pattern<T>;
+  template <class IterPattern, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    static constexpr auto show_iteration_pattern_error_in_compilation_message =
+        show_extra_iteration_pattern_erroneously_given_to_execution_policy<
+            typename base_t::iteration_pattern>{};
+    static_assert(
+        std::is_void<typename base_t::iteration_pattern>::value,
+        "Kokkos Error: More than one index type given. Search "
+        "compiler output for 'show_extra_iteration_pattern' to see the "
+        "type of the errant tag.");
+    using iteration_pattern = IterPattern;
+  };
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
 
-template <class IterationPattern, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<is_iteration_pattern<IterationPattern>::value>,
-    IterationPattern, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(std::is_void<typename base_t::iteration_pattern>::value,
-                "Kokkos Error: More than one iteration pattern given");
-  using iteration_pattern = IterationPattern;
-};
+template <unsigned N, Iterate OuterDir, Iterate InnerDir>
+struct PolicyTraitMatcher<IterationPatternTrait, Rank<N, OuterDir, InnerDir>>
+    : std::true_type {};
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end  }}}1
 //==============================================================================
 
 }  // end namespace Impl
diff --git a/lib/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
index 73ae8e27e2..06836bef8b 100644
--- a/lib/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
@@ -62,29 +62,33 @@ struct LaunchBoundsTrait : TraitSpecificationBase<LaunchBoundsTrait> {
 
     using launch_bounds = LaunchBounds<>;
   };
-  template <class T>
-  using trait_matches_specification = is_launch_bounds<T>;
+  template <class LaunchBoundParam, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+
+    static constexpr bool launch_bounds_is_defaulted = false;
+
+    static_assert(base_t::launch_bounds_is_defaulted,
+                  "Kokkos Error: More than one launch_bounds given");
+
+    using launch_bounds = LaunchBoundParam;
+  };
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
 
-template <unsigned int MaxT, unsigned int MinB, class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::LaunchBounds<MaxT, MinB>, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(base_t::launch_bounds_is_defaulted,
-                "Kokkos Error: More than one launch_bounds given");
-  static constexpr bool launch_bounds_is_defaulted = false;
-  using launch_bounds = Kokkos::LaunchBounds<MaxT, MinB>;
-};
+template <unsigned int maxT, unsigned int minB>
+struct PolicyTraitMatcher<LaunchBoundsTrait, LaunchBounds<maxT, minB>>
+    : std::true_type {};
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end PolicyTraitMatcher specialization }}}1
 //==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/lib/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
index 3deb4a94d5..73be14cf85 100644
--- a/lib/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
@@ -82,6 +82,9 @@ struct MaximizeOccupancy {
 
 namespace Impl {
 
+template <class Policy, class AnalyzeNextTrait>
+struct OccupancyControlPolicyMixin;
+
 //==============================================================================
 // <editor-fold desc="Occupancy control trait specification"> {{{1
 
@@ -94,6 +97,9 @@ struct OccupancyControlTrait : TraitSpecificationBase<OccupancyControlTrait> {
       return occupancy_control{};
     }
   };
+  template <class OccControl, class AnalyzeNextTrait>
+  using mixin_matching_trait =
+      OccupancyControlPolicyMixin<OccControl, AnalyzeNextTrait>;
   template <class T>
   using trait_matches_specification = std::integral_constant<
       bool,
@@ -105,39 +111,33 @@ struct OccupancyControlTrait : TraitSpecificationBase<OccupancyControlTrait> {
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="OccupancyControlPolicyMixin specializations"> {{{1
 
-// The DesiredOccupancy case has runtime storage, so we need to handle copies
-// and assignments
-template <class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::Experimental::DesiredOccupancy,
-                         Traits...> : AnalyzeExecPolicy<void, Traits...> {
- public:
-  using base_t            = AnalyzeExecPolicy<void, Traits...>;
+template <class AnalyzeNextTrait>
+struct OccupancyControlPolicyMixin<Kokkos::Experimental::DesiredOccupancy,
+                                   AnalyzeNextTrait> : AnalyzeNextTrait {
+  using base_t            = AnalyzeNextTrait;
   using occupancy_control = Kokkos::Experimental::DesiredOccupancy;
   static constexpr bool experimental_contains_desired_occupancy = true;
 
-  template <class OccControl>
-  using with_occupancy_control = AnalyzeExecPolicy<void, OccControl, Traits...>;
-
   // Treat this as private, but make it public so that MSVC will still treat
   // this as a standard layout class and make it the right size: storage for a
   // stateful desired occupancy
   //   private:
-  occupancy_control m_desired_occupancy;
+  occupancy_control m_desired_occupancy = occupancy_control{};
 
-  AnalyzeExecPolicy() = default;
+  OccupancyControlPolicyMixin() = default;
   // Converting constructor
   // Just rely on the convertibility of occupancy_control to transfer the data
   template <class Other>
-  AnalyzeExecPolicy(ExecPolicyTraitsWithDefaults<Other> const& other)
+  OccupancyControlPolicyMixin(ExecPolicyTraitsWithDefaults<Other> const& other)
       : base_t(other),
         m_desired_occupancy(other.impl_get_occupancy_control()) {}
 
   // Converting assignment operator
   // Just rely on the convertibility of occupancy_control to transfer the data
   template <class Other>
-  AnalyzeExecPolicy& operator=(
+  OccupancyControlPolicyMixin& operator=(
       ExecPolicyTraitsWithDefaults<Other> const& other) {
     *static_cast<base_t*>(this) = other;
     this->impl_set_desired_occupancy(
@@ -160,16 +160,16 @@ struct AnalyzeExecPolicy<void, Kokkos::Experimental::DesiredOccupancy,
   }
 };
 
-template <class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::Experimental::MaximizeOccupancy,
-                         Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
+template <class AnalyzeNextTrait>
+struct OccupancyControlPolicyMixin<Kokkos::Experimental::MaximizeOccupancy,
+                                   AnalyzeNextTrait> : AnalyzeNextTrait {
+  using base_t = AnalyzeNextTrait;
   using base_t::base_t;
   using occupancy_control = Kokkos::Experimental::MaximizeOccupancy;
   static constexpr bool experimental_contains_desired_occupancy = false;
 };
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end OccupancyControlPolicyMixin specializations }}}1
 //==============================================================================
 
 }  // end namespace Impl
diff --git a/lib/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/lib/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
index b087dac855..e500dd4e83 100644
--- a/lib/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
@@ -73,7 +73,7 @@ namespace Impl {
 // something that we can default to in the unspecialized case, just like we
 // do for AnalyzeExecPolicy
 template <class TraitSpec, class Trait, class Enable = void>
-struct PolicyTraitMatcher;
+struct PolicyTraitMatcher : std::false_type {};
 
 template <class TraitSpec, class Trait>
 struct PolicyTraitMatcher<
diff --git a/lib/kokkos/core/src/traits/Kokkos_PolicyTraitMatcher.hpp b/lib/kokkos/core/src/traits/Kokkos_PolicyTraitMatcher.hpp
new file mode 100644
index 0000000000..31927320bf
--- /dev/null
+++ b/lib/kokkos/core/src/traits/Kokkos_PolicyTraitMatcher.hpp
@@ -0,0 +1,77 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <impl/Kokkos_Utilities.hpp>  // type_list
+
+#include <traits/Kokkos_Traits_fwd.hpp>
+
+#ifndef KOKKOS_KOKKOS_POLICYTRAITMATCHER_HPP
+#define KOKKOS_KOKKOS_POLICYTRAITMATCHER_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+//==============================================================================
+// <editor-fold desc="PolicyTraitMatcher"> {{{1
+
+// To handle the WorkTag case, we need more than just a predicate; we need
+// something that we can default to in the unspecialized case, just like we
+// do for AnalyzeExecPolicy
+template <class TraitSpec, class Trait, class Enable = void>
+struct PolicyTraitMatcher : std::false_type {};
+
+template <class TraitSpec, class Trait>
+struct PolicyTraitMatcher<
+    TraitSpec, Trait,
+    std::enable_if_t<
+        TraitSpec::template trait_matches_specification<Trait>::value>>
+    : std::true_type {};
+
+// </editor-fold> end PolicyTraitMatcher }}}1
+//==============================================================================
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_POLICYTRAITMATCHER_HPP
diff --git a/lib/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
index 74bab6fce2..3e578f9060 100644
--- a/lib/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
@@ -57,34 +57,43 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_schedule_type_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_schedule_type_erroneously_given_to_execution_policy<void> {};
 struct ScheduleTrait : TraitSpecificationBase<ScheduleTrait> {
   struct base_traits {
     static constexpr auto schedule_type_is_defaulted = true;
 
     using schedule_type = Schedule<Static>;
   };
-  template <class T>
-  using trait_matches_specification = is_schedule_type<T>;
+  template <class Sched, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using schedule_type = Sched;
+    static constexpr auto show_schedule_type_error_in_compilation_message =
+        show_extra_schedule_type_erroneously_given_to_execution_policy<
+            std::conditional_t<base_t::schedule_type_is_defaulted, void,
+                               typename base_t::schedule_type>>{};
+    static_assert(base_t::schedule_type_is_defaulted,
+                  "Kokkos Error: More than one schedule type given. Search "
+                  "compiler output for 'show_extra_schedule_type' to see the "
+                  "type of the errant tag.");
+    static constexpr bool schedule_type_is_defaulted = false;
+  };
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
 
-template <class ScheduleType, class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::Schedule<ScheduleType>, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(base_t::schedule_type_is_defaulted,
-                "Kokkos Error: More than one schedule type given");
-  static constexpr bool schedule_type_is_defaulted = false;
-  using schedule_type = Kokkos::Schedule<ScheduleType>;
-};
+template <class Sched>
+struct PolicyTraitMatcher<ScheduleTrait, Schedule<Sched>> : std::true_type {};
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end PolicyTraitMatcher specialization }}}1
 //==============================================================================
 
 }  // end namespace Impl
diff --git a/lib/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp b/lib/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
index b8b9a0ca2d..b8289ca618 100644
--- a/lib/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
@@ -51,9 +51,15 @@ namespace Impl {
 template <class Enable, class... TraitsList>
 struct AnalyzeExecPolicy;
 
+template <class Enable, class TraitSpecList, class... Traits>
+struct AnalyzeExecPolicyUseMatcher;
+
 template <class AnalysisResults>
 struct ExecPolicyTraitsWithDefaults;
 
+template <class TraitSpec, class Trait, class Enable>
+struct PolicyTraitMatcher;
+
 template <class TraitSpec, template <class...> class PolicyTemplate,
           class AlreadyProcessedList, class ToProcessList, class NewTrait,
           class Enable = void>
@@ -67,6 +73,40 @@ struct PolicyTraitAdaptor;
 // traits
 struct dependent_policy_trait_default;
 
+//==============================================================================
+// <editor-fold desc="Execution policy trait specifications"> {{{1
+
+struct ExecutionSpaceTrait;
+struct IndexTypeTrait;
+struct ScheduleTrait;
+struct IterationPatternTrait;
+struct WorkItemPropertyTrait;
+struct LaunchBoundsTrait;
+struct OccupancyControlTrait;
+struct GraphKernelTrait;
+struct WorkTagTrait;
+
+// Keep these sorted by frequency of use to reduce compilation time
+//
+// clang-format off
+using execution_policy_trait_specifications =
+  type_list<
+    ExecutionSpaceTrait,
+    IndexTypeTrait,
+    ScheduleTrait,
+    IterationPatternTrait,
+    WorkItemPropertyTrait,
+    LaunchBoundsTrait,
+    OccupancyControlTrait,
+    GraphKernelTrait,
+    // This one has to be last, unfortunately:
+    WorkTagTrait
+  >;
+// clang-format on
+
+// </editor-fold> end Execution policy trait specifications }}}1
+//==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/lib/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
index 2656316fb9..35671d19b0 100644
--- a/lib/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
@@ -60,6 +60,12 @@ struct WorkItemPropertyTrait : TraitSpecificationBase<WorkItemPropertyTrait> {
   struct base_traits {
     using work_item_property = Kokkos::Experimental::WorkItemProperty::None_t;
   };
+  template <class WorkItemProp, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using work_item_property = WorkItemProp;
+  };
   template <class T>
   using trait_matches_specification =
       Kokkos::Experimental::is_work_item_property<T>;
@@ -68,26 +74,6 @@ struct WorkItemPropertyTrait : TraitSpecificationBase<WorkItemPropertyTrait> {
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-template <class Property, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<
-        Kokkos::Experimental::is_work_item_property<Property>::value>,
-    Property, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(
-      std::is_same<typename base_t::work_item_property,
-                   Kokkos::Experimental::WorkItemProperty::None_t>::value,
-      "Kokkos Error: More than one work item property given");
-  using work_item_property = Property;
-};
-
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
-//==============================================================================
-
 }  // end namespace Impl
 
 namespace Experimental {
diff --git a/lib/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp b/lib/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
index 877005756a..424e5c405b 100644
--- a/lib/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
+++ b/lib/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
@@ -49,6 +49,7 @@
 #include <Kokkos_Concepts.hpp>  // is_execution_space
 #include <traits/Kokkos_PolicyTraitAdaptor.hpp>
 #include <traits/Kokkos_Traits_fwd.hpp>
+#include <impl/Kokkos_Utilities.hpp>  // type_list_any, type_list_remove_first
 
 namespace Kokkos {
 namespace Impl {
@@ -56,68 +57,65 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_work_tag_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_work_tag_erroneously_given_to_execution_policy<void> {};
+
+using _exec_policy_traits_without_work_tag = typename type_list_remove_first<
+    WorkTagTrait, execution_policy_trait_specifications>::type;
+
+template <class Trait>
+struct _trait_matches_spec_predicate {
+  template <class TraitSpec>
+  struct apply {
+    using type = typename PolicyTraitMatcher<TraitSpec, Trait>::type;
+    static constexpr bool value = type::value;
+  };
+};
+
 struct WorkTagTrait : TraitSpecificationBase<WorkTagTrait> {
   struct base_traits {
     using work_tag = void;
   };
+  template <class WorkTag, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using work_tag = WorkTag;
+    static constexpr auto show_work_tag_error_in_compilation_message =
+        show_extra_work_tag_erroneously_given_to_execution_policy<
+            typename base_t::work_tag>{};
+    static_assert(
+        std::is_void<typename base_t::work_tag>::value,
+        "Kokkos Error: More than one work tag given. Search compiler output "
+        "for 'show_extra_work_tag' to see the type of the errant tag.");
+  };
+  // Since we don't have subsumption in pre-C++20, we need to have the work tag
+  // "trait" handling code ensure that none of the other conditions are met.
+  // * Compile time cost complexity note: at first glance it looks like this
+  //   "rechecks" all of the other trait specs when used in the context of the
+  //   full list of execution policy traits, but actually since we've already
+  //   checked all of them to get to the end of the list, the compiler will
+  //   have already generated those definitions, so there should be little extra
+  //   cost to this. However, in the scenario where we use work tag in isolation
+  //   (like if we were to add a `require()`-like thing that changes the work
+  //   tag of an existing execution policy instance), we need to check all of
+  //   the other traits to make sure that we're not replacing something else,
+  //   given that the concept of a work tag is basically unconstrained and could
+  //   be anything.  This should still be as efficient at compile time as the
+  //   old code that just did a big long series of nested std::conditionals, but
+  //   we should benchmark this assumption if it becomes a problem.
+  template <class T>
+  using trait_matches_specification = std::integral_constant<
+      bool, !std::is_void<T>::value &&
+                !type_list_any<_trait_matches_spec_predicate<T>::template apply,
+                               _exec_policy_traits_without_work_tag>::value>;
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-// Since we don't have subsumption in pre-C++20, we need to have the work tag
-// "trait" handling code be unspecialized, so we handle it instead in a class
-// with a different name.
-template <class... Traits>
-struct AnalyzeExecPolicyHandleWorkTag : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-};
-
-template <class WorkTag, class... Traits>
-struct AnalyzeExecPolicyHandleWorkTag<WorkTag, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(std::is_void<typename base_t::work_tag>::value,
-                "Kokkos Error: More than one work tag given");
-  using work_tag = WorkTag;
-};
-
-// This only works if this is not a partial specialization, so we have to
-// do the partial specialization elsewhere
-template <class Enable, class... Traits>
-struct AnalyzeExecPolicy : AnalyzeExecPolicyHandleWorkTag<Traits...> {
-  using base_t = AnalyzeExecPolicyHandleWorkTag<Traits...>;
-  using base_t::base_t;
-};
-
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
-//==============================================================================
-
-//==============================================================================
-// <editor-fold desc="PolicyTraitMatcher specializations"> {{{1
-
-// In order to match the work tag trait the work tag "matcher" needs to be
-// unspecialized and the logic needs to be handled in a differently-named class,
-// just like above.
-template <class TraitSpec, class Trait>
-struct PolicyTraitMatcherHandleWorkTag : std::false_type {};
-
-template <class Trait>
-struct PolicyTraitMatcherHandleWorkTag<WorkTagTrait, Trait>
-    : std::integral_constant<bool, !std::is_void<Trait>::value> {};
-
-template <class TraitSpec, class Trait, class Enable>
-struct PolicyTraitMatcher /* unspecialized! */
-    : PolicyTraitMatcherHandleWorkTag<TraitSpec, Trait> {};
-
-// </editor-fold> end PolicyTraitMatcher specializations }}}1
-//==============================================================================
-
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt
index 5826208851..89b8ff1e4f 100644
--- a/lib/kokkos/core/unit_test/CMakeLists.txt
+++ b/lib/kokkos/core/unit_test/CMakeLists.txt
@@ -41,10 +41,10 @@ SET(KOKKOS_OPENMP_FEATURE_LEVEL 999)
 SET(KOKKOS_OPENMP_NAME OpenMP)
 
 # FIXME_OPENMPTARGET - The NVIDIA HPC compiler nvc++ only compiles the first 8 incremental tests for the OpenMPTarget backend.
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
-  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 8)
+IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 10)
 ELSE()
-  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 13)
+  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 14)
 ENDIF()
 
 SET(KOKKOS_OPENMPTARGET_NAME Experimental::OpenMPTarget)
@@ -65,6 +65,21 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
 
+SET(COMPILE_ONLY_SOURCES
+  TestDetectionIdiom.cpp
+  TestInterOp.cpp
+  TestTypeList.cpp
+)
+# TestInterOp has a dependency on containers
+IF(KOKKOS_HAS_TRILINOS)
+  LIST(REMOVE_ITEM COMPILE_ONLY_SOURCES TestInterOp.cpp)
+ENDIF()
+KOKKOS_ADD_EXECUTABLE(
+  TestCompileOnly
+  SOURCES
+  ${COMPILE_ONLY_SOURCES}
+)
+
 foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
   # Because there is always an exception to the rule
   if(Tag STREQUAL "Threads")
@@ -98,6 +113,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
         Complex
         Crs
         DeepCopyAlignment
+        ExecutionSpace
         FunctorAnalysis
         Init
         LocalDeepCopy
@@ -107,6 +123,9 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
         MDRange_c
         HostSharedPtr
         HostSharedPtrAccessOnDevice
+        QuadPrecisionMath
+        ExecSpacePartitioning
+        MathematicalSpecialFunctions
         )
       set(file ${dir}/Test${Tag}_${Name}.cpp)
       # Write to a temporary intermediate file and call configure_file to avoid
@@ -190,7 +209,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
     elseif(Tag STREQUAL "HIP")
       set(TagHostAccessible HIPHostPinned)
     elseif(Tag STREQUAL "SYCL")
-      set(TagHostAccessible SYCLSharedUSMSpace)
+      set(TagHostAccessible SYCLSharedUSM)
     endif()
 
     set(${Tag}_SOURCES2B)
@@ -257,6 +276,43 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
   endif()
 endforeach()
 
+foreach(PairDeviceSpace HIP-HostPinned;Cuda-HostPinned;Cuda-UVM;SYCL-HostUSM;SYCL-SharedUSM)
+  string(REGEX REPLACE "([^-]*)-(.*)" "\\1" DEVICE ${PairDeviceSpace})
+  string(REGEX REPLACE "([^-]*)-(.*)" "\\2" SPACE ${PairDeviceSpace})
+
+  string(TOUPPER ${DEVICE} UPPER_DEVICE)
+  string(TOLOWER ${DEVICE} dir)
+
+  if(Kokkos_ENABLE_${UPPER_DEVICE})
+    set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir})
+    file(MAKE_DIRECTORY ${dir})
+    foreach(Name
+      SharedAlloc
+      ViewAPI_a
+      ViewAPI_b
+      ViewAPI_c
+      ViewAPI_d
+      ViewAPI_e
+      ViewCopy_a
+      ViewCopy_b
+      ViewMapping_a
+      ViewMapping_b
+      ViewMapping_subview
+      )
+      set(file ${dir}/Test${DEVICE}${SPACE}_${Name}.cpp)
+      # Write to a temporary intermediate file and call configure_file to avoid
+      # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
+      file(WRITE ${dir}/dummy.cpp
+          "#include <Test${DEVICE}${SPACE}_Category.hpp>\n"
+          "#include <Test${Name}.hpp>\n"
+      )
+      configure_file(${dir}/dummy.cpp ${file})
+      list(APPEND ${DEVICE}_SOURCES3 ${file})
+    endforeach()
+    list(APPEND ${DEVICE}_SOURCES ${${DEVICE}_SOURCES3})
+  endif()
+endforeach()
+
 if(Kokkos_ENABLE_OPENMPTARGET)
   list(REMOVE_ITEM OpenMPTarget_SOURCES
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_complexfloat.cpp
@@ -264,9 +320,7 @@ if(Kokkos_ENABLE_OPENMPTARGET)
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Crs.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewCopy_a.cpp
@@ -278,9 +332,16 @@ if(Kokkos_ENABLE_OPENMPTARGET)
 endif()
 
 # FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++
-IF(KOKKOS_ENABLE_OPENMPTARGET
-   AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   list(REMOVE_ITEM OpenMPTarget_SOURCES
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce_dynamic.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce_dynamic_view.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_double_reduce.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_double_reduce_dynamic.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtr.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtrAccessOnDevice.cpp
@@ -370,14 +431,19 @@ if(Kokkos_ENABLE_PTHREAD)
   )
 endif()
 
-if(Kokkos_ENABLE_OPENMP)
+if (Kokkos_ENABLE_OPENMP)
+  set(OpenMP_EXTRA_SOURCES
+    openmp/TestOpenMP_Task.cpp
+  )
+  if (Kokkos_ENABLE_DEPRECATED_CODE_3)
+    list(APPEND OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp)
+  endif ()
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_OpenMP
     SOURCES
     UnitTestMainInit.cpp
     ${OpenMP_SOURCES}
-    openmp/TestOpenMP_PartitionMaster.cpp
-    openmp/TestOpenMP_Task.cpp
+    ${OpenMP_EXTRA_SOURCES}
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_OpenMPInterOp
@@ -463,28 +529,7 @@ if(Kokkos_ENABLE_CUDA)
       UnitTestMainInit.cpp
       cuda/TestCuda_Task.cpp
       cuda/TestCuda_TeamScratchStreams.cpp
-      cuda/TestCudaHostPinned_SharedAlloc.cpp
-      cuda/TestCudaHostPinned_ViewAPI_a.cpp
-      cuda/TestCudaHostPinned_ViewAPI_b.cpp
-      cuda/TestCudaHostPinned_ViewAPI_c.cpp
-      cuda/TestCudaHostPinned_ViewAPI_d.cpp
-      cuda/TestCudaHostPinned_ViewAPI_e.cpp
-      cuda/TestCudaHostPinned_ViewCopy_a.cpp
-      cuda/TestCudaHostPinned_ViewCopy_b.cpp
-      cuda/TestCudaHostPinned_ViewMapping_a.cpp
-      cuda/TestCudaHostPinned_ViewMapping_b.cpp
-      cuda/TestCudaHostPinned_ViewMapping_subview.cpp
-      cuda/TestCudaUVM_SharedAlloc.cpp
-      cuda/TestCudaUVM_ViewAPI_a.cpp
-      cuda/TestCudaUVM_ViewAPI_b.cpp
-      cuda/TestCudaUVM_ViewAPI_c.cpp
-      cuda/TestCudaUVM_ViewAPI_d.cpp
-      cuda/TestCudaUVM_ViewAPI_e.cpp
-      cuda/TestCudaUVM_ViewCopy_a.cpp
-      cuda/TestCudaUVM_ViewCopy_b.cpp
-      cuda/TestCudaUVM_ViewMapping_a.cpp
-      cuda/TestCudaUVM_ViewMapping_b.cpp
-      cuda/TestCudaUVM_ViewMapping_subview.cpp
+      ${Cuda_SOURCES3}
       cuda/TestCuda_Spaces.cpp
   )
 
@@ -524,17 +569,8 @@ if(Kokkos_ENABLE_HIP)
       ${HIP_SOURCES}
       hip/TestHIP_ScanUnit.cpp
       hip/TestHIP_TeamScratchStreams.cpp
-      hip/TestHIPHostPinned_ViewAPI_a.cpp
-      hip/TestHIPHostPinned_ViewAPI_b.cpp
-      hip/TestHIPHostPinned_ViewAPI_c.cpp
-      hip/TestHIPHostPinned_ViewAPI_d.cpp
-      hip/TestHIPHostPinned_ViewAPI_e.cpp
-      hip/TestHIPHostPinned_ViewCopy_a.cpp
-      hip/TestHIPHostPinned_ViewCopy_b.cpp
-      hip/TestHIPHostPinned_ViewMapping_a.cpp
-      hip/TestHIPHostPinned_ViewMapping_b.cpp
-      hip/TestHIPHostPinned_ViewMapping_subview.cpp
       hip/TestHIP_AsyncLauncher.cpp
+      hip/TestHIP_BlocksizeDeduction.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_HIPInterOpInit
@@ -595,13 +631,25 @@ if(Kokkos_ENABLE_SYCL)
       ${SYCL_SOURCES2C}
   )
 
- KOKKOS_ADD_EXECUTABLE_AND_TEST(
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_SYCL2D
     SOURCES
       UnitTestMainInit.cpp
       ${SYCL_SOURCES2D}
   )
- KOKKOS_ADD_EXECUTABLE_AND_TEST(
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCL3
+    SOURCES
+      UnitTestMainInit.cpp
+      # FIXME_SYCL
+      sycl/TestSYCL_Task.cpp
+      sycl/TestSYCL_TeamScratchStreams.cpp
+      ${SYCL_SOURCES3}
+      sycl/TestSYCL_Spaces.cpp
+  )
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_SYCLInterOpInit
     SOURCES
       UnitTestMain.cpp
@@ -622,8 +670,7 @@ if(Kokkos_ENABLE_SYCL)
 endif()
 
 # FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++
-if (KOKKOS_ENABLE_OPENMPTARGET
-    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+if (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   SET(DEFAULT_DEVICE_SOURCES
     UnitTestMainInit.cpp
     default/TestDefaultDeviceType.cpp
@@ -685,11 +732,21 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
 )
 
   if(KOKKOS_ENABLE_TUNING)
+    KOKKOS_ADD_EXECUTABLE_AND_TEST(
+      UnitTest_TuningBuiltins
+      SOURCES
+      tools/TestBuiltinTuners.cpp
+    )
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
       UnitTest_TuningBasics
       SOURCES
         tools/TestTuning.cpp
     )
+    KOKKOS_ADD_EXECUTABLE_AND_TEST(
+      UnitTest_CategoricalTuner
+      SOURCES
+      tools/TestCategoricalTuner.cpp
+    )
   endif()
   if(NOT Kokkos_ENABLE_OPENMPTARGET)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
@@ -698,6 +755,11 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       tools/TestLogicalSpaces.cpp
   )
   endif()
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_EventCorrectness
+    SOURCES
+    tools/TestEventCorrectness.cpp
+  )
   if(KOKKOS_ENABLE_LIBDL)
 
     KOKKOS_ADD_TEST_LIBRARY(
@@ -745,7 +807,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       EXE  ProfilingAllCalls
       TOOL kokkosprinter-tool
       ARGS --kokkos-tools-args="-c test delimit"
-      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
+      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
     )
 
     # Above will test that leading/trailing quotes are stripped bc ctest cmd args is:
@@ -762,7 +824,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       EXE  ProfilingAllCalls
       ARGS [=[--kokkos-tools-args=-c test delimit]=]
             --kokkos-tools-library=$<TARGET_FILE:kokkosprinter-tool>
-      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
+      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
     )
   endif() #KOKKOS_ENABLE_LIBDL
 if(NOT KOKKOS_HAS_TRILINOS)
diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile
index 390fc79a47..4226282214 100644
--- a/lib/kokkos/core/unit_test/Makefile
+++ b/lib/kokkos/core/unit_test/Makefile
@@ -73,6 +73,8 @@ tmp := $(foreach device, $(KOKKOS_DEVICELIST), \
   ) \
 )
 
+GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewMapping_a ViewMapping_b ViewMapping_subview
+
 SUBVIEW_TESTS = SubView_a SubView_b SubView_c01 SubView_c02 SubView_c03 SubView_c04 SubView_c05 SubView_c06 SubView_c07 SubView_c08 SubView_c09 SubView_c10 SubView_c11 SubView_c12 SubView_c13
 
 KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST))
@@ -94,6 +96,16 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
       )\
     )
 
+    GPU_SPACES = CudaHostPinned CudaUVM
+    tmp := $(foreach space, $(GPU_SPACES), \
+      tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \
+        $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\
+          $(shell echo "\#include <Test$(space)_Category.hpp>" > Test$(space)_$(test).cpp); \
+          $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(space)_$(test).cpp); \
+        )\
+      )\
+    )
+
     OBJ_CUDA = UnitTestMainInit.o gtest-all.o
     OBJ_CUDA += TestCuda_Init.o
     OBJ_CUDA += TestCuda_SharedAlloc.o TestCudaUVM_SharedAlloc.o TestCudaHostPinned_SharedAlloc.o
@@ -261,6 +273,16 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+	GPU_SPACES = HIPHostPinned
+	tmp := $(foreach space, $(GPU_SPACES), \
+	  tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \
+	    $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\
+	      $(shell echo "\#include <Test$(space)_Category.hpp>" > Test$(space)_$(test).cpp); \
+	      $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(space)_$(test).cpp); \
+	    )\
+	  )\
+	)
+
 	OBJ_HIP = UnitTestMainInit.o gtest-all.o
 	OBJ_HIP += TestHIP_Init.o
 	OBJ_HIP += TestHIP_Reducers_a.o TestHIP_Reducers_b.o TestHIP_Reducers_c.o TestHIP_Reducers_d.o
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
index 04362125c0..257ad2e9e5 100644
--- a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
@@ -81,6 +81,56 @@ struct InitFunctor {
   InitFunctor(T _init_value) : init_value(_init_value) {}
 };
 
+//---------------------------------------------------
+//--------------atomic_load/store/assign---------------------
+//---------------------------------------------------
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+template <class T, class DEVICE_TYPE>
+struct LoadStoreFunctor {
+  using execution_space = DEVICE_TYPE;
+  using type            = Kokkos::View<T, execution_space>;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    T old = Kokkos::atomic_load(&data());
+    if (old != i0)
+      Kokkos::abort("Kokkos Atomic Load didn't get the right value");
+    Kokkos::atomic_store(&data(), i1);
+    Kokkos::atomic_assign(&data(), old);
+  }
+  LoadStoreFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+};
+#endif
+
+template <class T, class DeviceType>
+bool LoadStoreAtomicTest(T i0, T i1) {
+  using execution_space = typename DeviceType::execution_space;
+  struct InitFunctor<T, execution_space> f_init(i0);
+  typename InitFunctor<T, execution_space>::type data("Data");
+  typename InitFunctor<T, execution_space>::h_type h_data("HData");
+
+  f_init.data = data;
+  Kokkos::parallel_for(1, f_init);
+  execution_space().fence();
+
+#ifdef KOKKOS_ENABLE_DESUL_ATOMICS
+  struct LoadStoreFunctor<T, execution_space> f(i0, i1);
+
+  f.data = data;
+  Kokkos::parallel_for(1, f);
+#else
+  h_data() = i1;
+#endif
+
+  Kokkos::deep_copy(h_data, data);
+
+  return h_data() == i0;
+}
+
 //---------------------------------------------------
 //--------------atomic_fetch_max---------------------
 //---------------------------------------------------
@@ -594,7 +644,10 @@ struct AndFunctor {
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_and(&data(), (T)i1); }
+  void operator()(int) const {
+    T result = Kokkos::atomic_fetch_and(&data(), (T)i1);
+    Kokkos::atomic_and(&data(), result);
+  }
 
   AndFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
 };
@@ -665,7 +718,10 @@ struct OrFunctor {
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_or(&data(), (T)i1); }
+  void operator()(int) const {
+    T result = Kokkos::atomic_fetch_or(&data(), (T)i1);
+    Kokkos::atomic_or(&data(), result);
+  }
 
   OrFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
 };
@@ -954,6 +1010,7 @@ bool AtomicOperationsTestIntegralType(int i0, int i1, int test) {
     case 10: return RShiftAtomicTest<T, DeviceType>((T)i0, (T)i1);
     case 11: return IncAtomicTest<T, DeviceType>((T)i0);
     case 12: return DecAtomicTest<T, DeviceType>((T)i0);
+    case 13: return LoadStoreAtomicTest<T, DeviceType>((T)i0, (T)i1);
   }
 
   return 0;
@@ -966,6 +1023,7 @@ bool AtomicOperationsTestNonIntegralType(int i0, int i1, int test) {
     case 2: return MinAtomicTest<T, DeviceType>((T)i0, (T)i1);
     case 3: return MulAtomicTest<T, DeviceType>((T)i0, (T)i1);
     case 4: return DivAtomicTest<T, DeviceType>((T)i0, (T)i1);
+    case 5: return LoadStoreAtomicTest<T, DeviceType>((T)i0, (T)i1);
   }
 
   return 0;
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations_double.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations_double.hpp
index ba9937e1c6..303f5b6eb9 100644
--- a/lib/kokkos/core/unit_test/TestAtomicOperations_double.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations_double.hpp
@@ -57,6 +57,8 @@ TEST(TEST_CATEGORY, atomic_operations_double) {
                  double, TEST_EXECSPACE>(start, end - i, 3)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
                  double, TEST_EXECSPACE>(start, end - i, 4)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
+                 double, TEST_EXECSPACE>(start, end - i, 5)));
   }
 }
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations_float.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations_float.hpp
index aa56b5ff10..d3d4916b4e 100644
--- a/lib/kokkos/core/unit_test/TestAtomicOperations_float.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations_float.hpp
@@ -57,6 +57,8 @@ TEST(TEST_CATEGORY, atomic_operations_float) {
                  float, TEST_EXECSPACE>(start, end - i, 3)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
                  float, TEST_EXECSPACE>(start, end - i, 4)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
+                 float, TEST_EXECSPACE>(start, end - i, 5)));
   }
 }
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations_int.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations_int.hpp
index f828be6223..e5f2f334fc 100644
--- a/lib/kokkos/core/unit_test/TestAtomicOperations_int.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations_int.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_int) {
                  int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations_longint.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
index eee44c9571..d4fda70e80 100644
--- a/lib/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_long) {
                  long int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  long int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 long int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
index 73d4a61d72..b7fb0cdae5 100644
--- a/lib/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_longlong) {
                  long long int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  long long int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 long long int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
index 02f337c57c..c3c6bc9fb3 100644
--- a/lib/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_unsigned) {
                  unsigned int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  unsigned int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 unsigned int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
index f4340475f5..f3be4bedb7 100644
--- a/lib/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_unsignedlong) {
                  unsigned long int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  unsigned long int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 unsigned long int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestAtomicViews.hpp b/lib/kokkos/core/unit_test/TestAtomicViews.hpp
index b615b407f3..e029ad81f5 100644
--- a/lib/kokkos/core/unit_test/TestAtomicViews.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicViews.hpp
@@ -245,11 +245,11 @@ class TestAtomicViewAPI {
     ASSERT_EQ(ax.use_count(), size_t(4));
     ASSERT_EQ(const_ax.use_count(), ax.use_count());
 
-    ASSERT_FALSE(ax.data() == nullptr);
-    ASSERT_FALSE(const_ax.data() == nullptr);  // referenceable ptr
-    ASSERT_FALSE(unmanaged_ax.data() == nullptr);
-    ASSERT_FALSE(unmanaged_ax_from_ptr_dx.data() == nullptr);
-    ASSERT_FALSE(ay.data() == nullptr);
+    ASSERT_NE(ax.data(), nullptr);
+    ASSERT_NE(const_ax.data(), nullptr);  // referenceable ptr
+    ASSERT_NE(unmanaged_ax.data(), nullptr);
+    ASSERT_NE(unmanaged_ax_from_ptr_dx.data(), nullptr);
+    ASSERT_NE(ay.data(), nullptr);
     //    ASSERT_NE( ax, ay );
     //    Above test results in following runtime error from gtest:
     //    Expected: (ax) != (ay), actual: 32-byte object <30-01 D0-A0 D8-7F
@@ -278,7 +278,7 @@ class TestAtomicViewAPI {
                          Kokkos::MemoryTraits<Kokkos::Atomic> >& arg_const,
       const Kokkos::View<const DataType, device,
                          Kokkos::MemoryTraits<Kokkos::Atomic> >& arg) {
-    ASSERT_TRUE(arg_const == arg);
+    ASSERT_EQ(arg_const, arg);
   }
 
   static void run_test_const() {
@@ -290,8 +290,8 @@ class TestAtomicViewAPI {
     typeX x("X");
     const_typeX xc = x;
 
-    // ASSERT_TRUE( xc == x ); // const xc is referenceable, non-const x is not
-    // ASSERT_TRUE( x == xc );
+    // ASSERT_EQ( xc ,  x ); // const xc is referenceable, non-const x is not
+    // ASSERT_EQ( x ,  xc );
 
     check_auto_conversion_to_const(x, xc);
   }
diff --git a/lib/kokkos/core/unit_test/TestAtomics.hpp b/lib/kokkos/core/unit_test/TestAtomics.hpp
index e41ad5257d..f2993914a1 100644
--- a/lib/kokkos/core/unit_test/TestAtomics.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomics.hpp
@@ -97,7 +97,7 @@ struct SuperScalar {
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar operator+(const SuperScalar& src) {
+  SuperScalar operator+(const SuperScalar& src) const {
     SuperScalar tmp = *this;
     for (int i = 0; i < N; i++) {
       tmp.val[i] += src.val[i];
@@ -540,8 +540,6 @@ TEST(TEST_CATEGORY, atomics) {
 
 // FIXME_SYCL atomics for large types to be implemented
 #ifndef KOKKOS_ENABLE_SYCL
-  // FIXME_HIP HIP doesn't yet support atomics for >64bit types properly
-#ifndef KOKKOS_ENABLE_HIP
   ASSERT_TRUE(
       (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 1)));
   ASSERT_TRUE(
@@ -567,7 +565,6 @@ TEST(TEST_CATEGORY, atomics) {
 #endif
 #endif
 #endif
-#endif
 }
 
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp
index b926058ebf..be0c1e50d7 100644
--- a/lib/kokkos/core/unit_test/TestComplex.hpp
+++ b/lib/kokkos/core/unit_test/TestComplex.hpp
@@ -515,4 +515,44 @@ TEST(TEST_CATEGORY, complex_issue_3867) {
 #undef CHECK_POW_COMPLEX_PROMOTION
 }
 
+TEST(TEST_CATEGORY, complex_operations_arithmetic_types_overloads) {
+#define STATIC_ASSERT(cond) static_assert(cond, "")
+
+  STATIC_ASSERT(Kokkos::real(1) == 1.);
+  STATIC_ASSERT(Kokkos::real(2.f) == 2.f);
+  STATIC_ASSERT(Kokkos::real(3.) == 3.);
+  STATIC_ASSERT(Kokkos::real(4.l) == 4.l);
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::real(1)), double>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::real(2.f)), float>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::real(3.)), double>::value));
+  STATIC_ASSERT(
+      (std::is_same<decltype(Kokkos::real(4.l)), long double>::value));
+
+  STATIC_ASSERT(Kokkos::imag(1) == 0.);
+  STATIC_ASSERT(Kokkos::imag(2.f) == 0.f);
+  STATIC_ASSERT(Kokkos::imag(3.) == 0.);
+  STATIC_ASSERT(Kokkos::imag(4.l) == 0.l);
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::imag(1)), double>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::imag(2.f)), float>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::imag(3.)), double>::value));
+  STATIC_ASSERT(
+      (std::is_same<decltype(Kokkos::real(4.l)), long double>::value));
+
+  // FIXME in principle could be checked at compile time too
+  ASSERT_EQ(Kokkos::conj(1), Kokkos::complex<double>(1));
+  ASSERT_EQ(Kokkos::conj(2.f), Kokkos::complex<float>(2.f));
+  ASSERT_EQ(Kokkos::conj(3.), Kokkos::complex<double>(3.));
+  ASSERT_EQ(Kokkos::conj(4.l), Kokkos::complex<long double>(4.l));
+  STATIC_ASSERT((
+      std::is_same<decltype(Kokkos::conj(1)), Kokkos::complex<double>>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::conj(2.f)),
+                              Kokkos::complex<float>>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::conj(3.)),
+                              Kokkos::complex<double>>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::conj(4.l)),
+                              Kokkos::complex<long double>>::value));
+
+#undef STATIC_ASSERT
+}
+
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestDeepCopyAlignment.hpp b/lib/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
index 49f8daf89e..f487a015fb 100644
--- a/lib/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
+++ b/lib/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
@@ -296,7 +296,7 @@ struct TestDeepCopyScalarConversion {
 
     int64_t errors = 0;
     Kokkos::deep_copy(errors, error_count);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
 
     Kokkos::deep_copy(view_s1_1d, static_cast<Scalar1>(0));
     Kokkos::deep_copy(view_s1_2d, static_cast<Scalar1>(0));
@@ -306,7 +306,7 @@ struct TestDeepCopyScalarConversion {
                                              Kokkos::IndexType<int64_t>>(0, N0),
                          *this);
     Kokkos::deep_copy(errors, error_count);
-    ASSERT_TRUE(errors > 0);
+    ASSERT_GT(errors, 0);
 
     Kokkos::deep_copy(error_count, 0);
     Kokkos::deep_copy(TEST_EXECSPACE(), view_s1_1d, view_s2_1d);
@@ -318,7 +318,7 @@ struct TestDeepCopyScalarConversion {
                          *this);
 
     Kokkos::deep_copy(errors, error_count);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 };
 }  // namespace Impl
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index 8a9263c8df..90e485998e 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -79,7 +79,7 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device,
   int numa_idx    = (do_other ? 3 : 0) + (do_threads ? 1 : 0);
   int device_idx =
       (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0);
-  int tune_idx = (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) +
+  int tune_idx = (do_other ? 4 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) +
                  (do_device ? 1 : 0);
 
   if (do_threads) {
diff --git a/lib/kokkos/core/unit_test/TestDetectionIdiom.cpp b/lib/kokkos/core/unit_test/TestDetectionIdiom.cpp
new file mode 100644
index 0000000000..f87fda6156
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDetectionIdiom.cpp
@@ -0,0 +1,96 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_DetectionIdiom.hpp>
+
+#define STATIC_ASSERT(cond) static_assert(cond, "");
+
+void test_nonesuch() {
+  using Kokkos::nonesuch;
+  STATIC_ASSERT(!std::is_constructible<nonesuch>::value);
+  STATIC_ASSERT(!std::is_destructible<nonesuch>::value);
+  STATIC_ASSERT(!std::is_copy_constructible<nonesuch>::value);
+  STATIC_ASSERT(!std::is_move_constructible<nonesuch>::value);
+#ifdef KOKKOS_ENABLE_CXX17
+  STATIC_ASSERT(!std::is_aggregate<nonesuch>::value);
+#endif
+}
+
+#undef STATIC_ASSERT
+
+namespace Example {
+// Example from https://en.cppreference.com/w/cpp/experimental/is_detected
+template <class T>
+using copy_assign_t = decltype(std::declval<T&>() = std::declval<const T&>());
+
+struct Meow {};
+struct Purr {
+  void operator=(const Purr&) = delete;
+};
+
+static_assert(Kokkos::is_detected<copy_assign_t, Meow>::value,
+              "Meow should be copy assignable!");
+static_assert(!Kokkos::is_detected<copy_assign_t, Purr>::value,
+              "Purr should not be copy assignable!");
+static_assert(Kokkos::is_detected_exact<Meow&, copy_assign_t, Meow>::value,
+              "Copy assignment of Meow should return Meow&!");
+
+template <class T>
+using diff_t = typename T::difference_type;
+
+template <class Ptr>
+using difference_type = Kokkos::detected_or_t<std::ptrdiff_t, diff_t, Ptr>;
+
+struct Woof {
+  using difference_type = int;
+};
+struct Bark {};
+
+static_assert(std::is_same<difference_type<Woof>, int>::value,
+              "Woof's difference_type should be int!");
+static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
+              "Bark's difference_type should be ptrdiff_t!");
+}  // namespace Example
+
+int main() {}
diff --git a/lib/kokkos/core/unit_test/TestExecSpacePartitioning.hpp b/lib/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
new file mode 100644
index 0000000000..f8f5275d3d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
@@ -0,0 +1,129 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+namespace {
+struct SumFunctor {
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i, int& lsum) const { lsum += i; }
+};
+
+template <class ExecSpace>
+void check_distinctive(ExecSpace, ExecSpace) {}
+
+#ifdef KOKKOS_ENABLE_CUDA
+void check_distinctive(Kokkos::Cuda exec1, Kokkos::Cuda exec2) {
+  ASSERT_NE(exec1.cuda_stream(), exec2.cuda_stream());
+}
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+void check_distinctive(Kokkos::Experimental::HIP exec1,
+                       Kokkos::Experimental::HIP exec2) {
+  ASSERT_NE(exec1.hip_stream(), exec2.hip_stream());
+}
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+void check_distinctive(Kokkos::Experimental::SYCL exec1,
+                       Kokkos::Experimental::SYCL exec2) {
+  ASSERT_NE(*exec1.impl_internal_space_instance()->m_queue,
+            *exec2.impl_internal_space_instance()->m_queue);
+}
+#endif
+}  // namespace
+
+void test_partitioning(std::vector<TEST_EXECSPACE>& instances) {
+  check_distinctive(instances[0], instances[1]);
+  int sum1, sum2;
+  int N = 3910;
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<TEST_EXECSPACE>(instances[0], 0, N), SumFunctor(),
+      sum1);
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<TEST_EXECSPACE>(instances[1], 0, N), SumFunctor(),
+      sum2);
+  ASSERT_EQ(sum1, sum2);
+  ASSERT_EQ(sum1, N * (N - 1) / 2);
+
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
+    defined(KOKKOS_ENABLE_SYCL)
+  // Eliminate unused function warning
+  // (i.e. when compiling for Serial and CUDA, during Serial compilation the
+  // Cuda overload is unused ...)
+  if (sum1 != sum2) {
+#ifdef KOKKOS_ENABLE_CUDA
+    check_distinctive(Kokkos::Cuda(), Kokkos::Cuda());
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+    check_distinctive(Kokkos::Experimental::HIP(), Kokkos::Experimental::HIP());
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+    check_distinctive(Kokkos::Experimental::SYCL(),
+                      Kokkos::Experimental::SYCL());
+#endif
+  }
+#endif
+}
+
+TEST(TEST_CATEGORY, partitioning_by_args) {
+  auto instances =
+      Kokkos::Experimental::partition_space(TEST_EXECSPACE(), 1, 1.);
+  ASSERT_EQ(int(instances.size()), 2);
+  test_partitioning(instances);
+}
+
+TEST(TEST_CATEGORY, partitioning_by_vector) {
+  std::vector<int> weights{1, 1};
+  auto instances =
+      Kokkos::Experimental::partition_space(TEST_EXECSPACE(), weights);
+  ASSERT_EQ(int(instances.size()), 2);
+  test_partitioning(instances);
+}
+}  // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp b/lib/kokkos/core/unit_test/TestExecutionSpace.hpp
similarity index 68%
rename from lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
rename to lib/kokkos/core/unit_test/TestExecutionSpace.hpp
index 4228b5181a..8e4331e809 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
+++ b/lib/kokkos/core/unit_test/TestExecutionSpace.hpp
@@ -42,5 +42,39 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestSharedAlloc.hpp>
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace {
+
+struct StructCopy {
+  Kokkos::DefaultExecutionSpace device;
+  Kokkos::DefaultHostExecutionSpace host;
+};
+
+template <class ExecutionSpace>
+void check_struct_copy() {
+#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
+  // FIXME_OPENMPTARGET nvlink error: Undefined reference to
+  // '_ZSt25__throw_bad_function_callv' in
+  // '/tmp/TestOpenMPTarget_ExecutionSpace-434d81.cubin'
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+  StructCopy data;
+  parallel_for(
+      Kokkos::RangePolicy<ExecutionSpace>(0, 1), KOKKOS_LAMBDA(int) {
+        StructCopy data2 = data;
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF("%i \n", data2.device.in_parallel());
+      });
+#endif
+#endif
+}
+
+}  // namespace
+
+TEST(TEST_CATEGORY, copy_structure) { check_struct_copy<TEST_EXECSPACE>(); }
+}  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestHalfConversion.hpp b/lib/kokkos/core/unit_test/TestHalfConversion.hpp
index 277fb1b042..992f56cc6b 100644
--- a/lib/kokkos/core/unit_test/TestHalfConversion.hpp
+++ b/lib/kokkos/core/unit_test/TestHalfConversion.hpp
@@ -53,7 +53,7 @@ void test_half_conversion_type() {
   T base                         = static_cast<T>(3.3);
   Kokkos::Experimental::half_t a = Kokkos::Experimental::cast_to_half(base);
   T b                            = Kokkos::Experimental::cast_from_half<T>(a);
-  ASSERT_TRUE((double(b - base) / double(base)) < epsilon);
+  ASSERT_LT((double(b - base) / double(base)), epsilon);
 
 // TODO: Remove ifndef once https://github.com/kokkos/kokkos/pull/3480 merges
 #ifndef KOKKOS_ENABLE_SYCL
@@ -67,7 +67,7 @@ void test_half_conversion_type() {
       });
 
   Kokkos::deep_copy(b, b_v);
-  ASSERT_TRUE((double(b - base) / double(base)) < epsilon);
+  ASSERT_LT((double(b - base) / double(base)), epsilon);
 #endif  // KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
 #endif  // KOKKOS_ENABLE_SYCL
 }
diff --git a/lib/kokkos/core/unit_test/TestHalfOperators.hpp b/lib/kokkos/core/unit_test/TestHalfOperators.hpp
index db52a05d5d..c4cf8a7457 100644
--- a/lib/kokkos/core/unit_test/TestHalfOperators.hpp
+++ b/lib/kokkos/core/unit_test/TestHalfOperators.hpp
@@ -269,6 +269,85 @@ enum OP_TESTS {
   N_OP_TESTS
 };
 
+template <class view_type>
+struct Functor_TestHalfVolatileOperators {
+  volatile half_t h_lhs, h_rhs;
+  view_type actual_lhs, expected_lhs;
+  double d_lhs, d_rhs;
+  Functor_TestHalfVolatileOperators(volatile half_t lhs = half_t(0),
+                                    volatile half_t rhs = half_t(0))
+      : h_lhs(lhs), h_rhs(rhs) {
+    actual_lhs   = view_type("actual_lhs", N_OP_TESTS);
+    expected_lhs = view_type("expected_lhs", N_OP_TESTS);
+    d_lhs        = cast_from_half<double>(h_lhs);
+    d_rhs        = cast_from_half<double>(h_rhs);
+    if (std::is_same<view_type, ViewTypeHost>::value) {
+      auto run_on_host = *this;
+      run_on_host(0);
+    } else {
+      Kokkos::parallel_for("Test::Functor_TestHalfVolatileOperators",
+                           Kokkos::RangePolicy<ExecutionSpace>(0, 1), *this);
+    }
+  }
+
+  KOKKOS_FUNCTION
+  void operator()(int) const {
+    volatile half_t tmp_lhs;
+
+    // Initialze output views to catch missing test invocations
+    for (int i = 0; i < N_OP_TESTS; ++i) {
+      actual_lhs(i)   = 1;
+      expected_lhs(i) = -1;
+    }
+
+    tmp_lhs              = h_lhs;
+    actual_lhs(ASSIGN)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(ASSIGN) = d_lhs;
+
+    actual_lhs(LT)   = h_lhs < h_rhs;
+    expected_lhs(LT) = d_lhs < d_rhs;
+
+    actual_lhs(LE)   = h_lhs <= h_rhs;
+    expected_lhs(LE) = d_lhs <= d_rhs;
+
+    actual_lhs(NEQ)   = h_lhs != h_rhs;
+    expected_lhs(NEQ) = d_lhs != d_rhs;
+
+    actual_lhs(GT)   = h_lhs > h_rhs;
+    expected_lhs(GT) = d_lhs > d_rhs;
+
+    actual_lhs(GE)   = h_lhs >= h_rhs;
+    expected_lhs(GE) = d_lhs >= d_rhs;
+
+    actual_lhs(EQ)   = h_lhs == h_rhs;
+    expected_lhs(EQ) = d_lhs == d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs += h_rhs;
+    actual_lhs(CADD_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CADD_H_H) = d_lhs;
+    expected_lhs(CADD_H_H) += d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs -= h_rhs;
+    actual_lhs(CSUB_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CSUB_H_H) = d_lhs;
+    expected_lhs(CSUB_H_H) -= d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs *= h_rhs;
+    actual_lhs(CMUL_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CMUL_H_H) = d_lhs;
+    expected_lhs(CMUL_H_H) *= d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs /= h_rhs;
+    actual_lhs(CDIV_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CDIV_H_H) = d_lhs;
+    expected_lhs(CDIV_H_H) /= d_rhs;
+  }
+};
+
 template <class view_type>
 struct Functor_TestHalfOperators {
   half_t h_lhs, h_rhs;
@@ -840,8 +919,33 @@ void __test_half_operators(half_t h_lhs, half_t h_rhs) {
                 epsilon);
   }
 
-  // Check whether half_t is trivially copyable
-  ASSERT_TRUE(std::is_trivially_copyable<half_t>::value);
+  // Test partial volatile support
+  volatile half_t _h_lhs = h_lhs;
+  volatile half_t _h_rhs = h_rhs;
+  Functor_TestHalfVolatileOperators<ViewType> f_volatile_device(_h_lhs, _h_rhs);
+  Functor_TestHalfVolatileOperators<ViewTypeHost> f_volatile_host(_h_lhs,
+                                                                  _h_rhs);
+
+  ExecutionSpace().fence();
+  Kokkos::deep_copy(f_device_actual_lhs, f_device.actual_lhs);
+  Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs);
+  for (int op_test = 0; op_test < N_OP_TESTS; op_test++) {
+    // printf("op_test = %d\n", op_test);
+    if (op_test == ASSIGN || op_test == LT || op_test == LE || op_test == NEQ ||
+        op_test == EQ || op_test == GT || op_test == GE ||
+        op_test == CADD_H_H || op_test == CSUB_H_H || op_test == CMUL_H_H ||
+        op_test == CDIV_H_H) {
+      ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test),
+                  epsilon);
+      ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test),
+                  epsilon);
+    }
+  }
+
+  // is_trivially_copyable is false with the addition of explicit
+  // copy constructors that are required for supporting reductions
+  // ASSERT_TRUE(std::is_trivially_copyable<half_t>::value);
+
   constexpr size_t n       = 2;
   constexpr size_t n_bytes = sizeof(half_t) * n;
   const half_t h_arr0 = half_t(0x89ab), h_arr1 = half_t(0xcdef);
@@ -854,11 +958,11 @@ void __test_half_operators(half_t h_lhs, half_t h_rhs) {
   h_arr_ptr = reinterpret_cast<char*>(h_arr);
 
   std::memcpy(c_arr, h_arr, n_bytes);
-  for (i = 0; i < n_bytes; i++) ASSERT_TRUE(c_arr[i] == h_arr_ptr[i]);
+  for (i = 0; i < n_bytes; i++) ASSERT_EQ(c_arr[i], h_arr_ptr[i]);
 
   std::memcpy(h_arr, c_arr, n_bytes);
-  ASSERT_TRUE(h_arr[0] == h_arr0);
-  ASSERT_TRUE(h_arr[1] == h_arr1);
+  ASSERT_EQ(h_arr[0], h_arr0);
+  ASSERT_EQ(h_arr[1], h_arr1);
 }
 
 void test_half_operators() {
@@ -870,7 +974,6 @@ void test_half_operators() {
     // TODO: __test_half_operators(h_lhs + cast_to_half(i + 1), half_t(0));
     // TODO: __test_half_operators(half_t(0), h_rhs + cast_to_half(i));
   }
-  // TODO: __test_half_operators(0, 0);
 }
 
 TEST(TEST_CATEGORY, half_operators) { test_half_operators(); }
diff --git a/lib/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/lib/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
index 18d1ac8518..10180251ba 100644
--- a/lib/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
+++ b/lib/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
@@ -52,14 +52,17 @@ using Kokkos::Impl::HostSharedPtr;
 namespace {
 
 class Data {
-  Kokkos::Array<char, 64> d;
+  char d[64];
 
  public:
-  KOKKOS_FUNCTION void write(char const* c) {
-    for (int i = 0; i < 64 && c; ++i, ++c) {
-      d[i] = *c;
-    }
+  // Because strncpy is not supported within device code
+  static KOKKOS_FUNCTION void my_strncpy(char* dst, const char* src,
+                                         size_t cnt) {
+    while (cnt-- > 0 && (*dst++ = *src++) != '\0')
+      ;
+    while (cnt-- > 0) *dst++ = '\0';
   }
+  KOKKOS_FUNCTION void write(char const* s) { my_strncpy(d, s, sizeof(d)); }
 };
 
 template <class SmartPtr>
@@ -154,3 +157,135 @@ TEST(TEST_CATEGORY, host_shared_ptr_special_members_on_device) {
   check_special_members_on_device(device_ptr);
 }
 #endif
+
+// FIXME_OPENMPTARGET
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) && \
+    !defined(KOKKOS_ENABLE_OPENMPTARGET)
+namespace {
+
+struct Bar {
+  double val;
+};
+
+struct Foo {
+  Foo(bool allocate = false) : ptr(allocate ? new Bar : nullptr) {}
+  Kokkos::Impl::HostSharedPtr<Bar> ptr;
+  int use_count() { return ptr.use_count(); }
+};
+
+template <class DevMemSpace, class HostMemSpace>
+void host_shared_ptr_test_reference_counting() {
+  using ExecSpace = typename DevMemSpace::execution_space;
+  bool is_gpu =
+      !Kokkos::SpaceAccessibility<ExecSpace, Kokkos::HostSpace>::accessible;
+
+  // Create two tracked instances
+  Foo f1(true), f2(true);
+  // Scope Views
+  {
+    Foo* fp_d_ptr =
+        static_cast<Foo*>(Kokkos::kokkos_malloc<DevMemSpace>(sizeof(Foo)));
+    Kokkos::View<Foo, DevMemSpace> fp_d(fp_d_ptr);
+    // If using UVM or on the CPU don't make an extra HostCopy
+    Foo* fp_h_ptr = std::is_same<DevMemSpace, HostMemSpace>::value
+                        ? fp_d_ptr
+                        : static_cast<Foo*>(
+                              Kokkos::kokkos_malloc<HostMemSpace>(sizeof(Foo)));
+    Kokkos::View<Foo, HostMemSpace> fp_h(fp_h_ptr);
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+
+    // Just for the sake of it initialize the data of the host copy
+    new (fp_h.data()) Foo();
+    // placement new in kernel
+    //  if on GPU: should not increase use_count, fp_d will not be tracked
+    //  if on Host: refcount will increase fp_d is tracked
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<ExecSpace>(0, 1),
+        KOKKOS_LAMBDA(int) { new (fp_d.data()) Foo(f1); });
+    Kokkos::fence();
+    Kokkos::deep_copy(fp_h, fp_d);
+
+    if (is_gpu)
+      ASSERT_EQ(1, f1.use_count());
+    else
+      ASSERT_EQ(2, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+
+    // assignment operator on host, will increase f2 use_count
+    //   if default device is GPU: fp_h was untracked
+    //   if default device is CPU: fp_h was tracked and use_count was 2 for
+    //   aliasing f1, in which case use_count will be decreased here
+    fp_h() = f2;
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(2, f2.use_count());
+
+    Kokkos::deep_copy(fp_d, fp_h);
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(2, f2.use_count());
+
+    // assignment in kernel:
+    //  If on GPU: should not increase use_count of f1 and fp_d will not be
+    //  tracked.
+    //  If on Host: use_count will increase of f1, fp_d is tracked,
+    //  use_count of f2 goes down.
+    //  Since we are messing with the use count on the device: make host copy
+    //  untracked first. Note if fp_d and fp_h alias each other (e.g. compiling
+    //  for CPU only) that means fp_d() will be untracked too during assignemnt
+    fp_h() = Foo();
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<ExecSpace>(0, 1),
+        KOKKOS_LAMBDA(int) { fp_d() = f1; });
+    Kokkos::fence();
+    Kokkos::deep_copy(fp_h, fp_d);
+
+    if (is_gpu)
+      ASSERT_EQ(1, f1.use_count());
+    else
+      ASSERT_EQ(2, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+
+    // Assign non-tracked ptr
+    //   if  if_gpu will not change use_count
+    //   if !is_gpu will decrease use_count of f1
+    fp_h() = Foo();
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+    fp_h() = f2;
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(2, f2.use_count());
+
+    // before deleting host version make sure its not tracked
+    fp_h() = Foo();
+    if (fp_h_ptr != fp_d_ptr) Kokkos::kokkos_free<HostMemSpace>(fp_h_ptr);
+    Kokkos::kokkos_free<DevMemSpace>(fp_d_ptr);
+  }
+
+  ASSERT_EQ(1, f1.use_count());
+  ASSERT_EQ(1, f2.use_count());
+}
+}  // namespace
+
+TEST(TEST_CATEGORY, host_shared_ptr_tracking) {
+  host_shared_ptr_test_reference_counting<typename TEST_EXECSPACE::memory_space,
+                                          Kokkos::HostSpace>();
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value)
+    host_shared_ptr_test_reference_counting<Kokkos::CudaUVMSpace,
+                                            Kokkos::CudaUVMSpace>();
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value)
+    host_shared_ptr_test_reference_counting<
+        Kokkos::Experimental::SYCLSharedUSMSpace,
+        Kokkos::Experimental::SYCLSharedUSMSpace>();
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
+    host_shared_ptr_test_reference_counting<
+        Kokkos::Experimental::HIPHostPinnedSpace,
+        Kokkos::Experimental::HIPHostPinnedSpace>();
+#endif
+}
+
+#endif  // KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
diff --git a/lib/kokkos/core/unit_test/TestInterOp.cpp b/lib/kokkos/core/unit_test/TestInterOp.cpp
new file mode 100644
index 0000000000..7f08afada9
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestInterOp.cpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DynRankView.hpp>
+#include <KokkosExp_InterOp.hpp>
+
+// View
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::View<double*>>,
+        Kokkos::View<
+            double*, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: View");
+
+// DynRankView
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<double>>,
+        Kokkos::DynRankView<
+            double, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: DynRankView");
+
+// View + Execution Space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<
+            Kokkos::View<double*, Kokkos::DefaultExecutionSpace>>,
+        Kokkos::View<
+            double*, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: View + Execution Space");
+
+// DynRankView + Execution Space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<
+            Kokkos::DynRankView<double, Kokkos::DefaultExecutionSpace>>,
+        Kokkos::DynRankView<
+            double, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Execution Space");
+
+// View + Memory space
+static_assert(std::is_same<Kokkos::Experimental::python_view_type_t<
+                               Kokkos::View<int64_t*, Kokkos::HostSpace>>,
+                           Kokkos::View<int64_t*, Kokkos::LayoutRight,
+                                        Kokkos::HostSpace>>::value,
+              "Error! Unexpected python_view_type for: View + Memory space");
+
+// DynRankView + Memory space
+static_assert(
+    std::is_same<Kokkos::Experimental::python_view_type_t<
+                     Kokkos::DynRankView<int16_t, Kokkos::HostSpace>>,
+                 Kokkos::DynRankView<int16_t, Kokkos::LayoutRight,
+                                     Kokkos::HostSpace>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Memory space");
+
+// View + Layout + Execution space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::View<
+            int**, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>>,
+        Kokkos::View<int**, Kokkos::LayoutLeft,
+                     typename Kokkos::DefaultExecutionSpace::memory_space>>::
+        value,
+    "Error! Unexpected python_view_type for: View + Layout + Execution space");
+
+// DynRankView + Layout + Execution space
+static_assert(
+    std::is_same<Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<
+                     int, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>>,
+                 Kokkos::DynRankView<int, Kokkos::LayoutLeft,
+                                     typename Kokkos::DefaultExecutionSpace::
+                                         memory_space>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Layout + Execution "
+    "space");
+
+// View + Layout + Memory Space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<
+            Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>,
+        Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>::value,
+    "Error! Unexpected python_view_type for: View + Layout + Memory Space");
+
+// DynRankView + Layout + Memory Space
+static_assert(
+    std::is_same<Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<
+                     uint64_t, Kokkos::LayoutLeft, Kokkos::HostSpace>>,
+                 Kokkos::DynRankView<uint64_t, Kokkos::LayoutLeft,
+                                     Kokkos::HostSpace>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Layout + Memory "
+    "Space");
+
+// View + Layout + Execution space + Memory Trait
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::View<
+            float***, Kokkos::LayoutLeft, Kokkos::DefaultHostExecutionSpace,
+            Kokkos::MemoryTraits<Kokkos::RandomAccess>>>,
+        Kokkos::View<float***, Kokkos::LayoutLeft,
+                     typename Kokkos::DefaultHostExecutionSpace::memory_space,
+                     Kokkos::MemoryTraits<Kokkos::RandomAccess>>>::value,
+    "Error! Unexpected python_view_type for: View + Layout + Execution space + "
+    "Memory Trait");
+
+// DynRankView + Layout + Execution space  + Memory trait
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<
+            float, Kokkos::LayoutLeft, Kokkos::DefaultHostExecutionSpace,
+            Kokkos::MemoryTraits<Kokkos::Atomic>>>,
+        Kokkos::DynRankView<
+            float, Kokkos::LayoutLeft,
+            typename Kokkos::DefaultHostExecutionSpace::memory_space,
+            Kokkos::MemoryTraits<Kokkos::Atomic>>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Layout + Execution "
+    "space  + Memory trait");
diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp
index 5618e40989..57461be714 100644
--- a/lib/kokkos/core/unit_test/TestMDRange.hpp
+++ b/lib/kokkos/core/unit_test/TestMDRange.hpp
@@ -2751,9 +2751,18 @@ struct TestMDRange_6D {
                            const int N3, const int N4, const int N5) {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -2772,9 +2781,18 @@ struct TestMDRange_6D {
 #endif
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -2807,9 +2825,18 @@ struct TestMDRange_6D {
 
     // Test with reducers - scalar
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
 #ifdef KOKKOS_ENABLE_SYCL
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 2, 2, 2}});
@@ -2832,9 +2859,18 @@ struct TestMDRange_6D {
 
     // Test with reducers - scalar + label
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
 
 #ifdef KOKKOS_ENABLE_SYCL
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
@@ -2858,9 +2894,19 @@ struct TestMDRange_6D {
 
     // Test with reducers - scalar view
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type =
+          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
+                                         Kokkos::IndexType<int>,
+                                         Kokkos::LaunchBounds<512, 1>>;
+#endif
 #ifdef KOKKOS_ENABLE_SYCL
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 2, 2, 2}});
@@ -2888,9 +2934,18 @@ struct TestMDRange_6D {
     // Test Min reducer with lambda
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       range_type range({{1, 1, 1, 1, 1, 1}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 2, 2, 1}});
 
@@ -2923,9 +2978,19 @@ struct TestMDRange_6D {
 
     // Tagged operator test
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
           Kokkos::IndexType<int>, InitTag>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
+          Kokkos::IndexType<int>, InitTag>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -2977,9 +3042,18 @@ struct TestMDRange_6D {
                         const int N4, const int N5) {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3028,8 +3102,16 @@ struct TestMDRange_6D {
 #endif
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>>;
+#endif
       using point_type = typename range_type::point_type;
 
       range_type range(point_type{{0, 0, 0, 0, 0, 0}},
@@ -3062,9 +3144,18 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>, InitTag>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>, InitTag>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3115,9 +3206,18 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3158,9 +3258,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3201,9 +3311,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Left>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Left, Iterate::Left>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3244,9 +3364,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Right>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Left, Iterate::Right>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3287,9 +3417,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Left>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Right, Iterate::Left>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3330,9 +3470,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Right>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Right, Iterate::Right>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3683,9 +3833,18 @@ struct TestMDRange_6D_NegIdx {
   static void test_6D_negidx(const int N0, const int N1, const int N2,
                              const int N3, const int N4, const int N5) {
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<256, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
diff --git a/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp
index 777f91aea3..b38871afaa 100644
--- a/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp
+++ b/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp
@@ -601,7 +601,8 @@ TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.f, 3.f);
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2., 3.);
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
-#if !(defined(KOKKOS_ARCH_POWER8) || defined(KOKKOS_ARCH_POWER9))  // FIXME
+// FIXME: fails with gcc on Power platforms
+#if !(defined(KOKKOS_ARCH_POWER8) || defined(KOKKOS_ARCH_POWER9))
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.l, 3.l);
 #endif
 #endif
@@ -668,7 +669,13 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(log10)({1234.l, 567.l, 89.l, .003l});
 #endif
 
+// FIXME_OPENMPTARGET FIXME_AMD
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) &&                           \
+    (defined(KOKKOS_ARCH_VEGA906) || defined(KOKKOS_ARCH_VEGA908) || \
+     defined(KOKKOS_ARCH_VEGA90A))
+
   TEST_MATH_FUNCTION(log2)({1, 23, 456, 7890});
+#endif
   TEST_MATH_FUNCTION(log2)({1l, 23l, 456l, 7890l});
   TEST_MATH_FUNCTION(log2)({1ll, 23ll, 456ll, 7890ll});
   TEST_MATH_FUNCTION(log2)({1u, 23u, 456u, 7890u});
@@ -869,3 +876,69 @@ TEST(TEST_CATEGORY,
 #endif
 #endif
 }
+
+template <class Space>
+struct TestAbsoluteValueFunction {
+  TestAbsoluteValueFunction() { run(); }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0);
+  }
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    using Kokkos::Experimental::abs;
+    if (abs(1) != 1 || abs(-1) != 1) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(int)\n");
+    }
+    if (abs(2l) != 2l || abs(-2l) != 2l) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long int)\n");
+    }
+    if (abs(3ll) != 3ll || abs(-3ll) != 3ll) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long long int)\n");
+    }
+    if (abs(4.f) != 4.f || abs(-4.f) != 4.f) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(float)\n");
+    }
+    if (abs(5.) != 5. || abs(-5.) != 5.) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(double)\n");
+    }
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    if (abs(6.l) != 6.l || abs(-6.l) != 6.l) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long double)\n");
+    }
+#endif
+    // special values
+    using Kokkos::Experimental::isinf;
+    using Kokkos::Experimental::isnan;
+    if (abs(-0.) != 0.
+    // WORKAROUND icpx changing default FP model when optimization level is >= 1
+    // using -fp-model=precise works too
+#ifndef __INTEL_LLVM_COMPILER
+        || !isinf(abs(-INFINITY)) || !isnan(abs(-NAN))
+#endif
+    ) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "failed abs(floating_point) special values\n");
+    }
+
+    static_assert(std::is_same<decltype(abs(1)), int>::value, "");
+    static_assert(std::is_same<decltype(abs(2l)), long>::value, "");
+    static_assert(std::is_same<decltype(abs(3ll)), long long>::value, "");
+    static_assert(std::is_same<decltype(abs(4.f)), float>::value, "");
+    static_assert(std::is_same<decltype(abs(5.)), double>::value, "");
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    static_assert(std::is_same<decltype(abs(6.l)), long double>::value, "");
+#endif
+  }
+};
+
+TEST(TEST_CATEGORY, mathematical_functions_absolute_value) {
+  TestAbsoluteValueFunction<TEST_EXECSPACE>();
+}
diff --git a/lib/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp b/lib/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
new file mode 100644
index 0000000000..2d9b4db6bd
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
@@ -0,0 +1,1895 @@
+#include <fstream>
+#include <gtest/gtest.h>
+#include "Kokkos_Core.hpp"
+
+namespace Test {
+
+struct TestLargeArgTag {};
+struct TestRealErfcxTag {};
+
+template <class ExecSpace>
+struct TestExponentialIntergral1Function {
+  using ViewType     = Kokkos::View<double*, ExecSpace>;
+  using HostViewType = Kokkos::View<double*, Kokkos::HostSpace>;
+
+  ViewType d_x, d_expint;
+  typename ViewType::HostMirror h_x, h_expint;
+  HostViewType h_ref;
+
+  void testit() {
+    using Kokkos::Experimental::fabs;
+    using Kokkos::Experimental::infinity;
+
+    d_x      = ViewType("d_x", 15);
+    d_expint = ViewType("d_expint", 15);
+    h_x      = Kokkos::create_mirror_view(d_x);
+    h_expint = Kokkos::create_mirror_view(d_expint);
+    h_ref    = HostViewType("h_ref", 15);
+
+    // Generate test inputs
+    h_x(0)  = -0.2;
+    h_x(1)  = 0.0;
+    h_x(2)  = 0.2;
+    h_x(3)  = 0.8;
+    h_x(4)  = 1.6;
+    h_x(5)  = 5.1;
+    h_x(6)  = 0.01;
+    h_x(7)  = 0.001;
+    h_x(8)  = 1.0;
+    h_x(9)  = 1.001;
+    h_x(10) = 1.01;
+    h_x(11) = 1.1;
+    h_x(12) = 7.2;
+    h_x(13) = 10.3;
+    h_x(14) = 15.4;
+    Kokkos::deep_copy(d_x, h_x);
+
+    // Call exponential integral function
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 15), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_expint, d_expint);
+
+    // Reference values computed with Octave
+    h_ref(0)  = -infinity<double>::value;  // x(0)=-0.2
+    h_ref(1)  = infinity<double>::value;   // x(1)= 0.0
+    h_ref(2)  = 1.222650544183893e+00;     // x(2) =0.2
+    h_ref(3)  = 3.105965785455429e-01;     // x(3) =0.8
+    h_ref(4)  = 8.630833369753976e-02;     // x(4) =1.6
+    h_ref(5)  = 1.021300107861738e-03;     // x(5) =5.1
+    h_ref(6)  = 4.037929576538113e+00;     // x(6) =0.01
+    h_ref(7)  = 6.331539364136149e+00;     // x(7) =0.001
+    h_ref(8)  = 2.193839343955205e-01;     // x(8) =1.0
+    h_ref(9)  = 2.190164225274689e-01;     // x(9) =1.001
+    h_ref(10) = 2.157416237944899e-01;     // x(10)=1.01
+    h_ref(11) = 1.859909045360401e-01;     // x(11)=1.1
+    h_ref(12) = 9.218811688716196e-05;     // x(12)=7.2
+    h_ref(13) = 2.996734771597901e-06;     // x(13)=10.3
+    h_ref(14) = 1.254522935050609e-08;     // x(14)=15.4
+
+    EXPECT_EQ(h_ref(0), h_expint(0));
+    EXPECT_EQ(h_ref(1), h_expint(1));
+    for (int i = 2; i < 15; i++) {
+      EXPECT_LE(std::abs(h_expint(i) - h_ref(i)), std::abs(h_ref(i)) * 1e-15);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_expint(i) = Kokkos::Experimental::expint1(d_x(i));
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexErrorFunction {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+  using DblViewType     = Kokkos::View<double*, ExecSpace>;
+  using DblHostViewType = Kokkos::View<double*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_erf, d_erfcx;
+  typename ViewType::HostMirror h_z, h_erf, h_erfcx;
+  HostViewType h_ref_erf, h_ref_erfcx;
+
+  DblViewType d_x, d_erfcx_dbl;
+  typename DblViewType::HostMirror h_x, h_erfcx_dbl;
+  DblHostViewType h_ref_erfcx_dbl;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    d_z         = ViewType("d_z", 52);
+    d_erf       = ViewType("d_erf", 52);
+    d_erfcx     = ViewType("d_erfcx", 52);
+    h_z         = Kokkos::create_mirror_view(d_z);
+    h_erf       = Kokkos::create_mirror_view(d_erf);
+    h_erfcx     = Kokkos::create_mirror_view(d_erfcx);
+    h_ref_erf   = HostViewType("h_ref_erf", 52);
+    h_ref_erfcx = HostViewType("h_ref_erfcx", 52);
+
+    d_x             = DblViewType("d_x", 6);
+    d_erfcx_dbl     = DblViewType("d_erfcx_dbl", 6);
+    h_x             = Kokkos::create_mirror_view(d_x);
+    h_erfcx_dbl     = Kokkos::create_mirror_view(d_erfcx_dbl);
+    h_ref_erfcx_dbl = DblHostViewType("h_ref_erfcx_dbl", 6);
+
+    // Generate test inputs
+    // abs(z)<=2
+    h_z(0)  = Kokkos::complex<double>(0.0011, 0);
+    h_z(1)  = Kokkos::complex<double>(-0.0011, 0);
+    h_z(2)  = Kokkos::complex<double>(1.4567, 0);
+    h_z(3)  = Kokkos::complex<double>(-1.4567, 0);
+    h_z(4)  = Kokkos::complex<double>(0, 0.0011);
+    h_z(5)  = Kokkos::complex<double>(0, -0.0011);
+    h_z(6)  = Kokkos::complex<double>(0, 1.4567);
+    h_z(7)  = Kokkos::complex<double>(0, -1.4567);
+    h_z(8)  = Kokkos::complex<double>(1.4567, 0.0011);
+    h_z(9)  = Kokkos::complex<double>(1.4567, -0.0011);
+    h_z(10) = Kokkos::complex<double>(-1.4567, 0.0011);
+    h_z(11) = Kokkos::complex<double>(-1.4567, -0.0011);
+    h_z(12) = Kokkos::complex<double>(1.4567, 0.5942);
+    h_z(13) = Kokkos::complex<double>(1.4567, -0.5942);
+    h_z(14) = Kokkos::complex<double>(-1.4567, 0.5942);
+    h_z(15) = Kokkos::complex<double>(-1.4567, -0.5942);
+    h_z(16) = Kokkos::complex<double>(0.0011, 0.5942);
+    h_z(17) = Kokkos::complex<double>(0.0011, -0.5942);
+    h_z(18) = Kokkos::complex<double>(-0.0011, 0.5942);
+    h_z(19) = Kokkos::complex<double>(-0.0011, -0.5942);
+    h_z(20) = Kokkos::complex<double>(0.0011, 0.0051);
+    h_z(21) = Kokkos::complex<double>(0.0011, -0.0051);
+    h_z(22) = Kokkos::complex<double>(-0.0011, 0.0051);
+    h_z(23) = Kokkos::complex<double>(-0.0011, -0.0051);
+    // abs(z)>2.0 and x>1
+    h_z(24) = Kokkos::complex<double>(3.5, 0.0011);
+    h_z(25) = Kokkos::complex<double>(3.5, -0.0011);
+    h_z(26) = Kokkos::complex<double>(-3.5, 0.0011);
+    h_z(27) = Kokkos::complex<double>(-3.5, -0.0011);
+    h_z(28) = Kokkos::complex<double>(3.5, 9.7);
+    h_z(29) = Kokkos::complex<double>(3.5, -9.7);
+    h_z(30) = Kokkos::complex<double>(-3.5, 9.7);
+    h_z(31) = Kokkos::complex<double>(-3.5, -9.7);
+    h_z(32) = Kokkos::complex<double>(18.9, 9.7);
+    h_z(33) = Kokkos::complex<double>(18.9, -9.7);
+    h_z(34) = Kokkos::complex<double>(-18.9, 9.7);
+    h_z(35) = Kokkos::complex<double>(-18.9, -9.7);
+    // abs(z)>2.0 and 0<=x<=1 and abs(y)<6
+    h_z(36) = Kokkos::complex<double>(0.85, 3.5);
+    h_z(37) = Kokkos::complex<double>(0.85, -3.5);
+    h_z(38) = Kokkos::complex<double>(-0.85, 3.5);
+    h_z(39) = Kokkos::complex<double>(-0.85, -3.5);
+    h_z(40) = Kokkos::complex<double>(0.0011, 3.5);
+    h_z(41) = Kokkos::complex<double>(0.0011, -3.5);
+    h_z(42) = Kokkos::complex<double>(-0.0011, 3.5);
+    h_z(43) = Kokkos::complex<double>(-0.0011, -3.5);
+    // abs(z)>2.0 and 0<=x<=1 and abs(y)>=6
+    h_z(44) = Kokkos::complex<double>(0.85, 7.5);
+    h_z(45) = Kokkos::complex<double>(0.85, -7.5);
+    h_z(46) = Kokkos::complex<double>(-0.85, 7.5);
+    h_z(47) = Kokkos::complex<double>(-0.85, -7.5);
+    h_z(48) = Kokkos::complex<double>(0.85, 19.7);
+    h_z(49) = Kokkos::complex<double>(0.85, -19.7);
+    h_z(50) = Kokkos::complex<double>(-0.85, 19.7);
+    h_z(51) = Kokkos::complex<double>(-0.85, -19.7);
+
+    h_x(0) = -infinity<double>::value;
+    h_x(1) = -1.2;
+    h_x(2) = 0.0;
+    h_x(3) = 1.2;
+    h_x(4) = 10.5;
+    h_x(5) = infinity<double>::value;
+
+    Kokkos::deep_copy(d_z, h_z);
+    Kokkos::deep_copy(d_x, h_x);
+
+    // Call erf and erfcx functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 52), *this);
+    Kokkos::fence();
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestRealErfcxTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_erf, d_erf);
+    Kokkos::deep_copy(h_erfcx, d_erfcx);
+    Kokkos::deep_copy(h_erfcx_dbl, d_erfcx_dbl);
+
+    // Reference values computed with Octave
+    h_ref_erf(0) = Kokkos::complex<double>(0.001241216583181022, 0);
+    h_ref_erf(1) = Kokkos::complex<double>(-0.001241216583181022, 0);
+    h_ref_erf(2) = Kokkos::complex<double>(0.9606095744865353, 0);
+    h_ref_erf(3) = Kokkos::complex<double>(-0.9606095744865353, 0);
+    h_ref_erf(4) = Kokkos::complex<double>(0, 0.001241217584429469);
+    h_ref_erf(5) = Kokkos::complex<double>(0, -0.001241217584429469);
+    h_ref_erf(6) = Kokkos::complex<double>(0, 4.149756424218223);
+    h_ref_erf(7) = Kokkos::complex<double>(0, -4.149756424218223);
+    h_ref_erf(8) =
+        Kokkos::complex<double>(0.960609812745064, 0.0001486911741082233);
+    h_ref_erf(9) =
+        Kokkos::complex<double>(0.960609812745064, -0.0001486911741082233);
+    h_ref_erf(10) =
+        Kokkos::complex<double>(-0.960609812745064, 0.0001486911741082233);
+    h_ref_erf(11) =
+        Kokkos::complex<double>(-0.960609812745064, -0.0001486911741082233);
+    h_ref_erf(12) =
+        Kokkos::complex<double>(1.02408827958197, 0.04828570635603527);
+    h_ref_erf(13) =
+        Kokkos::complex<double>(1.02408827958197, -0.04828570635603527);
+    h_ref_erf(14) =
+        Kokkos::complex<double>(-1.02408827958197, 0.04828570635603527);
+    h_ref_erf(15) =
+        Kokkos::complex<double>(-1.02408827958197, -0.04828570635603527);
+    h_ref_erf(16) =
+        Kokkos::complex<double>(0.001766791817179109, 0.7585038120712589);
+    h_ref_erf(17) =
+        Kokkos::complex<double>(0.001766791817179109, -0.7585038120712589);
+    h_ref_erf(18) =
+        Kokkos::complex<double>(-0.001766791817179109, 0.7585038120712589);
+    h_ref_erf(19) =
+        Kokkos::complex<double>(-0.001766791817179109, -0.7585038120712589);
+    h_ref_erf(20) =
+        Kokkos::complex<double>(0.001241248867618165, 0.005754776682713324);
+    h_ref_erf(21) =
+        Kokkos::complex<double>(0.001241248867618165, -0.005754776682713324);
+    h_ref_erf(22) =
+        Kokkos::complex<double>(-0.001241248867618165, 0.005754776682713324);
+    h_ref_erf(23) =
+        Kokkos::complex<double>(-0.001241248867618165, -0.005754776682713324);
+    h_ref_erf(24) =
+        Kokkos::complex<double>(0.9999992569244941, 5.939313159932013e-09);
+    h_ref_erf(25) =
+        Kokkos::complex<double>(0.9999992569244941, -5.939313159932013e-09);
+    h_ref_erf(26) =
+        Kokkos::complex<double>(-0.9999992569244941, 5.939313159932013e-09);
+    h_ref_erf(27) =
+        Kokkos::complex<double>(-0.9999992569244941, -5.939313159932013e-09);
+    h_ref_erf(28) =
+        Kokkos::complex<double>(-1.915595842013002e+34, 1.228821279117683e+32);
+    h_ref_erf(29) =
+        Kokkos::complex<double>(-1.915595842013002e+34, -1.228821279117683e+32);
+    h_ref_erf(30) =
+        Kokkos::complex<double>(1.915595842013002e+34, 1.228821279117683e+32);
+    h_ref_erf(31) =
+        Kokkos::complex<double>(1.915595842013002e+34, -1.228821279117683e+32);
+    h_ref_erf(32) = Kokkos::complex<double>(1, 5.959897539826596e-117);
+    h_ref_erf(33) = Kokkos::complex<double>(1, -5.959897539826596e-117);
+    h_ref_erf(34) = Kokkos::complex<double>(-1, 5.959897539826596e-117);
+    h_ref_erf(35) = Kokkos::complex<double>(-1, -5.959897539826596e-117);
+    h_ref_erf(36) =
+        Kokkos::complex<double>(-9211.077162784413, 13667.93825589455);
+    h_ref_erf(37) =
+        Kokkos::complex<double>(-9211.077162784413, -13667.93825589455);
+    h_ref_erf(38) =
+        Kokkos::complex<double>(9211.077162784413, 13667.93825589455);
+    h_ref_erf(39) =
+        Kokkos::complex<double>(9211.077162784413, -13667.93825589455);
+    h_ref_erf(40) = Kokkos::complex<double>(259.38847811225, 35281.28906479814);
+    h_ref_erf(41) =
+        Kokkos::complex<double>(259.38847811225, -35281.28906479814);
+    h_ref_erf(42) =
+        Kokkos::complex<double>(-259.38847811225, 35281.28906479814);
+    h_ref_erf(43) =
+        Kokkos::complex<double>(-259.38847811225, -35281.28906479814);
+    h_ref_erf(44) =
+        Kokkos::complex<double>(6.752085728270252e+21, 9.809477366939276e+22);
+    h_ref_erf(45) =
+        Kokkos::complex<double>(6.752085728270252e+21, -9.809477366939276e+22);
+    h_ref_erf(46) =
+        Kokkos::complex<double>(-6.752085728270252e+21, 9.809477366939276e+22);
+    h_ref_erf(47) =
+        Kokkos::complex<double>(-6.752085728270252e+21, -9.809477366939276e+22);
+    h_ref_erf(48) =
+        Kokkos::complex<double>(4.37526734926942e+166, -2.16796709605852e+166);
+    h_ref_erf(49) =
+        Kokkos::complex<double>(4.37526734926942e+166, 2.16796709605852e+166);
+    h_ref_erf(50) =
+        Kokkos::complex<double>(-4.37526734926942e+166, -2.16796709605852e+166);
+    h_ref_erf(51) =
+        Kokkos::complex<double>(-4.37526734926942e+166, 2.16796709605852e+166);
+
+    h_ref_erfcx(0) = Kokkos::complex<double>(0.9987599919156778, 0);
+    h_ref_erfcx(1) = Kokkos::complex<double>(1.001242428085786, 0);
+    h_ref_erfcx(2) = Kokkos::complex<double>(0.3288157848563544, 0);
+    h_ref_erfcx(3) = Kokkos::complex<double>(16.36639786516915, 0);
+    h_ref_erfcx(4) =
+        Kokkos::complex<double>(0.999998790000732, -0.001241216082557101);
+    h_ref_erfcx(5) =
+        Kokkos::complex<double>(0.999998790000732, 0.001241216082557101);
+    h_ref_erfcx(6) =
+        Kokkos::complex<double>(0.1197948131677216, -0.4971192955307743);
+    h_ref_erfcx(7) =
+        Kokkos::complex<double>(0.1197948131677216, 0.4971192955307743);
+    h_ref_erfcx(8) =
+        Kokkos::complex<double>(0.3288156873503045, -0.0001874479383970247);
+    h_ref_erfcx(9) =
+        Kokkos::complex<double>(0.3288156873503045, 0.0001874479383970247);
+    h_ref_erfcx(10) =
+        Kokkos::complex<double>(16.36629202874158, -0.05369111060785572);
+    h_ref_erfcx(11) =
+        Kokkos::complex<double>(16.36629202874158, 0.05369111060785572);
+    h_ref_erfcx(12) =
+        Kokkos::complex<double>(0.3020886508118801, -0.09424097887578842);
+    h_ref_erfcx(13) =
+        Kokkos::complex<double>(0.3020886508118801, 0.09424097887578842);
+    h_ref_erfcx(14) =
+        Kokkos::complex<double>(-2.174707722732267, -11.67259764091796);
+    h_ref_erfcx(15) =
+        Kokkos::complex<double>(-2.174707722732267, 11.67259764091796);
+    h_ref_erfcx(16) =
+        Kokkos::complex<double>(0.7019810779371267, -0.5319516793968513);
+    h_ref_erfcx(17) =
+        Kokkos::complex<double>(0.7019810779371267, 0.5319516793968513);
+    h_ref_erfcx(18) =
+        Kokkos::complex<double>(0.7030703366403597, -0.5337884198542978);
+    h_ref_erfcx(19) =
+        Kokkos::complex<double>(0.7030703366403597, 0.5337884198542978);
+    h_ref_erfcx(20) =
+        Kokkos::complex<double>(0.9987340467266177, -0.005743428170378673);
+    h_ref_erfcx(21) =
+        Kokkos::complex<double>(0.9987340467266177, 0.005743428170378673);
+    h_ref_erfcx(22) =
+        Kokkos::complex<double>(1.001216353762532, -0.005765867613873103);
+    h_ref_erfcx(23) =
+        Kokkos::complex<double>(1.001216353762532, 0.005765867613873103);
+    h_ref_erfcx(24) =
+        Kokkos::complex<double>(0.1552936427089241, -4.545593205871305e-05);
+    h_ref_erfcx(25) =
+        Kokkos::complex<double>(0.1552936427089241, 4.545593205871305e-05);
+    h_ref_erfcx(26) =
+        Kokkos::complex<double>(417949.5262869648, -3218.276197742372);
+    h_ref_erfcx(27) =
+        Kokkos::complex<double>(417949.5262869648, 3218.276197742372);
+    h_ref_erfcx(28) =
+        Kokkos::complex<double>(0.01879467905925653, -0.0515934271478583);
+    h_ref_erfcx(29) =
+        Kokkos::complex<double>(0.01879467905925653, 0.0515934271478583);
+    h_ref_erfcx(30) =
+        Kokkos::complex<double>(-0.01879467905925653, -0.0515934271478583);
+    h_ref_erfcx(31) =
+        Kokkos::complex<double>(-0.01879467905925653, 0.0515934271478583);
+    h_ref_erfcx(32) =
+        Kokkos::complex<double>(0.02362328821805, -0.01209735551897239);
+    h_ref_erfcx(33) =
+        Kokkos::complex<double>(0.02362328821805, 0.01209735551897239);
+    h_ref_erfcx(34) = Kokkos::complex<double>(-2.304726099084567e+114,
+                                              -2.942443198107089e+114);
+    h_ref_erfcx(35) = Kokkos::complex<double>(-2.304726099084567e+114,
+                                              2.942443198107089e+114);
+    h_ref_erfcx(36) =
+        Kokkos::complex<double>(0.04174017523145063, -0.1569865319886248);
+    h_ref_erfcx(37) =
+        Kokkos::complex<double>(0.04174017523145063, 0.1569865319886248);
+    h_ref_erfcx(38) =
+        Kokkos::complex<double>(-0.04172154858670504, -0.156980085534407);
+    h_ref_erfcx(39) =
+        Kokkos::complex<double>(-0.04172154858670504, 0.156980085534407);
+    h_ref_erfcx(40) =
+        Kokkos::complex<double>(6.355803055239174e-05, -0.1688298297427782);
+    h_ref_erfcx(41) =
+        Kokkos::complex<double>(6.355803055239174e-05, 0.1688298297427782);
+    h_ref_erfcx(42) =
+        Kokkos::complex<double>(-5.398806789669434e-05, -0.168829903432947);
+    h_ref_erfcx(43) =
+        Kokkos::complex<double>(-5.398806789669434e-05, 0.168829903432947);
+    h_ref_erfcx(44) =
+        Kokkos::complex<double>(0.008645103282302355, -0.07490521021566741);
+    h_ref_erfcx(45) =
+        Kokkos::complex<double>(0.008645103282302355, 0.07490521021566741);
+    h_ref_erfcx(46) =
+        Kokkos::complex<double>(-0.008645103282302355, -0.07490521021566741);
+    h_ref_erfcx(47) =
+        Kokkos::complex<double>(-0.008645103282302355, 0.07490521021566741);
+    h_ref_erfcx(48) =
+        Kokkos::complex<double>(0.001238176693606428, -0.02862247416909219);
+    h_ref_erfcx(49) =
+        Kokkos::complex<double>(0.001238176693606428, 0.02862247416909219);
+    h_ref_erfcx(50) =
+        Kokkos::complex<double>(-0.001238176693606428, -0.02862247416909219);
+    h_ref_erfcx(51) =
+        Kokkos::complex<double>(-0.001238176693606428, 0.02862247416909219);
+
+    h_ref_erfcx_dbl(0) = infinity<double>::value;
+    h_ref_erfcx_dbl(1) = 8.062854217063865e+00;
+    h_ref_erfcx_dbl(2) = 1.0;
+    h_ref_erfcx_dbl(3) = 3.785374169292397e-01;
+    h_ref_erfcx_dbl(4) = 5.349189974656411e-02;
+    h_ref_erfcx_dbl(5) = 0.0;
+
+    for (int i = 0; i < 52; i++) {
+      EXPECT_LE(Kokkos::abs(h_erf(i) - h_ref_erf(i)),
+                Kokkos::abs(h_ref_erf(i)) * 1e-13);
+    }
+
+    for (int i = 0; i < 52; i++) {
+      EXPECT_LE(Kokkos::abs(h_erfcx(i) - h_ref_erfcx(i)),
+                Kokkos::abs(h_ref_erfcx(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_erfcx_dbl(0), h_ref_erfcx_dbl(0));
+    EXPECT_EQ(h_erfcx_dbl(5), h_ref_erfcx_dbl(5));
+    for (int i = 1; i < 5; i++) {
+      EXPECT_LE(std::abs(h_erfcx_dbl(i) - h_ref_erfcx_dbl(i)),
+                std::abs(h_ref_erfcx_dbl(i)) * 1e-13);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_erf(i)   = Kokkos::Experimental::erf(d_z(i));
+    d_erfcx(i) = Kokkos::Experimental::erfcx(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestRealErfcxTag&, const int& /*i*/) const {
+    d_erfcx_dbl(0) = Kokkos::Experimental::erfcx(d_x(0));
+    d_erfcx_dbl(1) = Kokkos::Experimental::erfcx(d_x(1));
+    d_erfcx_dbl(2) = Kokkos::Experimental::erfcx(d_x(2));
+    d_erfcx_dbl(3) = Kokkos::Experimental::erfcx(d_x(3));
+    d_erfcx_dbl(4) = Kokkos::Experimental::erfcx(d_x(4));
+    d_erfcx_dbl(5) = Kokkos::Experimental::erfcx(d_x(5));
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselJ0Y0Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbj0, d_cby0;
+  typename ViewType::HostMirror h_z, h_cbj0, h_cby0;
+  HostViewType h_ref_cbj0, h_ref_cby0;
+
+  ViewType d_z_large, d_cbj0_large, d_cby0_large;
+  typename ViewType::HostMirror h_z_large, h_cbj0_large, h_cby0_large;
+  HostViewType h_ref_cbj0_large, h_ref_cby0_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbj0     = ViewType("d_cbj0", N);
+    d_cby0     = ViewType("d_cby0", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbj0     = Kokkos::create_mirror_view(d_cbj0);
+    h_cby0     = Kokkos::create_mirror_view(d_cby0);
+    h_ref_cbj0 = HostViewType("h_ref_cbj0", N);
+    h_ref_cby0 = HostViewType("h_ref_cby0", N);
+
+    // Generate test inputs
+    h_z(0) = Kokkos::complex<double>(0.0, 0.0);
+    // abs(z)<=25
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    // abs(z)>25
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj0, d_cbj0);
+    Kokkos::deep_copy(h_cby0, d_cby0);
+
+    // Reference values computed with Octave
+    h_ref_cbj0(0) = Kokkos::complex<double>(1.000000000000000e+00, 0);
+    h_ref_cbj0(1) =
+        Kokkos::complex<double>(-1.249234879607422e+00, -9.479837920577351e-01);
+    h_ref_cbj0(2) =
+        Kokkos::complex<double>(-1.249234879607422e+00, +9.479837920577351e-01);
+    h_ref_cbj0(3) =
+        Kokkos::complex<double>(-1.249234879607422e+00, +9.479837920577351e-01);
+    h_ref_cbj0(4) =
+        Kokkos::complex<double>(-1.249234879607422e+00, -9.479837920577351e-01);
+    h_ref_cbj0(5) =
+        Kokkos::complex<double>(-1.602439981218195e+03, +7.230667451989807e+02);
+    h_ref_cbj0(6) =
+        Kokkos::complex<double>(-1.602439981218195e+03, -7.230667451989807e+02);
+    h_ref_cbj0(7) =
+        Kokkos::complex<double>(-1.602439981218195e+03, -7.230667451989807e+02);
+    h_ref_cbj0(8) =
+        Kokkos::complex<double>(-1.602439981218195e+03, +7.230667451989807e+02);
+    h_ref_cbj0(9) = Kokkos::complex<double>(-2.600519549019335e-01, 0);
+    h_ref_cbj0(10) =
+        Kokkos::complex<double>(-2.600519549019335e-01, +9.951051106466461e-18);
+    h_ref_cbj0(11) = Kokkos::complex<double>(-1.624127813134866e-01, 0);
+    h_ref_cbj0(12) =
+        Kokkos::complex<double>(-1.624127813134866e-01, -1.387778780781446e-17);
+    h_ref_cbj0(13) =
+        Kokkos::complex<double>(-1.012912188513958e+03, -1.256239636146142e+03);
+    h_ref_cbj0(14) =
+        Kokkos::complex<double>(-1.012912188513958e+03, +1.256239636146142e+03);
+    h_ref_cbj0(15) =
+        Kokkos::complex<double>(-1.012912188513958e+03, +1.256239636146142e+03);
+    h_ref_cbj0(16) =
+        Kokkos::complex<double>(-1.012912188513958e+03, -1.256239636146142e+03);
+    h_ref_cbj0(17) =
+        Kokkos::complex<double>(-1.040215134669324e+03, -4.338202386810095e+02);
+    h_ref_cbj0(18) =
+        Kokkos::complex<double>(-1.040215134669324e+03, +4.338202386810095e+02);
+    h_ref_cbj0(19) =
+        Kokkos::complex<double>(-1.040215134669324e+03, +4.338202386810095e+02);
+    h_ref_cbj0(20) =
+        Kokkos::complex<double>(-1.040215134669324e+03, -4.338202386810095e+02);
+    h_ref_cbj0(21) = Kokkos::complex<double>(-7.315701054899962e-02, 0);
+    h_ref_cbj0(22) =
+        Kokkos::complex<double>(-7.315701054899962e-02, -6.938893903907228e-18);
+    h_ref_cbj0(23) = Kokkos::complex<double>(-9.147180408906189e-02, 0);
+    h_ref_cbj0(24) =
+        Kokkos::complex<double>(-9.147180408906189e-02, +1.387778780781446e-17);
+
+    h_ref_cby0(0) = Kokkos::complex<double>(-infinity<double>::value, 0);
+    h_ref_cby0(1) =
+        Kokkos::complex<double>(1.000803196554890e+00, -1.231441609303427e+00);
+    h_ref_cby0(2) =
+        Kokkos::complex<double>(1.000803196554890e+00, +1.231441609303427e+00);
+    h_ref_cby0(3) =
+        Kokkos::complex<double>(-8.951643875605797e-01, -1.267028149911417e+00);
+    h_ref_cby0(4) =
+        Kokkos::complex<double>(-8.951643875605797e-01, +1.267028149911417e+00);
+    h_ref_cby0(5) =
+        Kokkos::complex<double>(-7.230667452992603e+02, -1.602439974000479e+03);
+    h_ref_cby0(6) =
+        Kokkos::complex<double>(-7.230667452992603e+02, +1.602439974000479e+03);
+    h_ref_cby0(7) =
+        Kokkos::complex<double>(7.230667450987011e+02, -1.602439988435912e+03);
+    h_ref_cby0(8) =
+        Kokkos::complex<double>(7.230667450987011e+02, +1.602439988435912e+03);
+    h_ref_cby0(9) = Kokkos::complex<double>(3.768500100127903e-01, 0);
+    h_ref_cby0(10) =
+        Kokkos::complex<double>(3.768500100127903e-01, -5.201039098038670e-01);
+    h_ref_cby0(11) = Kokkos::complex<double>(-3.598179027370283e-02, 0);
+    h_ref_cby0(12) =
+        Kokkos::complex<double>(-3.598179027370282e-02, -3.248255626269732e-01);
+    h_ref_cby0(13) =
+        Kokkos::complex<double>(1.256239642409530e+03, -1.012912186329053e+03);
+    h_ref_cby0(14) =
+        Kokkos::complex<double>(1.256239642409530e+03, +1.012912186329053e+03);
+    h_ref_cby0(15) =
+        Kokkos::complex<double>(-1.256239629882755e+03, -1.012912190698863e+03);
+    h_ref_cby0(16) =
+        Kokkos::complex<double>(-1.256239629882755e+03, +1.012912190698863e+03);
+    h_ref_cby0(17) =
+        Kokkos::complex<double>(4.338202411482646e+02, -1.040215130736213e+03);
+    h_ref_cby0(18) =
+        Kokkos::complex<double>(4.338202411482646e+02, +1.040215130736213e+03);
+    h_ref_cby0(19) =
+        Kokkos::complex<double>(-4.338202362137545e+02, -1.040215138602435e+03);
+    h_ref_cby0(20) =
+        Kokkos::complex<double>(-4.338202362137545e+02, +1.040215138602435e+03);
+    h_ref_cby0(21) = Kokkos::complex<double>(1.318364704235323e-01, 0);
+    h_ref_cby0(22) =
+        Kokkos::complex<double>(1.318364704235323e-01, -1.463140210979992e-01);
+    h_ref_cby0(23) = Kokkos::complex<double>(4.735895220944939e-02, 0);
+    h_ref_cby0(24) =
+        Kokkos::complex<double>(4.735895220944938e-02, -1.829436081781237e-01);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbj0(i) - h_ref_cbj0(i)),
+                Kokkos::abs(h_ref_cbj0(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cby0(0), h_cby0(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cby0(i) - h_ref_cby0(i)),
+                Kokkos::abs(h_ref_cby0(i)) * 1e-13);
+    }
+
+    ////Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbj0_large     = ViewType("d_cbj0_large", 6);
+    d_cby0_large     = ViewType("d_cby0_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbj0_large     = Kokkos::create_mirror_view(d_cbj0_large);
+    h_cby0_large     = Kokkos::create_mirror_view(d_cby0_large);
+    h_ref_cbj0_large = HostViewType("h_ref_cbj0_large", 2);
+    h_ref_cby0_large = HostViewType("h_ref_cby0_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(1) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(2) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(3) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(4) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(5) = Kokkos::complex<double>(-10000.0, 100.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj0_large, d_cbj0_large);
+    Kokkos::deep_copy(h_cby0_large, d_cby0_large);
+
+    h_ref_cbj0_large(0) =
+        Kokkos::complex<double>(-9.561811498244175e+40, -4.854995782103029e+40);
+    h_ref_cbj0_large(1) =
+        Kokkos::complex<double>(-9.561811498244175e+40, +4.854995782103029e+40);
+
+    h_ref_cby0_large(0) =
+        Kokkos::complex<double>(4.854995782103029e+40, -9.561811498244175e+40);
+    h_ref_cby0_large(1) =
+        Kokkos::complex<double>(-4.854995782103029e+40, -9.561811498244175e+40);
+
+    EXPECT_TRUE((Kokkos::abs(h_cbj0_large(0) - h_ref_cbj0_large(0)) <
+                 Kokkos::abs(h_ref_cbj0_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cbj0_large(0) - h_ref_cbj0_large(0)) >
+                 Kokkos::abs(h_ref_cbj0_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(1) - h_ref_cbj0_large(0)) >
+                Kokkos::abs(h_ref_cbj0_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(2) - h_ref_cbj0_large(0)) <
+                Kokkos::abs(h_ref_cbj0_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cbj0_large(3) - h_ref_cbj0_large(1)) <
+                 Kokkos::abs(h_ref_cbj0_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cbj0_large(3) - h_ref_cbj0_large(1)) >
+                 Kokkos::abs(h_ref_cbj0_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(4) - h_ref_cbj0_large(1)) >
+                Kokkos::abs(h_ref_cbj0_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(5) - h_ref_cbj0_large(1)) <
+                Kokkos::abs(h_ref_cbj0_large(1)) * 1e-13);
+
+    EXPECT_TRUE((Kokkos::abs(h_cby0_large(0) - h_ref_cby0_large(0)) <
+                 Kokkos::abs(h_ref_cby0_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cby0_large(0) - h_ref_cby0_large(0)) >
+                 Kokkos::abs(h_ref_cby0_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(1) - h_ref_cby0_large(0)) >
+                Kokkos::abs(h_ref_cby0_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(2) - h_ref_cby0_large(0)) <
+                Kokkos::abs(h_ref_cby0_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cby0_large(3) - h_ref_cby0_large(1)) <
+                 Kokkos::abs(h_ref_cby0_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cby0_large(3) - h_ref_cby0_large(1)) >
+                 Kokkos::abs(h_ref_cby0_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(4) - h_ref_cby0_large(1)) >
+                Kokkos::abs(h_ref_cby0_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(5) - h_ref_cby0_large(1)) <
+                Kokkos::abs(h_ref_cby0_large(1)) * 1e-13);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbj0(i) = Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cby0(i) = Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
+    d_cbj0_large(0) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbj0_large(1) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cbj0_large(2) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cbj0_large(3) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbj0_large(4) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cbj0_large(5) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+
+    d_cby0_large(0) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cby0_large(1) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cby0_large(2) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cby0_large(3) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cby0_large(4) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cby0_large(5) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselJ1Y1Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbj1, d_cby1;
+  typename ViewType::HostMirror h_z, h_cbj1, h_cby1;
+  HostViewType h_ref_cbj1, h_ref_cby1;
+
+  ViewType d_z_large, d_cbj1_large, d_cby1_large;
+  typename ViewType::HostMirror h_z_large, h_cbj1_large, h_cby1_large;
+  HostViewType h_ref_cbj1_large, h_ref_cby1_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbj1     = ViewType("d_cbj1", N);
+    d_cby1     = ViewType("d_cby1", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbj1     = Kokkos::create_mirror_view(d_cbj1);
+    h_cby1     = Kokkos::create_mirror_view(d_cby1);
+    h_ref_cbj1 = HostViewType("h_ref_cbj1", N);
+    h_ref_cby1 = HostViewType("h_ref_cby1", N);
+
+    // Generate test inputs
+    h_z(0) = Kokkos::complex<double>(0.0, 0.0);
+    // abs(z)<=25
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    // abs(z)>25
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj1, d_cbj1);
+    Kokkos::deep_copy(h_cby1, d_cby1);
+
+    // Reference values computed with Octave
+    h_ref_cbj1(0) = Kokkos::complex<double>(0, 0);
+    h_ref_cbj1(1) =
+        Kokkos::complex<double>(7.801488485792540e-01, -1.260982060238848e+00);
+    h_ref_cbj1(2) =
+        Kokkos::complex<double>(7.801488485792540e-01, +1.260982060238848e+00);
+    h_ref_cbj1(3) =
+        Kokkos::complex<double>(-7.801488485792543e-01, -1.260982060238848e+00);
+    h_ref_cbj1(4) =
+        Kokkos::complex<double>(-7.801488485792543e-01, +1.260982060238848e+00);
+    h_ref_cbj1(5) =
+        Kokkos::complex<double>(-7.469476253429664e+02, -1.576608505254311e+03);
+    h_ref_cbj1(6) =
+        Kokkos::complex<double>(-7.469476253429664e+02, +1.576608505254311e+03);
+    h_ref_cbj1(7) =
+        Kokkos::complex<double>(7.469476253429661e+02, -1.576608505254311e+03);
+    h_ref_cbj1(8) =
+        Kokkos::complex<double>(7.469476253429661e+02, +1.576608505254311e+03);
+    h_ref_cbj1(9) = Kokkos::complex<double>(3.390589585259365e-01, 0);
+    h_ref_cbj1(10) =
+        Kokkos::complex<double>(-3.390589585259365e-01, +3.373499138396203e-17);
+    h_ref_cbj1(11) = Kokkos::complex<double>(-3.951932188370151e-02, 0);
+    h_ref_cbj1(12) =
+        Kokkos::complex<double>(3.951932188370151e-02, +7.988560221984213e-18);
+    h_ref_cbj1(13) =
+        Kokkos::complex<double>(1.233147100257312e+03, -1.027302265904111e+03);
+    h_ref_cbj1(14) =
+        Kokkos::complex<double>(1.233147100257312e+03, +1.027302265904111e+03);
+    h_ref_cbj1(15) =
+        Kokkos::complex<double>(-1.233147100257312e+03, -1.027302265904111e+03);
+    h_ref_cbj1(16) =
+        Kokkos::complex<double>(-1.233147100257312e+03, +1.027302265904111e+03);
+    h_ref_cbj1(17) =
+        Kokkos::complex<double>(4.248029136732908e+02, -1.042364939115052e+03);
+    h_ref_cbj1(18) =
+        Kokkos::complex<double>(4.248029136732908e+02, +1.042364939115052e+03);
+    h_ref_cbj1(19) =
+        Kokkos::complex<double>(-4.248029136732909e+02, -1.042364939115052e+03);
+    h_ref_cbj1(20) =
+        Kokkos::complex<double>(-4.248029136732909e+02, +1.042364939115052e+03);
+    h_ref_cbj1(21) = Kokkos::complex<double>(1.305514883350938e-01, 0);
+    h_ref_cbj1(22) =
+        Kokkos::complex<double>(-1.305514883350938e-01, +7.993709105806192e-18);
+    h_ref_cbj1(23) = Kokkos::complex<double>(4.659838375816632e-02, 0);
+    h_ref_cbj1(24) =
+        Kokkos::complex<double>(-4.659838375816632e-02, +6.322680793358811e-18);
+
+    h_ref_cby1(0) = Kokkos::complex<double>(-infinity<double>::value, 0);
+    h_ref_cby1(1) =
+        Kokkos::complex<double>(1.285849341463599e+00, +7.250812532419394e-01);
+    h_ref_cby1(2) =
+        Kokkos::complex<double>(1.285849341463599e+00, -7.250812532419394e-01);
+    h_ref_cby1(3) =
+        Kokkos::complex<double>(1.236114779014097e+00, -8.352164439165690e-01);
+    h_ref_cby1(4) =
+        Kokkos::complex<double>(1.236114779014097e+00, +8.352164439165690e-01);
+    h_ref_cby1(5) =
+        Kokkos::complex<double>(1.576608512528508e+03, -7.469476251109801e+02);
+    h_ref_cby1(6) =
+        Kokkos::complex<double>(1.576608512528508e+03, +7.469476251109801e+02);
+    h_ref_cby1(7) =
+        Kokkos::complex<double>(1.576608497980113e+03, +7.469476255749524e+02);
+    h_ref_cby1(8) =
+        Kokkos::complex<double>(1.576608497980113e+03, -7.469476255749524e+02);
+    h_ref_cby1(9) = Kokkos::complex<double>(3.246744247918000e-01, 0);
+    h_ref_cby1(10) =
+        Kokkos::complex<double>(-3.246744247918000e-01, -6.781179170518730e-01);
+    h_ref_cby1(11) = Kokkos::complex<double>(1.616692009926331e-01, 0);
+    h_ref_cby1(12) =
+        Kokkos::complex<double>(-1.616692009926332e-01, +7.903864376740302e-02);
+    h_ref_cby1(13) =
+        Kokkos::complex<double>(1.027302268200224e+03, +1.233147093992241e+03);
+    h_ref_cby1(14) =
+        Kokkos::complex<double>(1.027302268200224e+03, -1.233147093992241e+03);
+    h_ref_cby1(15) =
+        Kokkos::complex<double>(1.027302263607999e+03, -1.233147106522383e+03);
+    h_ref_cby1(16) =
+        Kokkos::complex<double>(1.027302263607999e+03, +1.233147106522383e+03);
+    h_ref_cby1(17) =
+        Kokkos::complex<double>(1.042364943073579e+03, +4.248029112344685e+02);
+    h_ref_cby1(18) =
+        Kokkos::complex<double>(1.042364943073579e+03, -4.248029112344685e+02);
+    h_ref_cby1(19) =
+        Kokkos::complex<double>(1.042364935156525e+03, -4.248029161121132e+02);
+    h_ref_cby1(20) =
+        Kokkos::complex<double>(1.042364935156525e+03, +4.248029161121132e+02);
+    h_ref_cby1(21) = Kokkos::complex<double>(7.552212658226459e-02, 0);
+    h_ref_cby1(22) =
+        Kokkos::complex<double>(-7.552212658226459e-02, -2.611029766701876e-01);
+    h_ref_cby1(23) = Kokkos::complex<double>(9.186960936986688e-02, 0);
+    h_ref_cby1(24) =
+        Kokkos::complex<double>(-9.186960936986688e-02, -9.319676751633262e-02);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbj1(i) - h_ref_cbj1(i)),
+                Kokkos::abs(h_ref_cbj1(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cby1(0), h_cby1(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cby1(i) - h_ref_cby1(i)),
+                Kokkos::abs(h_ref_cby1(i)) * 1e-13);
+    }
+
+    ////Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbj1_large     = ViewType("d_cbj1_large", 6);
+    d_cby1_large     = ViewType("d_cby1_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbj1_large     = Kokkos::create_mirror_view(d_cbj1_large);
+    h_cby1_large     = Kokkos::create_mirror_view(d_cby1_large);
+    h_ref_cbj1_large = HostViewType("h_ref_cbj1_large", 2);
+    h_ref_cby1_large = HostViewType("h_ref_cby1_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(1) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(2) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(3) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(4) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(5) = Kokkos::complex<double>(-10000.0, 100.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj1_large, d_cbj1_large);
+    Kokkos::deep_copy(h_cby1_large, d_cby1_large);
+
+    h_ref_cbj1_large(0) =
+        Kokkos::complex<double>(4.854515317906369e+40, -9.562049455402486e+40);
+    h_ref_cbj1_large(1) =
+        Kokkos::complex<double>(-4.854515317906371e+40, -9.562049455402486e+40);
+
+    h_ref_cby1_large(0) =
+        Kokkos::complex<double>(9.562049455402486e+40, 4.854515317906369e+40);
+    h_ref_cby1_large(1) =
+        Kokkos::complex<double>(9.562049455402486e+40, -4.854515317906369e+40);
+
+    EXPECT_TRUE((Kokkos::abs(h_cbj1_large(0) - h_ref_cbj1_large(0)) <
+                 Kokkos::abs(h_ref_cbj1_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cbj1_large(0) - h_ref_cbj1_large(0)) >
+                 Kokkos::abs(h_ref_cbj1_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(1) - h_ref_cbj1_large(0)) >
+                Kokkos::abs(h_ref_cbj1_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(2) - h_ref_cbj1_large(0)) <
+                Kokkos::abs(h_ref_cbj1_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cbj1_large(3) - h_ref_cbj1_large(1)) <
+                 Kokkos::abs(h_ref_cbj1_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cbj1_large(3) - h_ref_cbj1_large(1)) >
+                 Kokkos::abs(h_ref_cbj1_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(4) - h_ref_cbj1_large(1)) >
+                Kokkos::abs(h_ref_cbj1_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(5) - h_ref_cbj1_large(1)) <
+                Kokkos::abs(h_ref_cbj1_large(1)) * 1e-13);
+
+    EXPECT_TRUE((Kokkos::abs(h_cby1_large(0) - h_ref_cby1_large(0)) <
+                 Kokkos::abs(h_ref_cby1_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cby1_large(0) - h_ref_cby1_large(0)) >
+                 Kokkos::abs(h_ref_cby1_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(1) - h_ref_cby1_large(0)) >
+                Kokkos::abs(h_ref_cby1_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(2) - h_ref_cby1_large(0)) <
+                Kokkos::abs(h_ref_cby1_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cby1_large(3) - h_ref_cby1_large(1)) <
+                 Kokkos::abs(h_ref_cby1_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cby1_large(3) - h_ref_cby1_large(1)) >
+                 Kokkos::abs(h_ref_cby1_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(4) - h_ref_cby1_large(1)) >
+                Kokkos::abs(h_ref_cby1_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(5) - h_ref_cby1_large(1)) <
+                Kokkos::abs(h_ref_cby1_large(1)) * 1e-13);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbj1(i) = Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cby1(i) = Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
+    d_cbj1_large(0) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbj1_large(1) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cbj1_large(2) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cbj1_large(3) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbj1_large(4) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cbj1_large(5) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+
+    d_cby1_large(0) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cby1_large(1) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cby1_large(2) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cby1_large(3) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cby1_large(4) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cby1_large(5) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselI0K0Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbi0, d_cbk0;
+  typename ViewType::HostMirror h_z, h_cbi0, h_cbk0;
+  HostViewType h_ref_cbi0, h_ref_cbk0;
+
+  ViewType d_z_large, d_cbi0_large, d_cbk0_large;
+  typename ViewType::HostMirror h_z_large, h_cbi0_large, h_cbk0_large;
+  HostViewType h_ref_cbi0_large, h_ref_cbk0_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbi0     = ViewType("d_cbi0", N);
+    d_cbk0     = ViewType("d_cbk0", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbi0     = Kokkos::create_mirror_view(d_cbi0);
+    h_cbk0     = Kokkos::create_mirror_view(d_cbk0);
+    h_ref_cbi0 = HostViewType("h_ref_cbi0", N);
+    h_ref_cbk0 = HostViewType("h_ref_cbk0", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi0, d_cbi0);
+    Kokkos::deep_copy(h_cbk0, d_cbk0);
+
+    // Reference values computed with Octave
+    h_ref_cbi0(0) = Kokkos::complex<double>(1.000000000000000e+00, 0);
+    h_ref_cbi0(1) =
+        Kokkos::complex<double>(-4.695171920440706e-01, +4.313788409468920e+00);
+    h_ref_cbi0(2) =
+        Kokkos::complex<double>(-4.695171920440706e-01, -4.313788409468920e+00);
+    h_ref_cbi0(3) =
+        Kokkos::complex<double>(-4.695171920440706e-01, -4.313788409468920e+00);
+    h_ref_cbi0(4) =
+        Kokkos::complex<double>(-4.695171920440706e-01, +4.313788409468920e+00);
+    h_ref_cbi0(5) =
+        Kokkos::complex<double>(-7.276526052028507e+08, -2.806354803468570e+08);
+    h_ref_cbi0(6) =
+        Kokkos::complex<double>(-7.276526052028507e+08, +2.806354803468570e+08);
+    h_ref_cbi0(7) =
+        Kokkos::complex<double>(-7.276526052028507e+08, +2.806354803468570e+08);
+    h_ref_cbi0(8) =
+        Kokkos::complex<double>(-7.276526052028507e+08, -2.806354803468570e+08);
+    h_ref_cbi0(9)  = Kokkos::complex<double>(4.880792585865025e+00, 0);
+    h_ref_cbi0(10) = Kokkos::complex<double>(4.880792585865025e+00, 0);
+    h_ref_cbi0(11) = Kokkos::complex<double>(8.151421225128924e+08, 0);
+    h_ref_cbi0(12) = Kokkos::complex<double>(8.151421225128924e+08, 0);
+    h_ref_cbi0(13) =
+        Kokkos::complex<double>(-9.775983282455373e+10, -4.159160389327644e+10);
+    h_ref_cbi0(14) =
+        Kokkos::complex<double>(-9.775983282455373e+10, +4.159160389327644e+10);
+    h_ref_cbi0(15) =
+        Kokkos::complex<double>(-9.775983282455373e+10, +4.159160389327644e+10);
+    h_ref_cbi0(16) =
+        Kokkos::complex<double>(-9.775983282455373e+10, -4.159160389327644e+10);
+    h_ref_cbi0(17) =
+        Kokkos::complex<double>(-5.158377566681892e+24, -2.766704059464302e+24);
+    h_ref_cbi0(18) =
+        Kokkos::complex<double>(-5.158377566681892e+24, +2.766704059464302e+24);
+    h_ref_cbi0(19) =
+        Kokkos::complex<double>(-5.158377566681892e+24, +2.766704059464302e+24);
+    h_ref_cbi0(20) =
+        Kokkos::complex<double>(-5.158377566681892e+24, -2.766704059464302e+24);
+    h_ref_cbi0(21) = Kokkos::complex<double>(1.095346047317573e+11, 0);
+    h_ref_cbi0(22) = Kokkos::complex<double>(1.095346047317573e+11, 0);
+    h_ref_cbi0(23) = Kokkos::complex<double>(5.894077055609803e+24, 0);
+    h_ref_cbi0(24) = Kokkos::complex<double>(5.894077055609803e+24, 0);
+
+    h_ref_cbk0(0) = Kokkos::complex<double>(infinity<double>::value, 0);
+    h_ref_cbk0(1) =
+        Kokkos::complex<double>(-2.078722558742977e-02, -2.431266356716766e-02);
+    h_ref_cbk0(2) =
+        Kokkos::complex<double>(-2.078722558742977e-02, +2.431266356716766e-02);
+    h_ref_cbk0(3) =
+        Kokkos::complex<double>(-1.357295320191579e+01, +1.499344424826928e+00);
+    h_ref_cbk0(4) =
+        Kokkos::complex<double>(-1.357295320191579e+01, -1.499344424826928e+00);
+    h_ref_cbk0(5) =
+        Kokkos::complex<double>(-1.820476218131465e-11, +1.795056004780177e-11);
+    h_ref_cbk0(6) =
+        Kokkos::complex<double>(-1.820476218131465e-11, -1.795056004780177e-11);
+    h_ref_cbk0(7) =
+        Kokkos::complex<double>(8.816423633943287e+08, +2.285988078870750e+09);
+    h_ref_cbk0(8) =
+        Kokkos::complex<double>(8.816423633943287e+08, -2.285988078870750e+09);
+    h_ref_cbk0(9) = Kokkos::complex<double>(3.473950438627926e-02, 0);
+    h_ref_cbk0(10) =
+        Kokkos::complex<double>(3.473950438627926e-02, -1.533346213144909e+01);
+    h_ref_cbk0(11) = Kokkos::complex<double>(2.667545110351910e-11, 0);
+    h_ref_cbk0(12) =
+        Kokkos::complex<double>(2.667545110351910e-11, -2.560844503718094e+09);
+    h_ref_cbk0(13) =
+        Kokkos::complex<double>(-1.163319528590747e-13, +1.073711234918388e-13);
+    h_ref_cbk0(14) =
+        Kokkos::complex<double>(-1.163319528590747e-13, -1.073711234918388e-13);
+    h_ref_cbk0(15) =
+        Kokkos::complex<double>(1.306638772421339e+11, +3.071215726177843e+11);
+    h_ref_cbk0(16) =
+        Kokkos::complex<double>(1.306638772421339e+11, -3.071215726177843e+11);
+    h_ref_cbk0(17) =
+        Kokkos::complex<double>(-1.111584549467388e-27, +8.581979311477652e-28);
+    h_ref_cbk0(18) =
+        Kokkos::complex<double>(-1.111584549467388e-27, -8.581979311477652e-28);
+    h_ref_cbk0(19) =
+        Kokkos::complex<double>(8.691857147870108e+24, +1.620552106793022e+25);
+    h_ref_cbk0(20) =
+        Kokkos::complex<double>(8.691857147870108e+24, -1.620552106793022e+25);
+    h_ref_cbk0(21) = Kokkos::complex<double>(1.630534586888181e-13, 0);
+    h_ref_cbk0(22) =
+        Kokkos::complex<double>(1.630534586888181e-13, -3.441131095391506e+11);
+    h_ref_cbk0(23) = Kokkos::complex<double>(1.413897840559108e-27, 0);
+    h_ref_cbk0(24) =
+        Kokkos::complex<double>(1.413897840559108e-27, -1.851678917759592e+25);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbi0(i) - h_ref_cbi0(i)),
+                Kokkos::abs(h_ref_cbi0(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cbk0(0), h_cbk0(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbk0(i) - h_ref_cbk0(i)),
+                Kokkos::abs(h_ref_cbk0(i)) * 1e-13);
+    }
+
+    ////Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbi0_large     = ViewType("d_cbi0_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbi0_large     = Kokkos::create_mirror_view(d_cbi0_large);
+    h_ref_cbi0_large = HostViewType("h_ref_cbi0_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(1) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(2) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(3) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(4) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(5) = Kokkos::complex<double>(-100.0, 10.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi0_large, d_cbi0_large);
+
+    h_ref_cbi0_large(0) =
+        Kokkos::complex<double>(-9.266819049505678e+41, -5.370779383266049e+41);
+    h_ref_cbi0_large(1) =
+        Kokkos::complex<double>(-9.266819049505678e+41, +5.370779383266049e+41);
+
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(0) - h_ref_cbi0_large(0)) <
+                Kokkos::abs(h_ref_cbi0_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(1) - h_ref_cbi0_large(0)) >
+                Kokkos::abs(h_ref_cbi0_large(0)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(2) - h_ref_cbi0_large(0)) <
+                Kokkos::abs(h_ref_cbi0_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(3) - h_ref_cbi0_large(1)) <
+                Kokkos::abs(h_ref_cbi0_large(1)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(4) - h_ref_cbi0_large(1)) >
+                Kokkos::abs(h_ref_cbi0_large(1)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(5) - h_ref_cbi0_large(1)) <
+                Kokkos::abs(h_ref_cbi0_large(1)) * 1e-15);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbi0(i) = Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cbk0(i) = Kokkos::Experimental::cyl_bessel_k0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
+    d_cbi0_large(0) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbi0_large(1) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 110, 35);
+    d_cbi0_large(2) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 110, 190);
+    d_cbi0_large(3) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbi0_large(4) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 110, 35);
+    d_cbi0_large(5) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 110, 190);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselI1K1Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbi1, d_cbk1;
+  typename ViewType::HostMirror h_z, h_cbi1, h_cbk1;
+  HostViewType h_ref_cbi1, h_ref_cbk1;
+
+  ViewType d_z_large, d_cbi1_large, d_cbk1_large;
+  typename ViewType::HostMirror h_z_large, h_cbi1_large, h_cbk1_large;
+  HostViewType h_ref_cbi1_large, h_ref_cbk1_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbi1     = ViewType("d_cbi1", N);
+    d_cbk1     = ViewType("d_cbk1", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbi1     = Kokkos::create_mirror_view(d_cbi1);
+    h_cbk1     = Kokkos::create_mirror_view(d_cbk1);
+    h_ref_cbi1 = HostViewType("h_ref_cbi1", N);
+    h_ref_cbk1 = HostViewType("h_ref_cbk1", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi1, d_cbi1);
+    Kokkos::deep_copy(h_cbk1, d_cbk1);
+
+    // Reference values computed with Octave
+    h_ref_cbi1(0) = Kokkos::complex<double>(0, 0);
+    h_ref_cbi1(1) =
+        Kokkos::complex<double>(-8.127809410735776e-01, +3.780682961371298e+00);
+    h_ref_cbi1(2) =
+        Kokkos::complex<double>(-8.127809410735776e-01, -3.780682961371298e+00);
+    h_ref_cbi1(3) =
+        Kokkos::complex<double>(8.127809410735776e-01, +3.780682961371298e+00);
+    h_ref_cbi1(4) =
+        Kokkos::complex<double>(8.127809410735776e-01, -3.780682961371298e+00);
+    h_ref_cbi1(5) =
+        Kokkos::complex<double>(-7.119745937677552e+08, -2.813616375214342e+08);
+    h_ref_cbi1(6) =
+        Kokkos::complex<double>(-7.119745937677552e+08, +2.813616375214342e+08);
+    h_ref_cbi1(7) =
+        Kokkos::complex<double>(7.119745937677552e+08, -2.813616375214342e+08);
+    h_ref_cbi1(8) =
+        Kokkos::complex<double>(7.119745937677552e+08, +2.813616375214342e+08);
+    h_ref_cbi1(9)  = Kokkos::complex<double>(3.953370217402609e+00, 0);
+    h_ref_cbi1(10) = Kokkos::complex<double>(-3.953370217402609e+00, 0);
+    h_ref_cbi1(11) = Kokkos::complex<double>(7.972200260896506e+08, 0);
+    h_ref_cbi1(12) = Kokkos::complex<double>(-7.972200260896506e+08, 0);
+    h_ref_cbi1(13) =
+        Kokkos::complex<double>(-9.596150723281404e+10, -4.149038020045121e+10);
+    h_ref_cbi1(14) =
+        Kokkos::complex<double>(-9.596150723281404e+10, +4.149038020045121e+10);
+    h_ref_cbi1(15) =
+        Kokkos::complex<double>(9.596150723281404e+10, -4.149038020045121e+10);
+    h_ref_cbi1(16) =
+        Kokkos::complex<double>(9.596150723281404e+10, +4.149038020045121e+10);
+    h_ref_cbi1(17) =
+        Kokkos::complex<double>(-5.112615594220387e+24, -2.751210232069100e+24);
+    h_ref_cbi1(18) =
+        Kokkos::complex<double>(-5.112615594220387e+24, +2.751210232069100e+24);
+    h_ref_cbi1(19) =
+        Kokkos::complex<double>(5.112615594220387e+24, -2.751210232069100e+24);
+    h_ref_cbi1(20) =
+        Kokkos::complex<double>(5.112615594220387e+24, +2.751210232069100e+24);
+    h_ref_cbi1(21) = Kokkos::complex<double>(1.075605042080823e+11, 0);
+    h_ref_cbi1(22) = Kokkos::complex<double>(-1.075605042080823e+11, 0);
+    h_ref_cbi1(23) = Kokkos::complex<double>(5.844751588390470e+24, 0);
+    h_ref_cbi1(24) = Kokkos::complex<double>(-5.844751588390470e+24, 0);
+
+    h_ref_cbk1(0) = Kokkos::complex<double>(infinity<double>::value, 0);
+    h_ref_cbk1(1) =
+        Kokkos::complex<double>(-2.480952007015153e-02, -2.557074905635180e-02);
+    h_ref_cbk1(2) =
+        Kokkos::complex<double>(-2.480952007015153e-02, +2.557074905635180e-02);
+    h_ref_cbk1(3) =
+        Kokkos::complex<double>(-1.185255629692602e+01, +2.527855884398198e+00);
+    h_ref_cbk1(4) =
+        Kokkos::complex<double>(-1.185255629692602e+01, -2.527855884398198e+00);
+    h_ref_cbk1(5) =
+        Kokkos::complex<double>(-1.839497240093994e-11, +1.841855854336314e-11);
+    h_ref_cbk1(6) =
+        Kokkos::complex<double>(-1.839497240093994e-11, -1.841855854336314e-11);
+    h_ref_cbk1(7) =
+        Kokkos::complex<double>(8.839236534393319e+08, +2.236734153323357e+09);
+    h_ref_cbk1(8) =
+        Kokkos::complex<double>(8.839236534393319e+08, -2.236734153323357e+09);
+    h_ref_cbk1(9) = Kokkos::complex<double>(4.015643112819419e-02, 0);
+    h_ref_cbk1(10) =
+        Kokkos::complex<double>(-4.015643112819419e-02, -1.241987883191272e+01);
+    h_ref_cbk1(11) = Kokkos::complex<double>(2.724930589574976e-11, 0);
+    h_ref_cbk1(12) =
+        Kokkos::complex<double>(-2.724930589574976e-11, -2.504540577257910e+09);
+    h_ref_cbk1(13) =
+        Kokkos::complex<double>(-1.175637676331817e-13, +1.097080943197297e-13);
+    h_ref_cbk1(14) =
+        Kokkos::complex<double>(-1.175637676331817e-13, -1.097080943197297e-13);
+    h_ref_cbk1(15) =
+        Kokkos::complex<double>(1.303458736323849e+11, +3.014719661500124e+11);
+    h_ref_cbk1(16) =
+        Kokkos::complex<double>(1.303458736323849e+11, -3.014719661500124e+11);
+    h_ref_cbk1(17) =
+        Kokkos::complex<double>(-1.119411861396158e-27, +8.666195226392352e-28);
+    h_ref_cbk1(18) =
+        Kokkos::complex<double>(-1.119411861396158e-27, -8.666195226392352e-28);
+    h_ref_cbk1(19) =
+        Kokkos::complex<double>(8.643181853549355e+24, +1.606175559143138e+25);
+    h_ref_cbk1(20) =
+        Kokkos::complex<double>(8.643181853549355e+24, -1.606175559143138e+25);
+    h_ref_cbk1(21) = Kokkos::complex<double>(1.659400107332009e-13, 0);
+    h_ref_cbk1(22) =
+        Kokkos::complex<double>(-1.659400107332009e-13, -3.379112898365253e+11);
+    h_ref_cbk1(23) = Kokkos::complex<double>(1.425632026517104e-27, 0);
+    h_ref_cbk1(24) =
+        Kokkos::complex<double>(-1.425632026517104e-27, -1.836182865214478e+25);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbi1(i) - h_ref_cbi1(i)),
+                Kokkos::abs(h_ref_cbi1(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cbk1(0), h_cbk1(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbk1(i) - h_ref_cbk1(i)),
+                Kokkos::abs(h_ref_cbk1(i)) * 1e-13);
+    }
+
+    ////Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbi1_large     = ViewType("d_cbi1_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbi1_large     = Kokkos::create_mirror_view(d_cbi1_large);
+    h_ref_cbi1_large = HostViewType("h_ref_cbi1_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(1) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(2) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(3) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(4) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(5) = Kokkos::complex<double>(-100.0, 10.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi1_large, d_cbi1_large);
+
+    h_ref_cbi1_large(0) =
+        Kokkos::complex<double>(-9.218158020154234e+41, -5.348736158968607e+41);
+    h_ref_cbi1_large(1) =
+        Kokkos::complex<double>(9.218158020154234e+41, -5.348736158968607e+41);
+
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(0) - h_ref_cbi1_large(0)) <
+                Kokkos::abs(h_ref_cbi1_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(1) - h_ref_cbi1_large(0)) >
+                Kokkos::abs(h_ref_cbi1_large(0)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(2) - h_ref_cbi1_large(0)) <
+                Kokkos::abs(h_ref_cbi1_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(3) - h_ref_cbi1_large(1)) <
+                Kokkos::abs(h_ref_cbi1_large(1)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(4) - h_ref_cbi1_large(1)) >
+                Kokkos::abs(h_ref_cbi1_large(1)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(5) - h_ref_cbi1_large(1)) <
+                Kokkos::abs(h_ref_cbi1_large(1)) * 1e-15);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbi1(i) = Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cbk1(i) = Kokkos::Experimental::cyl_bessel_k1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
+    d_cbi1_large(0) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbi1_large(1) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 110, 35);
+    d_cbi1_large(2) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 110, 190);
+    d_cbi1_large(3) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbi1_large(4) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 110, 35);
+    d_cbi1_large(5) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 110, 190);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselH1Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_ch10, d_ch11;
+  typename ViewType::HostMirror h_z, h_ch10, h_ch11;
+  HostViewType h_ref_ch10, h_ref_ch11;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_ch10     = ViewType("d_ch10", N);
+    d_ch11     = ViewType("d_ch11", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_ch10     = Kokkos::create_mirror_view(d_ch10);
+    h_ch11     = Kokkos::create_mirror_view(d_ch11);
+    h_ref_ch10 = HostViewType("h_ref_ch10", N);
+    h_ref_ch11 = HostViewType("h_ref_ch11", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(200.0, 60.0);
+    h_z(18) = Kokkos::complex<double>(200.0, -60.0);
+    h_z(19) = Kokkos::complex<double>(-200.0, 60.0);
+    h_z(20) = Kokkos::complex<double>(-200.0, -60.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(200.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-200.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Hankel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_ch10, d_ch10);
+    Kokkos::deep_copy(h_ch11, d_ch11);
+
+    // Reference values computed with Octave
+    h_ref_ch10(0) = Kokkos::complex<double>(1.0, -infinity<double>::value);
+    h_ref_ch10(1) =
+        Kokkos::complex<double>(-1.779327030399459e-02, +5.281940449715537e-02);
+    h_ref_ch10(2) =
+        Kokkos::complex<double>(-2.480676488910849e+00, +1.948786988612626e+00);
+    h_ref_ch10(3) =
+        Kokkos::complex<double>(1.779327030399459e-02, +5.281940449715537e-02);
+    h_ref_ch10(4) =
+        Kokkos::complex<double>(-2.516263029518839e+00, -1.843148179618315e+00);
+    h_ref_ch10(5) =
+        Kokkos::complex<double>(-7.217716938222564e-06, -1.002796203581228e-07);
+    h_ref_ch10(6) =
+        Kokkos::complex<double>(-3.204879955218674e+03, -1.446133490498241e+03);
+    h_ref_ch10(7) =
+        Kokkos::complex<double>(7.217716938222564e-06, -1.002796203581228e-07);
+    h_ref_ch10(8) =
+        Kokkos::complex<double>(-3.204879969654108e+03, +1.446133490297682e+03);
+    h_ref_ch10(9) =
+        Kokkos::complex<double>(-2.600519549019334e-01, +3.768500100127903e-01);
+    h_ref_ch10(10) =
+        Kokkos::complex<double>(2.600519549019334e-01, +3.768500100127903e-01);
+    h_ref_ch10(11) =
+        Kokkos::complex<double>(-1.624127813134865e-01, -3.598179027370283e-02);
+    h_ref_ch10(12) =
+        Kokkos::complex<double>(1.624127813134865e-01, -3.598179027370283e-02);
+    h_ref_ch10(13) =
+        Kokkos::complex<double>(-2.184905481759440e-06, +6.263387166445335e-06);
+    h_ref_ch10(14) =
+        Kokkos::complex<double>(-2.025824374843011e+03, +2.512479278555672e+03);
+    h_ref_ch10(15) =
+        Kokkos::complex<double>(2.184905481759440e-06, +6.263387166445335e-06);
+    h_ref_ch10(16) =
+        Kokkos::complex<double>(-2.025824379212821e+03, -2.512479266028897e+03);
+    h_ref_ch10(17) =
+        Kokkos::complex<double>(-1.983689762743337e-28, -4.408449940359881e-28);
+    h_ref_ch10(18) =
+        Kokkos::complex<double>(-8.261945332108929e+23, -6.252486138159269e+24);
+    h_ref_ch10(19) =
+        Kokkos::complex<double>(1.983689762743337e-28, -4.408449940359881e-28);
+    h_ref_ch10(20) =
+        Kokkos::complex<double>(-8.261945332108929e+23, +6.252486138159269e+24);
+    h_ref_ch10(21) =
+        Kokkos::complex<double>(-7.315701054899959e-02, +1.318364704235323e-01);
+    h_ref_ch10(22) =
+        Kokkos::complex<double>(7.315701054899959e-02, +1.318364704235323e-01);
+    h_ref_ch10(23) =
+        Kokkos::complex<double>(-1.543743993056510e-02, -5.426577524981793e-02);
+    h_ref_ch10(24) =
+        Kokkos::complex<double>(1.543743993056510e-02, -5.426577524981793e-02);
+
+    h_ref_ch11(0) = Kokkos::complex<double>(0.0, -infinity<double>::value);
+    h_ref_ch11(1) =
+        Kokkos::complex<double>(5.506759533731469e-02, +2.486728122475093e-02);
+    h_ref_ch11(2) =
+        Kokkos::complex<double>(1.505230101821194e+00, +2.546831401702448e+00);
+    h_ref_ch11(3) =
+        Kokkos::complex<double>(5.506759533731469e-02, -2.486728122475093e-02);
+    h_ref_ch11(4) =
+        Kokkos::complex<double>(-1.615365292495823e+00, +2.497096839252946e+00);
+    h_ref_ch11(5) =
+        Kokkos::complex<double>(-2.319863729607219e-07, +7.274197719836158e-06);
+    h_ref_ch11(6) =
+        Kokkos::complex<double>(-1.493895250453947e+03, +3.153217017782819e+03);
+    h_ref_ch11(7) =
+        Kokkos::complex<double>(-2.319863729607210e-07, -7.274197719836158e-06);
+    h_ref_ch11(8) =
+        Kokkos::complex<double>(1.493895250917918e+03, +3.153217003234423e+03);
+    h_ref_ch11(9) =
+        Kokkos::complex<double>(3.390589585259364e-01, +3.246744247918000e-01);
+    h_ref_ch11(10) =
+        Kokkos::complex<double>(3.390589585259364e-01, -3.246744247918000e-01);
+    h_ref_ch11(11) =
+        Kokkos::complex<double>(-3.951932188370152e-02, +1.616692009926331e-01);
+    h_ref_ch11(12) =
+        Kokkos::complex<double>(-3.951932188370151e-02, -1.616692009926331e-01);
+    h_ref_ch11(13) =
+        Kokkos::complex<double>(6.265071091331731e-06, +2.296112637347948e-06);
+    h_ref_ch11(14) =
+        Kokkos::complex<double>(2.466294194249553e+03, +2.054604534104335e+03);
+    h_ref_ch11(15) =
+        Kokkos::complex<double>(6.265071091331731e-06, -2.296112637347947e-06);
+    h_ref_ch11(16) =
+        Kokkos::complex<double>(-2.466294206779695e+03, +2.054604529512110e+03);
+    h_ref_ch11(17) =
+        Kokkos::complex<double>(-4.416040381930448e-28, +1.974955285825768e-28);
+    h_ref_ch11(18) =
+        Kokkos::complex<double>(-6.250095237987940e+24, +8.112776606830997e+23);
+    h_ref_ch11(19) =
+        Kokkos::complex<double>(-4.416040381930448e-28, -1.974955285825769e-28);
+    h_ref_ch11(20) =
+        Kokkos::complex<double>(6.250095237987940e+24, +8.112776606831005e+23);
+    h_ref_ch11(21) =
+        Kokkos::complex<double>(1.305514883350938e-01, +7.552212658226459e-02);
+    h_ref_ch11(22) =
+        Kokkos::complex<double>(1.305514883350938e-01, -7.552212658226456e-02);
+    h_ref_ch11(23) =
+        Kokkos::complex<double>(-5.430453818237824e-02, +1.530182458038999e-02);
+    h_ref_ch11(24) =
+        Kokkos::complex<double>(-5.430453818237824e-02, -1.530182458039000e-02);
+
+    EXPECT_EQ(h_ref_ch10(0), h_ch10(0));
+    std::cout << "h_ch10(0): " << h_ch10(0)
+              << ", h_ref_ch10(0): " << h_ref_ch10(0) << std::endl;
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)),
+                Kokkos::abs(h_ref_ch10(i)) * 1e-13);
+      std::cout << i
+                << ", actual diff: " << Kokkos::abs(h_ch10(i) - h_ref_ch10(i))
+                << ", expected diff: " << Kokkos::abs(h_ref_ch10(i)) * 1e-13
+                << std::endl;
+    }
+
+    EXPECT_EQ(h_ref_ch11(0), h_ch11(0));
+    std::cout << "h_ch11(0): " << h_ch11(0)
+              << ", h_ref_ch11(0): " << h_ref_ch11(0) << std::endl;
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)),
+                Kokkos::abs(h_ref_ch11(i)) * 1e-13);
+      std::cout << i
+                << ", actual diff: " << Kokkos::abs(h_ch11(i) - h_ref_ch11(i))
+                << ", expected diff: " << Kokkos::abs(h_ref_ch11(i)) * 1e-13
+                << std::endl;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_ch10(i) = Kokkos::Experimental::cyl_bessel_h10(d_z(i));
+    d_ch11(i) = Kokkos::Experimental::cyl_bessel_h11(d_z(i));
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselH2Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_ch20, d_ch21;
+  typename ViewType::HostMirror h_z, h_ch20, h_ch21;
+  HostViewType h_ref_ch20, h_ref_ch21;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_ch20     = ViewType("d_ch20", N);
+    d_ch21     = ViewType("d_ch21", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_ch20     = Kokkos::create_mirror_view(d_ch20);
+    h_ch21     = Kokkos::create_mirror_view(d_ch21);
+    h_ref_ch20 = HostViewType("h_ref_ch20", N);
+    h_ref_ch21 = HostViewType("h_ref_ch21", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(200.0, 60.0);
+    h_z(18) = Kokkos::complex<double>(200.0, -60.0);
+    h_z(19) = Kokkos::complex<double>(-200.0, 60.0);
+    h_z(20) = Kokkos::complex<double>(-200.0, -60.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(200.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-200.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Hankel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_ch20, d_ch20);
+    Kokkos::deep_copy(h_ch21, d_ch21);
+
+    // Reference values computed with Octave
+    h_ref_ch20(0) = Kokkos::complex<double>(1.0, infinity<double>::value);
+    h_ref_ch20(1) =
+        Kokkos::complex<double>(-2.480676488910849e+00, -1.948786988612626e+00);
+    h_ref_ch20(2) =
+        Kokkos::complex<double>(-1.779327030399459e-02, -5.281940449715537e-02);
+    h_ref_ch20(3) =
+        Kokkos::complex<double>(-2.516263029518839e+00, +1.843148179618315e+00);
+    h_ref_ch20(4) =
+        Kokkos::complex<double>(1.779327030399459e-02, -5.281940449715537e-02);
+    h_ref_ch20(5) =
+        Kokkos::complex<double>(-3.204879955218674e+03, +1.446133490498241e+03);
+    h_ref_ch20(6) =
+        Kokkos::complex<double>(-7.217716938222564e-06, +1.002796203581228e-07);
+    h_ref_ch20(7) =
+        Kokkos::complex<double>(-3.204879969654108e+03, -1.446133490297682e+03);
+    h_ref_ch20(8) =
+        Kokkos::complex<double>(7.217716938222564e-06, +1.002796203581228e-07);
+    h_ref_ch20(9) =
+        Kokkos::complex<double>(-2.600519549019334e-01, -3.768500100127903e-01);
+    h_ref_ch20(10) =
+        Kokkos::complex<double>(-7.801558647058006e-01, -3.768500100127903e-01);
+    h_ref_ch20(11) =
+        Kokkos::complex<double>(-1.624127813134865e-01, +3.598179027370283e-02);
+    h_ref_ch20(12) =
+        Kokkos::complex<double>(-4.872383439404597e-01, +3.598179027370281e-02);
+    h_ref_ch20(13) =
+        Kokkos::complex<double>(-2.025824374843011e+03, -2.512479278555672e+03);
+    h_ref_ch20(14) =
+        Kokkos::complex<double>(-2.184905481759440e-06, -6.263387166445335e-06);
+    h_ref_ch20(15) =
+        Kokkos::complex<double>(-2.025824379212821e+03, +2.512479266028897e+03);
+    h_ref_ch20(16) =
+        Kokkos::complex<double>(2.184905481759440e-06, -6.263387166445335e-06);
+    h_ref_ch20(17) =
+        Kokkos::complex<double>(-8.261945332108929e+23, +6.252486138159269e+24);
+    h_ref_ch20(18) =
+        Kokkos::complex<double>(-1.983689762743337e-28, +4.408449940359881e-28);
+    h_ref_ch20(19) =
+        Kokkos::complex<double>(-8.261945332108929e+23, -6.252486138159269e+24);
+    h_ref_ch20(20) =
+        Kokkos::complex<double>(1.983689762743337e-28, +4.408449940359881e-28);
+    h_ref_ch20(21) =
+        Kokkos::complex<double>(-7.315701054899959e-02, -1.318364704235323e-01);
+    h_ref_ch20(22) =
+        Kokkos::complex<double>(-2.194710316469988e-01, -1.318364704235323e-01);
+    h_ref_ch20(23) =
+        Kokkos::complex<double>(-1.543743993056510e-02, +5.426577524981793e-02);
+    h_ref_ch20(24) =
+        Kokkos::complex<double>(-4.631231979169528e-02, +5.426577524981793e-02);
+
+    h_ref_ch21(0) = Kokkos::complex<double>(0.0, infinity<double>::value);
+    h_ref_ch21(1) =
+        Kokkos::complex<double>(1.505230101821194e+00, -2.546831401702448e+00);
+    h_ref_ch21(2) =
+        Kokkos::complex<double>(5.506759533731469e-02, -2.486728122475093e-02);
+    h_ref_ch21(3) =
+        Kokkos::complex<double>(-1.615365292495823e+00, -2.497096839252946e+00);
+    h_ref_ch21(4) =
+        Kokkos::complex<double>(5.506759533731469e-02, +2.486728122475093e-02);
+    h_ref_ch21(5) =
+        Kokkos::complex<double>(-1.493895250453947e+03, -3.153217017782819e+03);
+    h_ref_ch21(6) =
+        Kokkos::complex<double>(-2.319863729607219e-07, -7.274197719836158e-06);
+    h_ref_ch21(7) =
+        Kokkos::complex<double>(1.493895250917918e+03, -3.153217003234423e+03);
+    h_ref_ch21(8) =
+        Kokkos::complex<double>(-2.319863729607210e-07, +7.274197719836158e-06);
+    h_ref_ch21(9) =
+        Kokkos::complex<double>(3.390589585259364e-01, -3.246744247918000e-01);
+    h_ref_ch21(10) =
+        Kokkos::complex<double>(-1.017176875577809e+00, +3.246744247918000e-01);
+    h_ref_ch21(11) =
+        Kokkos::complex<double>(-3.951932188370152e-02, -1.616692009926331e-01);
+    h_ref_ch21(12) =
+        Kokkos::complex<double>(1.185579656511045e-01, +1.616692009926332e-01);
+    h_ref_ch21(13) =
+        Kokkos::complex<double>(2.466294194249553e+03, -2.054604534104335e+03);
+    h_ref_ch21(14) =
+        Kokkos::complex<double>(6.265071091331731e-06, -2.296112637347948e-06);
+    h_ref_ch21(15) =
+        Kokkos::complex<double>(-2.466294206779695e+03, -2.054604529512110e+03);
+    h_ref_ch21(16) =
+        Kokkos::complex<double>(6.265071091331731e-06, +2.296112637347947e-06);
+    h_ref_ch21(17) =
+        Kokkos::complex<double>(-6.250095237987940e+24, -8.112776606830997e+23);
+    h_ref_ch21(18) =
+        Kokkos::complex<double>(-4.416040381930448e-28, -1.974955285825768e-28);
+    h_ref_ch21(19) =
+        Kokkos::complex<double>(6.250095237987940e+24, -8.112776606831005e+23);
+    h_ref_ch21(20) =
+        Kokkos::complex<double>(-4.416040381930448e-28, +1.974955285825769e-28);
+    h_ref_ch21(21) =
+        Kokkos::complex<double>(1.305514883350938e-01, -7.552212658226459e-02);
+    h_ref_ch21(22) =
+        Kokkos::complex<double>(-3.916544650052814e-01, +7.552212658226461e-02);
+    h_ref_ch21(23) =
+        Kokkos::complex<double>(-5.430453818237824e-02, -1.530182458038999e-02);
+    h_ref_ch21(24) =
+        Kokkos::complex<double>(1.629136145471347e-01, +1.530182458039000e-02);
+
+    EXPECT_EQ(h_ref_ch20(0), h_ch20(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch20(i) - h_ref_ch20(i)),
+                Kokkos::abs(h_ref_ch20(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_ch21(0), h_ch21(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch21(i) - h_ref_ch21(i)),
+                Kokkos::abs(h_ref_ch21(i)) * 1e-13);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_ch20(i) = Kokkos::Experimental::cyl_bessel_h20(d_z(i));
+    d_ch21(i) = Kokkos::Experimental::cyl_bessel_h21(d_z(i));
+  }
+};
+
+TEST(TEST_CATEGORY, mathspecialfunc_expint1) {
+  TestExponentialIntergral1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_errorfunc) {
+  TestComplexErrorFunction<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselj0y0) {
+  TestComplexBesselJ0Y0Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselj1y1) {
+  TestComplexBesselJ1Y1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesseli0k0) {
+  TestComplexBesselI0K0Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesseli1k1) {
+  TestComplexBesselI1K1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselh1stkind) {
+  TestComplexBesselH1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselh2ndkind) {
+  TestComplexBesselH2Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+}  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestMemoryPool.hpp b/lib/kokkos/core/unit_test/TestMemoryPool.hpp
index 63895ad47d..829e8d641a 100644
--- a/lib/kokkos/core/unit_test/TestMemoryPool.hpp
+++ b/lib/kokkos/core/unit_test/TestMemoryPool.hpp
@@ -50,7 +50,7 @@
 #include <cmath>
 #include <algorithm>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 namespace TestMemoryPool {
 
diff --git a/lib/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp b/lib/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
index 6c8a47a586..d7607c4f71 100644
--- a/lib/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
+++ b/lib/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
@@ -48,7 +48,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <iostream>
 #include <cstdlib>
 #include <cstdint>
@@ -310,6 +310,46 @@ struct array_reduce {
     return lsum;
   }
 };
+
+struct point_t {
+  uint8_t x, y, z;
+
+  KOKKOS_FUNCTION
+  point_t() : x(1), y(1), z(1){};
+
+  KOKKOS_FUNCTION
+  point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){};
+
+  KOKKOS_FUNCTION
+  point_t(const volatile point_t &val) : x(val.x), y(val.y), z(val.z){};
+
+  KOKKOS_FUNCTION
+  point_t(const int rhs) { x = y = z = static_cast<uint8_t>(rhs); }
+
+  KOKKOS_FUNCTION
+  explicit operator int() const { return static_cast<int>(x + y + z); }
+
+  KOKKOS_FUNCTION
+  bool operator==(const volatile point_t rhs) const volatile {
+    return (x == rhs.x && y == rhs.y && z == rhs.z);
+  }
+
+  KOKKOS_FUNCTION
+  void operator=(point_t rhs) volatile {
+    x = rhs.x;
+    y = rhs.y;
+    z = rhs.z;
+  }
+
+  KOKKOS_FUNCTION
+  volatile point_t operator+=(const volatile point_t rhs) volatile {
+    x += rhs.x;
+    y += rhs.y;
+    z += rhs.z;
+    return *this;
+  }
+};
+
 }  // namespace Test
 
 namespace Kokkos {
@@ -334,5 +374,21 @@ struct reduction_identity<Test::array_reduce<scalar_t, N>> {
     return Test::array_reduce<scalar_t, N>(t_red_ident::prod());
   }
 };
+
+template <>
+struct reduction_identity<Test::point_t> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t sum() noexcept {
+    return 0;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t prod() noexcept {
+    return 1;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t max() noexcept {
+    return 0xff;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t min() noexcept {
+    return 0x0;
+  }
+};
 }  // namespace Kokkos
 #endif  // TESTNONTRIVIALSCALARTYPES_HPP_
diff --git a/lib/kokkos/core/unit_test/TestNumericTraits.hpp b/lib/kokkos/core/unit_test/TestNumericTraits.hpp
index fe01b83834..cb69cb8321 100644
--- a/lib/kokkos/core/unit_test/TestNumericTraits.hpp
+++ b/lib/kokkos/core/unit_test/TestNumericTraits.hpp
@@ -46,6 +46,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <type_traits>
+#include <limits>
 #include "Kokkos_NumericTraits.hpp"
 #include "Kokkos_ExecPolicy.hpp"
 
@@ -198,7 +199,9 @@ struct TestNumericTraits<
 TEST(TEST_CATEGORY, numeric_traits_infinity) {
   TestNumericTraits<TEST_EXECSPACE, float, Infinity>();
   TestNumericTraits<TEST_EXECSPACE, double, Infinity>();
+#ifndef KOKKOS_COMPILER_IBM  // fails with XL 16.1.1 see issue #4100
   TestNumericTraits<TEST_EXECSPACE, long double, Infinity>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_epsilon) {
@@ -334,3 +337,182 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) {
   TestNumericTraits<TEST_EXECSPACE, long double, MinExponent10>();
   TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent10>();
 }
+
+namespace NumericTraitsSFINAE {
+
+struct HasNoSpecialization {};
+
+#define CHECK_TRAIT_IS_SFINAE_FRIENDLY(TRAIT)                              \
+  template <class T>                                                       \
+  using TRAIT##_value_t = decltype(Kokkos::Experimental::TRAIT<T>::value); \
+  template <class T>                                                       \
+  using has_##TRAIT = Kokkos::is_detected<TRAIT##_value_t, T>;             \
+  static_assert(!has_##TRAIT<HasNoSpecialization>::value, "");
+
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(infinity)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_min)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_max)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(epsilon)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(round_error)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(norm_min)
+
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(digits)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(digits10)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(max_digits10)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(radix)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(min_exponent)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(min_exponent10)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(max_exponent)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(max_exponent10)
+
+}  // namespace NumericTraitsSFINAE
+
+// Example detecting presence or absence of values
+template <class T>
+using infinity_value_t = decltype(Kokkos::Experimental::infinity<T>::value);
+
+template <class T>
+using has_infinity = Kokkos::is_detected<infinity_value_t, T>;
+
+template <class T, std::enable_if_t<has_infinity<T>::value>* = nullptr>
+constexpr T legacy_std_numeric_limits_infinity() {
+  return Kokkos::Experimental::infinity<T>::value;
+}
+
+template <class T, std::enable_if_t<!has_infinity<T>::value>* = nullptr>
+constexpr T legacy_std_numeric_limits_infinity() {
+  return T();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_sfinae_friendly) {
+  ASSERT_EQ(legacy_std_numeric_limits_infinity<int>(), 0);
+}
+
+// Compare to std::numeric_limits
+template <int V1, int V2>
+struct AssertIntEquality {
+  static constexpr bool value = false;
+};
+template <int V>
+struct AssertIntEquality<V, V> {
+  static constexpr bool value = true;
+};
+#define CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(T, TRAIT)           \
+  static_assert(AssertIntEquality<Kokkos::Experimental::TRAIT<T>::value, \
+                                  std::numeric_limits<T>::TRAIT>::value, \
+                "")
+#define CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \
+  static_assert(Kokkos::Experimental::TRAIT<T>::value ==       \
+                    std::numeric_limits<T>::TRAIT(),           \
+                "")
+
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, infinity);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, infinity);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, infinity);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, epsilon);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, epsilon);
+#ifndef KOKKOS_COMPILER_IBM  // fails with XL 16.1.1
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, epsilon);
+#endif
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, round_error);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, round_error);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, round_error);
+// clang-format off
+static_assert(Kokkos::Experimental::norm_min<float      >::value == std::numeric_limits<      float>::min(), "");
+static_assert(Kokkos::Experimental::norm_min<double     >::value == std::numeric_limits<     double>::min(), "");
+static_assert(Kokkos::Experimental::norm_min<long double>::value == std::numeric_limits<long double>::min(), "");
+// integer types
+static_assert(Kokkos::Experimental::finite_min<char                  >::value == std::numeric_limits<                  char>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<signed char           >::value == std::numeric_limits<           signed char>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned char         >::value == std::numeric_limits<         unsigned char>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<short                 >::value == std::numeric_limits<                 short>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned short        >::value == std::numeric_limits<        unsigned short>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<int                   >::value == std::numeric_limits<                   int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned int          >::value == std::numeric_limits<          unsigned int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<long int              >::value == std::numeric_limits<              long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned long int     >::value == std::numeric_limits<     unsigned long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<long long int         >::value == std::numeric_limits<         long long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned long long int>::value == std::numeric_limits<unsigned long long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_max<char                  >::value == std::numeric_limits<                  char>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<signed char           >::value == std::numeric_limits<           signed char>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned char         >::value == std::numeric_limits<         unsigned char>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<short                 >::value == std::numeric_limits<                 short>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned short        >::value == std::numeric_limits<        unsigned short>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<int                   >::value == std::numeric_limits<                   int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned int          >::value == std::numeric_limits<          unsigned int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<long int              >::value == std::numeric_limits<              long int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned long int     >::value == std::numeric_limits<     unsigned long int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<long long int         >::value == std::numeric_limits<         long long int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned long long int>::value == std::numeric_limits<unsigned long long int>::max(), "");
+// floating point types
+static_assert(Kokkos::Experimental::finite_min<float      >::value == -std::numeric_limits<      float>::max(), "");
+static_assert(Kokkos::Experimental::finite_min<double     >::value == -std::numeric_limits<     double>::max(), "");
+static_assert(Kokkos::Experimental::finite_min<long double>::value == -std::numeric_limits<long double>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<float      >::value ==  std::numeric_limits<      float>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<double     >::value ==  std::numeric_limits<     double>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<long double>::value ==  std::numeric_limits<long double>::max(), "");
+// clang-format on
+
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(char, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(signed char, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned char, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(short, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned short, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(char, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(signed char, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned char, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(short, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned short, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, max_digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, max_digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(char, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(signed char, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned char, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(short, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned short, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, min_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, max_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, min_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, max_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, min_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, min_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, max_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, min_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, max_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, min_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10);
+
+#undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION
+#undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT
diff --git a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
index 0017c690e7..d75d78b31f 100644
--- a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
+++ b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
@@ -291,34 +291,34 @@ class TestRangePolicyConstruction {
     using policy_t = Kokkos::RangePolicy<>;
     {
       policy_t p(5, 15);
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
     }
     {
       policy_t p(Kokkos::DefaultExecutionSpace(), 5, 15);
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
     }
     {
       policy_t p(5, 15, Kokkos::ChunkSize(10));
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
-      ASSERT_TRUE((p.chunk_size() == 10));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
+      ASSERT_EQ(p.chunk_size(), 10);
     }
     {
       policy_t p(Kokkos::DefaultExecutionSpace(), 5, 15, Kokkos::ChunkSize(10));
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
-      ASSERT_TRUE((p.chunk_size() == 10));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
+      ASSERT_EQ(p.chunk_size(), 10);
     }
     {
       policy_t p;
-      ASSERT_TRUE((p.begin() == 0));
-      ASSERT_TRUE((p.end() == 0));
+      ASSERT_EQ(p.begin(), 0);
+      ASSERT_EQ(p.end(), 0);
       p = policy_t(5, 15, Kokkos::ChunkSize(10));
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
-      ASSERT_TRUE((p.chunk_size() == 10));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
+      ASSERT_EQ(p.chunk_size(), 10);
     }
   }
 };
@@ -582,7 +582,7 @@ class TestTeamPolicyConstruction {
     ASSERT_EQ(p1.team_size(), team_size);
 // FIXME_SYCL implement chunk_size
 #ifndef KOKKOS_ENABLE_SYCL
-    ASSERT_TRUE(p1.chunk_size() > 0);
+    ASSERT_GT(p1.chunk_size(), 0);
 #endif
     ASSERT_EQ(p1.scratch_size(0), 0);
 
@@ -795,7 +795,7 @@ TEST(TEST_CATEGORY, desired_occupancy_empty_base_optimization) {
   static_assert(sizeof(decltype(policy)) == 1, "");
   static_assert_dummy_policy_must_be_size_one<sizeof(decltype(policy))>
       _assert1{};
-  (void)_assert1;  // avoid unused variable warning
+  (void)&_assert1;  // avoid unused variable warning
 
   using Kokkos::Experimental::DesiredOccupancy;
   auto policy_with_occ =
@@ -805,7 +805,7 @@ TEST(TEST_CATEGORY, desired_occupancy_empty_base_optimization) {
   static_assert_dummy_policy_must_be_size_of_desired_occupancy<
       sizeof(decltype(policy_with_occ)), sizeof(DesiredOccupancy)>
       _assert2{};
-  (void)_assert2;  // avoid unused variable warning
+  (void)&_assert2;  // avoid unused variable warning
 }
 
 template <typename Policy>
diff --git a/lib/kokkos/core/unit_test/TestQuadPrecisionMath.hpp b/lib/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
new file mode 100644
index 0000000000..e45d84e7e0
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_LIBQUADMATH
+
+#include <impl/Kokkos_QuadPrecisionMath.hpp>
+#include <Kokkos_Core.hpp>
+
+#include <gtest/gtest.h>
+
+// FIXME instantiate only once for default host execution space
+TEST(TEST_CATEGORY, quad_precision_reductions) {
+  int const n = 100;
+  __float128 r;
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) { v += static_cast<__float128>(i); },
+      r);
+  EXPECT_EQ(r, n * (n - 1) / 2);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) { v += static_cast<__float128>(i); },
+      Kokkos::Sum<__float128>(r));
+  EXPECT_EQ(r, n * (n - 1) / 2);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) {
+        if (v > static_cast<__float128>(i)) {
+          v = static_cast<__float128>(i);
+        }
+      },
+      Kokkos::Min<__float128>(r));
+  EXPECT_EQ(r, 0);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) {
+        if (v < static_cast<__float128>(i)) {
+          v = static_cast<__float128>(i);
+        }
+      },
+      Kokkos::Max<__float128>(r));
+  EXPECT_EQ(r, n - 1);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(1, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) { v *= static_cast<__float128>(i); },
+      Kokkos::Prod<__float128>(r));
+  EXPECT_FLOAT_EQ(r, tgammaq(n + 1));  // factorial(n) = tgamma(n+1)
+}
+
+TEST(TEST_CATEGORY, quad_precision_common_math_functions) {
+  Kokkos::parallel_for(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, 1),
+      KOKKOS_LAMBDA(int) {
+        (void)Kokkos::Experimental::fabs((__float128)0);
+        (void)Kokkos::Experimental::sqrt((__float128)1);
+        (void)Kokkos::Experimental::exp((__float128)2);
+        (void)Kokkos::Experimental::sin((__float128)3);
+        (void)Kokkos::Experimental::cosh((__float128)4);
+      });
+}
+
+#endif
diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp
index a6a6220f2d..d6b5d8fecc 100644
--- a/lib/kokkos/core/unit_test/TestRange.hpp
+++ b/lib/kokkos/core/unit_test/TestRange.hpp
@@ -317,10 +317,10 @@ struct TestRange {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
@@ -361,10 +361,10 @@ struct TestRange {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
diff --git a/lib/kokkos/core/unit_test/TestRangePolicyRequire.hpp b/lib/kokkos/core/unit_test/TestRangePolicyRequire.hpp
index 693f19613d..508b7192cb 100644
--- a/lib/kokkos/core/unit_test/TestRangePolicyRequire.hpp
+++ b/lib/kokkos/core/unit_test/TestRangePolicyRequire.hpp
@@ -309,10 +309,10 @@ struct TestRangeRequire {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
@@ -353,10 +353,10 @@ struct TestRangeRequire {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
diff --git a/lib/kokkos/core/unit_test/TestReduce.hpp b/lib/kokkos/core/unit_test/TestReduce.hpp
index 5f7fbd5623..81e063f83e 100644
--- a/lib/kokkos/core/unit_test/TestReduce.hpp
+++ b/lib/kokkos/core/unit_test/TestReduce.hpp
@@ -539,6 +539,10 @@ class TestReduceDynamicView {
 
 }  // namespace
 
+// FIXME_OPENMPTARGET : The feature works with LLVM/13 on NVIDIA
+// architectures. The jenkins currently tests with LLVM/12.
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \
+    (KOKKOS_COMPILER_CLANG >= 1300)
 TEST(TEST_CATEGORY, int64_t_reduce) {
   TestReduce<int64_t, TEST_EXECSPACE>(0);
   TestReduce<int64_t, TEST_EXECSPACE>(1000000);
@@ -563,7 +567,10 @@ TEST(TEST_CATEGORY, int64_t_reduce_dynamic_view) {
   TestReduceDynamicView<int64_t, TEST_EXECSPACE>(0);
   TestReduceDynamicView<int64_t, TEST_EXECSPACE>(1000000);
 }
+#endif
 
+// FIXME_OPENMPTARGET: Not yet implemented.
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
 TEST(TEST_CATEGORY, int_combined_reduce) {
   using functor_type = CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>;
   constexpr uint64_t nw = 1000;
@@ -626,4 +633,5 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) {
   ASSERT_EQ(nsum, result2);
   ASSERT_EQ(nsum, result3_v());
 }
+#endif
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestReduceCombinatorical.hpp b/lib/kokkos/core/unit_test/TestReduceCombinatorical.hpp
index 68e7d746dd..4664f26559 100644
--- a/lib/kokkos/core/unit_test/TestReduceCombinatorical.hpp
+++ b/lib/kokkos/core/unit_test/TestReduceCombinatorical.hpp
@@ -439,11 +439,11 @@ struct TestReduceCombinatoricalInstantiation {
                        Test::ReduceCombinatorical::AddPlus<double>(value));
     if ((Kokkos::DefaultExecutionSpace::concurrency() > 1) &&
         (ExecSpace::concurrency() > 1) && (expected_result > 0)) {
-      ASSERT_TRUE(expected_result < value);
+      ASSERT_LT(expected_result, value);
     } else if (((Kokkos::DefaultExecutionSpace::concurrency() > 1) ||
                 (ExecSpace::concurrency() > 1)) &&
                (expected_result > 0)) {
-      ASSERT_TRUE(expected_result <= value);
+      ASSERT_LE(expected_result, value);
     } else {
       ASSERT_EQ(expected_result, value);
     }
@@ -453,11 +453,11 @@ struct TestReduceCombinatoricalInstantiation {
     CallParallelReduce(args..., add);
     if ((Kokkos::DefaultExecutionSpace::concurrency() > 1) &&
         (ExecSpace::concurrency() > 1) && (expected_result > 0)) {
-      ASSERT_TRUE(expected_result < value);
+      ASSERT_LT(expected_result, value);
     } else if (((Kokkos::DefaultExecutionSpace::concurrency() > 1) ||
                 (ExecSpace::concurrency() > 1)) &&
                (expected_result > 0)) {
-      ASSERT_TRUE(expected_result <= value);
+      ASSERT_LE(expected_result, value);
     } else {
       ASSERT_EQ(expected_result, value);
     }
diff --git a/lib/kokkos/core/unit_test/TestReducers.hpp b/lib/kokkos/core/unit_test/TestReducers.hpp
index 35f0e231fd..0d5f7fe7ba 100644
--- a/lib/kokkos/core/unit_test/TestReducers.hpp
+++ b/lib/kokkos/core/unit_test/TestReducers.hpp
@@ -296,7 +296,8 @@ struct TestReducers {
     Scalar reference_sum = 0;
 
     for (int i = 0; i < N; i++) {
-      h_values(i) = (Scalar)(rand() % 100);
+      int denom   = sizeof(Scalar) <= 2 ? 10 : 100;
+      h_values(i) = (Scalar)(rand() % denom);
       reference_sum += h_values(i);
     }
     Kokkos::deep_copy(values, h_values);
diff --git a/lib/kokkos/core/unit_test/TestReducers_d.hpp b/lib/kokkos/core/unit_test/TestReducers_d.hpp
index e2254a1c1f..2d5802cdd4 100644
--- a/lib/kokkos/core/unit_test/TestReducers_d.hpp
+++ b/lib/kokkos/core/unit_test/TestReducers_d.hpp
@@ -64,4 +64,49 @@ TEST(TEST_CATEGORY, reducers_struct) {
   TestReducers<array_reduce<float, 7>, TEST_EXECSPACE>::test_sum(1031);
 #endif
 }
+
+TEST(TEST_CATEGORY, reducers_half_t) {
+  using ThisTestType = Kokkos::Experimental::half_t;
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(101);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(202);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(303);
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(5);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(10);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(15);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(20);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(25);
+}
+
+TEST(TEST_CATEGORY, reducers_int8_t) {
+  using ThisTestType = int8_t;
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(4);
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(4);
+}
+
+#if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET)
+// TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to
+//                 implicitly-deleted default constructor of 'conv_type'
+//                   conv_type tmp_in;"
+//
+// TODO - resolve:  4: [  FAILED  ] openmptarget.reducers_point_t (1 ms)
+TEST(TEST_CATEGORY, reducers_point_t) {
+  using ThisTestType = point_t;
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(4);
+}
+#endif  // !KOKKOS_ENABLE_HIP && !KOKKOS_ENABLE_OPENMPTARGET
+
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestReductions.hpp b/lib/kokkos/core/unit_test/TestReductions.hpp
index 949ca7eaf3..1fa8a2e92e 100644
--- a/lib/kokkos/core/unit_test/TestReductions.hpp
+++ b/lib/kokkos/core/unit_test/TestReductions.hpp
@@ -45,8 +45,6 @@
 #ifndef KOKKOS_TEST_REDUCTIONS_HPP
 #define KOKKOS_TEST_REDUCTIONS_HPP
 #include <Kokkos_Macros.hpp>
-#ifndef KOKKOS_ENABLE_OPENMPTARGET
 #include <TestReduce.hpp>
-#endif
 #include <TestCXX11Deduction.hpp>
 #endif
diff --git a/lib/kokkos/core/unit_test/TestReductions_DeviceView.hpp b/lib/kokkos/core/unit_test/TestReductions_DeviceView.hpp
index 17563de335..6ffa11b11c 100644
--- a/lib/kokkos/core/unit_test/TestReductions_DeviceView.hpp
+++ b/lib/kokkos/core/unit_test/TestReductions_DeviceView.hpp
@@ -32,11 +32,17 @@ void test_reduce_device_view(int64_t N, PolicyType policy,
   typename ExecSpace::execution_space().fence();
   double time_fence0 = timer.seconds();
   Kokkos::deep_copy(result, 0);
+
+  // We need a warm-up to get reasonable results
+  Kokkos::parallel_reduce("Test::ReduceDeviceView::TestReducer", policy,
+                          functor,
+                          Kokkos::Sum<int64_t, TEST_EXECSPACE>(result));
+  Kokkos::fence();
+
   timer.reset();
   bool is_async = time0 < time_fence0;
 
   // Test Reducer
-
   Kokkos::parallel_reduce("Test::ReduceDeviceView::TestReducer", policy,
                           functor,
                           Kokkos::Sum<int64_t, TEST_EXECSPACE>(result));
@@ -75,11 +81,11 @@ void test_reduce_device_view(int64_t N, PolicyType policy,
 
   ASSERT_EQ(N, scalar_result);
   if (is_async) {
-    ASSERT_TRUE(time1 < time_fence1);
+    ASSERT_LT(time1, time_fence1);
   }
   if (is_async) {
-    ASSERT_TRUE(time2 < time_fence2);
-    ASSERT_TRUE(time3 > time_fence3);
+    ASSERT_LT(time2, time_fence2);
+    ASSERT_GT(time3, time_fence3);
   }
 }
 
@@ -128,8 +134,6 @@ TEST(TEST_CATEGORY, reduce_device_view_mdrange_policy) {
       MDRangePolicyFunctor());
 }
 
-// FIXME_HIP
-#ifndef KOKKOS_ENABLE_HIP
 TEST(TEST_CATEGORY, reduce_device_view_team_policy) {
 // FIXME_SYCL The number of workgroups on CUDA devices can not be larger than
 // 65535
@@ -145,5 +149,4 @@ TEST(TEST_CATEGORY, reduce_device_view_team_policy) {
       TeamPolicyFunctor(1024));
 #endif
 }
-#endif
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestStackTrace.hpp b/lib/kokkos/core/unit_test/TestStackTrace.hpp
index 284332f3f8..d34d0f92e9 100644
--- a/lib/kokkos/core/unit_test/TestStackTrace.hpp
+++ b/lib/kokkos/core/unit_test/TestStackTrace.hpp
@@ -73,10 +73,10 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
 
     if (bDynamic) {
       printf("test_f1: %s \n", foutput.c_str());
-      ASSERT_TRUE(std::string::npos != foutput.find("stacktrace_test_f1"));
+      ASSERT_NE(std::string::npos, foutput.find("stacktrace_test_f1"));
       for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2",
                      "stacktrace_test_f3", "stacktrace_test_f4"}) {
-        ASSERT_TRUE(std::string::npos == foutput.find(x));
+        ASSERT_EQ(std::string::npos, foutput.find(x));
       }
     }
   }
@@ -92,7 +92,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
                   foutput.find("Test::stacktrace_test_f1"));
       for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2",
                      "stacktrace_test_f3", "stacktrace_test_f4"}) {
-        ASSERT_TRUE(std::string::npos == foutput.find(x));
+        ASSERT_EQ(std::string::npos, foutput.find(x));
       }
     }
   }
@@ -114,7 +114,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
       std::string foutput = sstream.str();
       printf("test_f3: %s \n", foutput.c_str());
       for (auto x : {"stacktrace_test_f1", "stacktrace_test_f3"}) {
-        ASSERT_TRUE(std::string::npos != foutput.find(x));
+        ASSERT_NE(std::string::npos, foutput.find(x));
       }
     }
     // TODO make sure stacktrace_test_f2/4 don't show up
@@ -129,7 +129,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
       std::string foutput = sstream.str();
       printf("demangled test_f3: %s \n", foutput.c_str());
       for (auto x : {"stacktrace_test_f1", "stacktrace_test_f3"}) {
-        ASSERT_TRUE(std::string::npos != foutput.find(x));
+        ASSERT_NE(std::string::npos, foutput.find(x));
       }
     }
 
diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp
index 97ddfd4cf5..a5e3de85bb 100644
--- a/lib/kokkos/core/unit_test/TestTeam.hpp
+++ b/lib/kokkos/core/unit_test/TestTeam.hpp
@@ -137,8 +137,10 @@ struct TestTeamPolicy {
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
         smallest_work, smallest_work, smallest_work);
 #endif
+    (void)none_auto;
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> both_auto(
         smallest_work, Kokkos::AUTO(), Kokkos::AUTO());
+    (void)both_auto;
     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(smallest_work, 32,
@@ -147,8 +149,10 @@ struct TestTeamPolicy {
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
         smallest_work, smallest_work, Kokkos::AUTO());
 #endif
+    (void)auto_vector;
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_team(
         smallest_work, Kokkos::AUTO(), smallest_work);
+    (void)auto_team;
   }
 
   static void test_for(const size_t league_size) {
@@ -970,7 +974,11 @@ struct ClassNoShmemSizeFunction {
                 double *, ExecSpace,
                 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
 
-    int team_size = 8;
+#ifdef KOKKOS_ENABLE_SYCL
+    int team_size = 4;
+#else
+    int team_size      = 8;
+#endif
     if (team_size > ExecSpace::concurrency())
       team_size = ExecSpace::concurrency();
     {
@@ -1115,7 +1123,11 @@ void test_team_mulit_level_scratch_test_lambda() {
       Kokkos::View<double *, ExecSpace,
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
 
+#ifdef KOKKOS_ENABLE_SYCL
+  int team_size = 4;
+#else
   int team_size = 8;
+#endif
   if (team_size > ExecSpace::concurrency())
     team_size = ExecSpace::concurrency();
 
@@ -1400,7 +1412,7 @@ struct TestTeamBroadcast<
     // above because the functor switches it back.
     bool setValue = ((lid % ts) != tid);
 
-    teamMember.team_broadcast([&](value_type &var) { var *= 2; }, value,
+    teamMember.team_broadcast([&](value_type &var) { var += var; }, value,
                               lid % ts);
     teamMember.team_broadcast([&](bool &bVar) { bVar = !bVar; }, setValue,
                               lid % ts);
@@ -1465,7 +1477,7 @@ struct TestTeamBroadcast<
     value_type expected_result = 0;
     for (unsigned int i = 0; i < league_size; i++) {
       value_type val =
-          (value_type((i % team_size) * 3) + off) * (value_type)team_size;
+          (value_type((i % team_size) * 3) + off) * value_type(team_size);
       expected_result += val;
     }
     // For comparison purposes treat the reduction as a random walk in the
diff --git a/lib/kokkos/core/unit_test/TestTeamBasic.hpp b/lib/kokkos/core/unit_test/TestTeamBasic.hpp
index 87c010ac2a..17899f63b1 100644
--- a/lib/kokkos/core/unit_test/TestTeamBasic.hpp
+++ b/lib/kokkos/core/unit_test/TestTeamBasic.hpp
@@ -105,6 +105,75 @@ TEST(TEST_CATEGORY, team_broadcast_long) {
                     long>::test_teambroadcast(1000, 1);
 }
 
+// FIXME_OPENMPTARGET CI fails with
+// Libomptarget error: Copying data from device failed.
+// Possibly, because long_wrapper is not trivially-copyable.
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+struct long_wrapper {
+  long value;
+
+  KOKKOS_FUNCTION
+  long_wrapper() : value(0) {}
+
+  KOKKOS_FUNCTION
+  long_wrapper(long val) : value(val) {}
+
+  KOKKOS_FUNCTION
+  friend void operator+=(long_wrapper& lhs, const long_wrapper& rhs) {
+    lhs.value += rhs.value;
+  }
+
+  KOKKOS_FUNCTION
+  friend void operator+=(volatile long_wrapper& lhs,
+                         const volatile long_wrapper& rhs) {
+    lhs.value += rhs.value;
+  }
+
+  KOKKOS_FUNCTION
+  void operator=(const long_wrapper& other) { value = other.value; }
+
+  KOKKOS_FUNCTION
+  void operator=(const volatile long_wrapper& other) volatile {
+    value = other.value;
+  }
+  KOKKOS_FUNCTION
+  operator long() const { return value; }
+};
+}  // namespace Test
+
+namespace Kokkos {
+template <>
+struct reduction_identity<Test::long_wrapper>
+    : public reduction_identity<long> {};
+}  // namespace Kokkos
+
+namespace Test {
+
+// Test for non-arithmetic type
+TEST(TEST_CATEGORY, team_broadcast_long_wrapper) {
+  static_assert(!std::is_arithmetic<long_wrapper>::value, "");
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(0, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(0, 1);
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(2, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(2, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(16, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(16, 1);
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(1000, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(1000, 1);
+}
+#endif
+
 TEST(TEST_CATEGORY, team_broadcast_char) {
   {
     TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
diff --git a/lib/kokkos/core/unit_test/TestTeamReductionScan.hpp b/lib/kokkos/core/unit_test/TestTeamReductionScan.hpp
index 3db0eafa33..836134afe0 100644
--- a/lib/kokkos/core/unit_test/TestTeamReductionScan.hpp
+++ b/lib/kokkos/core/unit_test/TestTeamReductionScan.hpp
@@ -53,14 +53,8 @@ TEST(TEST_CATEGORY, team_reduction_scan) {
   TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(0);
   TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10);
   TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10);
-// FIXME_HIP
-#ifdef KOKKOS_ENABLE_HIP
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
-#endif
-  {
-    TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10000);
-    TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10000);
-  }
+  TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10000);
+  TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10000);
 }
 
 TEST(TEST_CATEGORY, team_long_reduce) {
diff --git a/lib/kokkos/core/unit_test/TestTeamScratch.hpp b/lib/kokkos/core/unit_test/TestTeamScratch.hpp
index 75ca358762..bab937273d 100644
--- a/lib/kokkos/core/unit_test/TestTeamScratch.hpp
+++ b/lib/kokkos/core/unit_test/TestTeamScratch.hpp
@@ -54,15 +54,8 @@ TEST(TEST_CATEGORY, team_shared_request) {
 }
 
 TEST(TEST_CATEGORY, team_scratch_request) {
-  // FIXME_HIP the parallel_reduce in this test requires a team size larger than
-  // 256. Fixed in ROCm 3.9
-#if defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 309)
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
-#endif
-  {
-    TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
-    TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
-  }
+  TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
@@ -78,21 +71,14 @@ TEST(TEST_CATEGORY, scratch_align) { TestScratchAlignment<TEST_EXECSPACE>(); }
 TEST(TEST_CATEGORY, shmem_size) { TestShmemSize<TEST_EXECSPACE>(); }
 
 TEST(TEST_CATEGORY, multi_level_scratch) {
-  // FIXME_HIP the parallel_for and the parallel_reduce in this test requires a
-  // team size larger than 256. Fixed In ROCm 3.9
   // FIXME_OPENMPTARGET This unit test needs ~350KB of scratch memory for L0 and
   // L1 combined per team. Currently OpenMPTarget cannot allocate this high
   // amount of scratch memory.
 #if !defined(KOKKOS_ENABLE_OPENMPTARGET)
-#if defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 309)
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
-#endif
-  {
-    TestMultiLevelScratchTeam<TEST_EXECSPACE,
-                              Kokkos::Schedule<Kokkos::Static> >();
-    TestMultiLevelScratchTeam<TEST_EXECSPACE,
-                              Kokkos::Schedule<Kokkos::Dynamic> >();
-  }
+  TestMultiLevelScratchTeam<TEST_EXECSPACE,
+                            Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam<TEST_EXECSPACE,
+                            Kokkos::Schedule<Kokkos::Dynamic> >();
 #endif
 }
 
diff --git a/lib/kokkos/core/unit_test/TestTeamTeamSize.hpp b/lib/kokkos/core/unit_test/TestTeamTeamSize.hpp
index 992e80397b..f64c5b8809 100644
--- a/lib/kokkos/core/unit_test/TestTeamTeamSize.hpp
+++ b/lib/kokkos/core/unit_test/TestTeamTeamSize.hpp
@@ -110,9 +110,9 @@ void test_team_policy_max_recommended_static_size(int scratch_size) {
   int team_size_rec_reduce = p.team_size_recommended(
       FunctorReduce<T, N, PolicyType, S>(), Kokkos::ParallelReduceTag());
 
-  ASSERT_TRUE(team_size_max_for >= team_size_rec_for);
-  ASSERT_TRUE(team_size_max_reduce >= team_size_rec_reduce);
-  ASSERT_TRUE(team_size_max_for >= team_size_max_reduce);
+  ASSERT_GE(team_size_max_for, team_size_rec_for);
+  ASSERT_GE(team_size_max_reduce, team_size_rec_reduce);
+  ASSERT_GE(team_size_max_for, team_size_max_reduce);
 
   Kokkos::parallel_for(PolicyType(10000, team_size_max_for, 4)
                            .set_scratch_size(0, Kokkos::PerTeam(scratch_size)),
@@ -122,13 +122,6 @@ void test_team_policy_max_recommended_static_size(int scratch_size) {
                        FunctorFor<T, N, PolicyType, S>());
   MyArray<T, N> val;
   double n_leagues = 10000;
-  // FIXME_HIP
-#ifdef KOKKOS_ENABLE_HIP
-  if (N == 2)
-    n_leagues = 1000;
-  else
-    n_leagues = 500;
-#endif
 
   Kokkos::parallel_reduce(
       PolicyType(n_leagues, team_size_max_reduce, 4)
diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp
index ba11dc07a9..dbed674756 100644
--- a/lib/kokkos/core/unit_test/TestTeamVector.hpp
+++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <iostream>
 #include <cstdlib>
 #include <cstdint>
@@ -111,7 +111,7 @@ struct functor_team_for {
 
         if (test != value) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_parallel_for %i %i %f %f\n", team.league_rank(),
+              "FAILED team_parallel_for %i %i %lf %lf\n", team.league_rank(),
               team.team_rank(), static_cast<double>(test),
               static_cast<double>(value));
           flag() = 1;
@@ -321,10 +321,9 @@ struct functor_team_vector_for {
 
         if (test != value) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_vector_parallel_for %i %i %f %f\n",
+              "FAILED team_vector_parallel_for %i %i %lf %lf\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
               static_cast<double>(value));
-
           flag() = 1;
         }
       });
@@ -372,7 +371,7 @@ struct functor_team_vector_reduce {
       if (test != value) {
         if (team.league_rank() == 0) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
+              "FAILED team_vector_parallel_reduce %i %i %lf %lf %lu\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
               static_cast<double>(value),
               static_cast<unsigned long>(sizeof(Scalar)));
@@ -424,7 +423,7 @@ struct functor_team_vector_reduce_reducer {
 
       if (test != value) {
         KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "FAILED team_vector_parallel_reduce_reducer %i %i %f %f\n",
+            "FAILED team_vector_parallel_reduce_reducer %i %i %lf %lf\n",
             team.league_rank(), team.team_rank(), static_cast<double>(test),
             static_cast<double>(value));
 
@@ -471,8 +470,9 @@ struct functor_vec_single {
 
     if (value2 != (value * Scalar(nEnd - nStart))) {
       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "FAILED vector_single broadcast %i %i %f %f\n", team.league_rank(),
-          team.team_rank(), (double)value2, (double)value);
+          "FAILED vector_single broadcast %i %i %lf %lf\n", team.league_rank(),
+          team.team_rank(), static_cast<double>(value2),
+          static_cast<double>(value));
 
       flag() = 1;
     }
@@ -523,7 +523,7 @@ struct functor_vec_for {
         }
 
         if (test != value) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %f %f\n",
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %lf %lf\n",
                                         team.league_rank(), team.team_rank(),
                                         static_cast<double>(test),
                                         static_cast<double>(value));
@@ -560,10 +560,9 @@ struct functor_vec_red {
       for (int i = 0; i < 13; i++) test += i;
 
       if (test != value) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_reduce %i %i %f %f\n",
-                                      team.league_rank(), team.team_rank(),
-                                      (double)test, (double)value);
-
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "FAILED vector_par_reduce %i %i %lf %lf\n", team.league_rank(),
+            team.team_rank(), (double)test, (double)value);
         flag() = 1;
       }
     });
@@ -600,7 +599,7 @@ struct functor_vec_red_reducer {
 
       if (test != value) {
         KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "FAILED vector_par_reduce_reducer %i %i %f %f\n",
+            "FAILED vector_par_reduce_reducer %i %i %lf %lf\n",
             team.league_rank(), team.team_rank(), (double)test, (double)value);
 
         flag() = 1;
@@ -630,9 +629,10 @@ struct functor_vec_scan {
 
                               if (test != val) {
                                 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-                                    "FAILED vector_par_scan %i %i %f %f\n",
+                                    "FAILED vector_par_scan %i %i %lf %lf\n",
                                     team.league_rank(), team.team_rank(),
-                                    (double)test, (double)val);
+                                    static_cast<double>(test),
+                                    static_cast<double>(val));
 
                                 flag() = 1;
                               }
@@ -723,7 +723,12 @@ template <class ExecutionSpace>
 bool Test(int test) {
   bool passed = true;
 
+// With SYCL 33*8 exceeds the maximum work group size
+#ifdef KOKKOS_ENABLE_SYCL
+  int team_size = 31;
+#else
   int team_size = 33;
+#endif
   if (team_size > int(ExecutionSpace::concurrency()))
     team_size = int(ExecutionSpace::concurrency());
   passed = passed && test_scalar<int, ExecutionSpace>(317, team_size, test);
@@ -856,7 +861,7 @@ template <typename ScalarType, class DeviceType>
 class TestTripleNestedReduce {
  public:
   using execution_space = DeviceType;
-  using size_type       = typename execution_space::size_type;
+  using size_type = typename execution_space::size_type;
 
   TestTripleNestedReduce(const size_type &, const size_type, const size_type &,
                          const size_type) {}
@@ -1000,17 +1005,24 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) {
 // With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run
 // with a team size of 32 on GPUs, 16 is the max possible (at least on a K80
 // GPU) See https://github.com/kokkos/kokkos/issues/1513
+// For Intel GPUs, the requested workgroup size is just too large here.
 #if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA)
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value) {
+  if (!std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value)
+#elif defined(KOKKOS_ENABLE_SYCL)
+  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value)
 #endif
+  {
     TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 32, 32);
     TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 32, 16);
-#if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA)
   }
+#if defined(KOKKOS_ENABLE_SYCL)
+  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value)
 #endif
+  {
+    TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 33);
+    TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 19);
+  }
   TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 16);
-  TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 33);
-  TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 19);
   TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 7, 16);
 }
 #endif
diff --git a/lib/kokkos/core/unit_test/TestTeamVectorRange.hpp b/lib/kokkos/core/unit_test/TestTeamVectorRange.hpp
index 7342ebad84..c4116b9139 100644
--- a/lib/kokkos/core/unit_test/TestTeamVectorRange.hpp
+++ b/lib/kokkos/core/unit_test/TestTeamVectorRange.hpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <iostream>
 #include <cstdlib>
 #include <cstdint>
@@ -280,7 +280,7 @@ struct functor_teamvector_for {
 
         if (test != value) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED teamvector_parallel_for %i %i %f %f\n",
+              "FAILED teamvector_parallel_for %i %i %lf %lf\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
               static_cast<double>(value));
           flag() = 1;
@@ -493,7 +493,12 @@ template <class ExecutionSpace>
 bool Test(int test) {
   bool passed = true;
 
+// With SYCL 33*8 exceeds the maximum work group size
+#ifdef KOKKOS_ENABLE_SYCL
+  int team_size = 31;
+#else
   int team_size = 33;
+#endif
   if (team_size > int(ExecutionSpace::concurrency()))
     team_size = int(ExecutionSpace::concurrency());
   passed = passed && test_scalar<int, ExecutionSpace>(317, team_size, test);
diff --git a/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
index a0bc7c4304..a0d00ded1b 100644
--- a/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
+++ b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
@@ -138,72 +138,38 @@ struct SumInitJoinFinalValueTypeArray {
   }
 };
 
-template <class Scalar, class ExecutionSpace>
-struct SumWrongInitJoinFinalValueType {
-  using execution_space = ExecutionSpace;
-  using type            = typename Kokkos::View<Scalar*, execution_space>;
-  using value_type      = Scalar;
-
-  type view;
-
-  SumWrongInitJoinFinalValueType(type view_) : view(view_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void init(double& val) const { val = double(); }
-
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, const value_type& src) const {
-    val += src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int /*i*/, value_type& val) const { val += value_type(); }
-};
-
 template <class Scalar, class ExecutionSpace>
 void TestTemplateMetaFunctions() {
-  using type = typename Kokkos::View<Scalar*, ExecutionSpace>;
-  type a("A", 100);
-  /*
-    int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit< SumPlain<Scalar,
-    ExecutionSpace>, Scalar & >::value; ASSERT_EQ( sum_plain_has_init_arg, 0 );
-    int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<
-    SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg, 1 );
-    int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit<
-    SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg2, 1 );
-    int sum_wronginitjoinfinalvaluetype_has_init_arg =
-    Kokkos::Impl::FunctorHasInit< SumWrongInitJoinFinalValueType<Scalar,
-    ExecutionSpace>, Scalar >::value; ASSERT_EQ(
-    sum_wronginitjoinfinalvaluetype_has_init_arg, 0 );
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<SumPlain<Scalar, ExecutionSpace>,
+                                         Scalar&>::value == false,
+      "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<
+          SumInitJoinFinalValueType<Scalar, ExecutionSpace>>::value == true,
+      "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<
+          SumInitJoinFinalValueType2<Scalar, ExecutionSpace>>::value == true,
+      "");
 
-    //int sum_initjoinfinalvaluetypearray_has_init_arg =
-    Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueTypeArray<Scalar,
-    ExecutionSpace>, Scalar[] >::value;
-    //ASSERT_EQ( sum_initjoinfinalvaluetypearray_has_init_arg, 1 );
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<
+          SumInitJoinFinalValueTypeArray<Scalar, ExecutionSpace>>::value ==
+          true,
+      "");
 
-    //printf( "Values Init: %i %i %i\n", sum_plain_has_init_arg,
-    sum_initjoinfinalvaluetype_has_init_arg,
-    sum_wronginitjoinfinalvaluetype_has_init_arg );
-
-    int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumPlain<Scalar,
-    ExecutionSpace>, Scalar >::value; ASSERT_EQ( sum_plain_has_join_arg, 0 );
-    int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<
-    SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg, 1 );
-    int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin<
-    SumInitJoinFinalValueType2<Scalar, ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg2, 1 );
-    int sum_wronginitjoinfinalvaluetype_has_join_arg =
-    Kokkos::Impl::FunctorHasJoin< SumWrongInitJoinFinalValueType<Scalar,
-    ExecutionSpace>, Scalar >::value; ASSERT_EQ(
-    sum_wronginitjoinfinalvaluetype_has_join_arg, 0 );
-
-    //printf( "Values Join: %i %i %i\n", sum_plain_has_join_arg,
-    sum_initjoinfinalvaluetype_has_join_arg,
-    sum_wronginitjoinfinalvaluetype_has_join_arg );
-  */
+  static_assert(Kokkos::Impl::ReduceFunctorHasJoin<
+                    SumPlain<Scalar, ExecutionSpace>>::value == false,
+                "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasJoin<
+          SumInitJoinFinalValueType<Scalar, ExecutionSpace>>::value == true,
+      "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasJoin<
+          SumInitJoinFinalValueType2<Scalar, ExecutionSpace>>::value == true,
+      "");
 }
 
 }  // namespace
diff --git a/lib/kokkos/core/unit_test/TestTypeList.cpp b/lib/kokkos/core/unit_test/TestTypeList.cpp
new file mode 100644
index 0000000000..e450d11562
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTypeList.cpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <impl/Kokkos_Utilities.hpp>
+
+using TypeList2 = Kokkos::Impl::type_list<void, bool>;
+using TypeList3 = Kokkos::Impl::type_list<char, short, int>;
+using TypeList223 =
+    Kokkos::Impl::type_list<void, bool, void, bool, char, short, int>;
+using TypeList223Void   = Kokkos::Impl::type_list<void, void>;
+using TypeList223NoVoid = Kokkos::Impl::type_list<bool, bool, char, short, int>;
+
+// concat_type_list
+using ConcatTypeList2 = Kokkos::Impl::concat_type_list_t<TypeList2>;
+static_assert(std::is_same<TypeList2, ConcatTypeList2>::value,
+              "concat_type_list of a single type_list failed");
+
+using ConcatTypeList223 =
+    Kokkos::Impl::concat_type_list_t<TypeList2, TypeList2, TypeList3>;
+static_assert(std::is_same<TypeList223, ConcatTypeList223>::value,
+              "concat_type_list of three type_lists failed");
+
+// filter_type_list
+using FilterTypeList223Void =
+    Kokkos::Impl::filter_type_list_t<std::is_void, TypeList223>;
+static_assert(std::is_same<TypeList223Void, FilterTypeList223Void>::value,
+              "filter_type_list with predicate value==true failed");
+
+using FilterTypeList223NoVoid =
+    Kokkos::Impl::filter_type_list_t<std::is_void, TypeList223, false>;
+static_assert(std::is_same<TypeList223NoVoid, FilterTypeList223NoVoid>::value,
+              "filter_type_list with predicate value==false failed");
diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp
index 570281f9fd..73531e6196 100644
--- a/lib/kokkos/core/unit_test/TestViewAPI.hpp
+++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp
@@ -1060,12 +1060,12 @@ class TestViewAPI {
     dView4 dx, dy, dz;
     hView4 hx, hy, hz;
 
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_TRUE(dz.data() == nullptr);
-    ASSERT_TRUE(hx.data() == nullptr);
-    ASSERT_TRUE(hy.data() == nullptr);
-    ASSERT_TRUE(hz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_EQ(dz.data(), nullptr);
+    ASSERT_EQ(hx.data(), nullptr);
+    ASSERT_EQ(hy.data(), nullptr);
+    ASSERT_EQ(hz.data(), nullptr);
     ASSERT_EQ(dx.extent(0), 0u);
     ASSERT_EQ(dy.extent(0), 0u);
     ASSERT_EQ(dz.extent(0), 0u);
@@ -1116,11 +1116,11 @@ class TestViewAPI {
 
     ASSERT_EQ(dx.use_count(), size_t(2));
 
-    ASSERT_FALSE(dx.data() == nullptr);
-    ASSERT_FALSE(const_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
+    ASSERT_NE(dx.data(), nullptr);
+    ASSERT_NE(const_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_from_ptr_dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
     ASSERT_NE(dx, dy);
 
     ASSERT_EQ(dx.extent(0), unsigned(N0));
@@ -1257,19 +1257,19 @@ class TestViewAPI {
     ASSERT_NE(dx, dz);
 
     dx = dView4();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
 
     dy = dView4();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
 
     dz = dView4();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_TRUE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_EQ(dz.data(), nullptr);
   }
 
   static void run_test_deep_copy_empty() {
@@ -1304,7 +1304,7 @@ class TestViewAPI {
   static void check_auto_conversion_to_const(
       const Kokkos::View<const DataType, device> &arg_const,
       const Kokkos::View<DataType, device> &arg) {
-    ASSERT_TRUE(arg_const == arg);
+    ASSERT_EQ(arg_const, arg);
   }
 
   static void run_test_const() {
@@ -1317,8 +1317,8 @@ class TestViewAPI {
     const_typeX xc = x;
     const_typeR xr = x;
 
-    ASSERT_TRUE(xc == x);
-    ASSERT_TRUE(x == xc);
+    ASSERT_EQ(xc, x);
+    ASSERT_EQ(x, xc);
 
     // For CUDA the constant random access View does not return
     // an lvalue reference due to retrieving through texture cache
@@ -1327,7 +1327,7 @@ class TestViewAPI {
     if (!std::is_same<typename device::execution_space, Kokkos::Cuda>::value)
 #endif
     {
-      ASSERT_TRUE(x.data() == xr.data());
+      ASSERT_EQ(x.data(), xr.data());
     }
 
     // typeX xf = xc; // Setting non-const from const must not compile.
@@ -1440,29 +1440,29 @@ class TestViewAPI {
     const_vector_right_type cvr2 = Kokkos::subview(mv, Kokkos::ALL(), 1);
     const_vector_right_type cvr3 = Kokkos::subview(mv, Kokkos::ALL(), 2);
 
-    ASSERT_TRUE(&v1[0] == &v1(0));
-    ASSERT_TRUE(&v1[0] == &mv(0, 0));
-    ASSERT_TRUE(&v2[0] == &mv(0, 1));
-    ASSERT_TRUE(&v3[0] == &mv(0, 2));
+    ASSERT_EQ(&v1[0], &v1(0));
+    ASSERT_EQ(&v1[0], &mv(0, 0));
+    ASSERT_EQ(&v2[0], &mv(0, 1));
+    ASSERT_EQ(&v3[0], &mv(0, 2));
 
-    ASSERT_TRUE(&cv1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cv2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cv3[0] == &mv(0, 2));
+    ASSERT_EQ(&cv1[0], &mv(0, 0));
+    ASSERT_EQ(&cv2[0], &mv(0, 1));
+    ASSERT_EQ(&cv3[0], &mv(0, 2));
 
-    ASSERT_TRUE(&vr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&vr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&vr3[0] == &mv(0, 2));
+    ASSERT_EQ(&vr1[0], &mv(0, 0));
+    ASSERT_EQ(&vr2[0], &mv(0, 1));
+    ASSERT_EQ(&vr3[0], &mv(0, 2));
 
-    ASSERT_TRUE(&cvr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cvr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cvr3[0] == &mv(0, 2));
+    ASSERT_EQ(&cvr1[0], &mv(0, 0));
+    ASSERT_EQ(&cvr2[0], &mv(0, 1));
+    ASSERT_EQ(&cvr3[0], &mv(0, 2));
 
-    ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2));
-    ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3));
-    ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4));
-    ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2));
-    ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3));
-    ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4));
+    ASSERT_EQ(&mv1(0, 0), &mv(1, 2));
+    ASSERT_EQ(&mv1(1, 1), &mv(2, 3));
+    ASSERT_EQ(&mv1(3, 2), &mv(4, 4));
+    ASSERT_EQ(&mvr1(0, 0), &mv_right(1, 2));
+    ASSERT_EQ(&mvr1(1, 1), &mv_right(2, 3));
+    ASSERT_EQ(&mvr1(3, 2), &mv_right(4, 4));
 
     const_vector_type c_cv1(v1);
     typename vector_type::const_type c_cv2(v2);
diff --git a/lib/kokkos/core/unit_test/TestViewAPI_e.hpp b/lib/kokkos/core/unit_test/TestViewAPI_e.hpp
index a5dc6cf29a..d4f484a530 100644
--- a/lib/kokkos/core/unit_test/TestViewAPI_e.hpp
+++ b/lib/kokkos/core/unit_test/TestViewAPI_e.hpp
@@ -54,23 +54,24 @@ namespace Test {
 TEST(TEST_CATEGORY, view_remap) {
   enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
 
-#ifdef KOKKOS_ENABLE_CUDA
+#if defined(KOKKOS_ENABLE_CUDA)
 #define EXECSPACE                                                     \
   std::conditional<std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value, \
                    Kokkos::CudaHostPinnedSpace, TEST_EXECSPACE>::type
-#else
-#ifdef KOKKOS_ENABLE_HIP
+#elif defined(KOKKOS_ENABLE_HIP)
 #define EXECSPACE                                                     \
   std::conditional<                                                   \
       std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value, \
       Kokkos::Experimental::HIPHostPinnedSpace, TEST_EXECSPACE>::type
-#else
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_SYCL)
+#elif defined(KOKKOS_ENABLE_SYCL)
+#define EXECSPACE                                                      \
+  std::conditional<                                                    \
+      std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value, \
+      Kokkos::Experimental::SYCLHostUSMSpace, TEST_EXECSPACE>::type
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
 #define EXECSPACE Kokkos::HostSpace
 #else
 #define EXECSPACE TEST_EXECSPACE
-#endif
-#endif
 #endif
 
   using output_type =
diff --git a/lib/kokkos/core/unit_test/TestViewCopy_a.hpp b/lib/kokkos/core/unit_test/TestViewCopy_a.hpp
index e25cb9e39c..ced0aa3828 100644
--- a/lib/kokkos/core/unit_test/TestViewCopy_a.hpp
+++ b/lib/kokkos/core/unit_test/TestViewCopy_a.hpp
@@ -96,10 +96,10 @@ TEST(TEST_CATEGORY, view_copy_tests) {
   auto host = Kokkos::DefaultHostExecutionSpace();
 
   constexpr bool DevExecCanAccessHost =
-      Kokkos::Impl::SpaceAccessibility<typename TEST_EXECSPACE::execution_space,
-                                       Kokkos::HostSpace>::accessible;
+      Kokkos::SpaceAccessibility<typename TEST_EXECSPACE::execution_space,
+                                 Kokkos::HostSpace>::accessible;
 
-  constexpr bool HostExecCanAccessDev = Kokkos::Impl::SpaceAccessibility<
+  constexpr bool HostExecCanAccessDev = Kokkos::SpaceAccessibility<
       typename Kokkos::HostSpace::execution_space,
       typename TEST_EXECSPACE::memory_space>::accessible;
 
diff --git a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp
index fdbda09917..974d7c98ca 100644
--- a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp
+++ b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp
@@ -768,8 +768,8 @@ void test_view_mapping() {
 
     ASSERT_EQ(vr1.extent(0), N);
 
-    if (Kokkos::Impl::SpaceAccessibility<
-            Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+    if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                   typename Space::memory_space>::accessible) {
       for (int i = 0; i < N; ++i) data[i] = i + 1;
       for (int i = 0; i < N; ++i) ASSERT_EQ(vr1[i], i + 1);
       for (int i = 0; i < N; ++i) ASSERT_EQ(cr1[i], i + 1);
@@ -815,8 +815,8 @@ void test_view_mapping() {
 
     ASSERT_EQ(vr1.extent(0), N);
 
-    if (Kokkos::Impl::SpaceAccessibility<
-            Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+    if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                   typename Space::memory_space>::accessible) {
       for (int i = 0; i < N; ++i) vr1(i) = i + 1;
       for (int i = 0; i < N; ++i) ASSERT_EQ(vr1[i], i + 1);
       for (int i = 0; i < N; ++i) ASSERT_EQ(cr1[i], i + 1);
diff --git a/lib/kokkos/core/unit_test/TestViewMapping_subview.hpp b/lib/kokkos/core/unit_test/TestViewMapping_subview.hpp
index 18db67400d..2a15a84380 100644
--- a/lib/kokkos/core/unit_test/TestViewMapping_subview.hpp
+++ b/lib/kokkos/core/unit_test/TestViewMapping_subview.hpp
@@ -81,7 +81,7 @@ struct TestViewMappingSubview {
   using DLT  = Kokkos::View<int** * [13][14], Kokkos::LayoutLeft, ExecSpace>;
   using DLS1 = Kokkos::Subview<DLT, range, int, int, int, int>;
 
-#if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
   static_assert(
       DLS1::rank == 1 &&
           std::is_same<typename DLS1::array_layout, Kokkos::LayoutLeft>::value,
@@ -92,7 +92,7 @@ struct TestViewMappingSubview {
   using DRT  = Kokkos::View<int** * [13][14], Kokkos::LayoutRight, ExecSpace>;
   using DRS1 = Kokkos::Subview<DRT, int, int, int, int, range>;
 
-#if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
   static_assert(
       DRS1::rank == 1 &&
           std::is_same<typename DRS1::array_layout, Kokkos::LayoutRight>::value,
diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp b/lib/kokkos/core/unit_test/TestViewSubview.hpp
index 0125017d93..93eb5476b5 100644
--- a/lib/kokkos/core/unit_test/TestViewSubview.hpp
+++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp
@@ -184,7 +184,7 @@ void test_auto_1d() {
   Kokkos::deep_copy(X_h, X);
   for (size_type j = 0; j < numCols; ++j) {
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == ONE);
+      ASSERT_EQ(X_h(i, j), ONE);
     }
   }
 
@@ -194,7 +194,7 @@ void test_auto_1d() {
   Kokkos::deep_copy(X_h, X);
   for (size_type j = 0; j < numCols; ++j) {
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == ZERO);
+      ASSERT_EQ(X_h(i, j), ZERO);
     }
   }
 
@@ -204,7 +204,7 @@ void test_auto_1d() {
   Kokkos::deep_copy(X_h, X);
   for (size_type j = 0; j < numCols; ++j) {
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == TWO);
+      ASSERT_EQ(X_h(i, j), TWO);
     }
   }
 
@@ -216,7 +216,7 @@ void test_auto_1d() {
     Kokkos::fence();
     Kokkos::deep_copy(X_h, X);
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == ZERO);
+      ASSERT_EQ(X_h(i, j), ZERO);
     }
 
     for (size_type jj = 0; jj < numCols; ++jj) {
@@ -226,7 +226,7 @@ void test_auto_1d() {
       Kokkos::fence();
       Kokkos::deep_copy(X_h, X);
       for (size_type i = 0; i < numRows; ++i) {
-        ASSERT_TRUE(X_h(i, jj) == ONE);
+        ASSERT_EQ(X_h(i, jj), ONE);
       }
     }
   }
@@ -240,38 +240,38 @@ void test_1d_strided_assignment_impl(bool a, bool b, bool c, bool d, int n,
   int col = n > 2 ? 2 : 0;
   int row = m > 2 ? 2 : 0;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     if (a) {
       Kokkos::View<double*, LD, Space> l1da =
           Kokkos::subview(l2d, Kokkos::ALL, row);
-      ASSERT_TRUE(&l1da(0) == &l2d(0, row));
+      ASSERT_EQ(&l1da(0), &l2d(0, row));
       if (n > 1) {
-        ASSERT_TRUE(&l1da(1) == &l2d(1, row));
+        ASSERT_EQ(&l1da(1), &l2d(1, row));
       }
     }
 
     if (b && n > 13) {
       Kokkos::View<double*, LD, Space> l1db =
           Kokkos::subview(l2d, std::pair<unsigned, unsigned>(2, 13), row);
-      ASSERT_TRUE(&l1db(0) == &l2d(2, row));
-      ASSERT_TRUE(&l1db(1) == &l2d(3, row));
+      ASSERT_EQ(&l1db(0), &l2d(2, row));
+      ASSERT_EQ(&l1db(1), &l2d(3, row));
     }
 
     if (c) {
       Kokkos::View<double*, LD, Space> l1dc =
           Kokkos::subview(l2d, col, Kokkos::ALL);
-      ASSERT_TRUE(&l1dc(0) == &l2d(col, 0));
+      ASSERT_EQ(&l1dc(0), &l2d(col, 0));
       if (m > 1) {
-        ASSERT_TRUE(&l1dc(1) == &l2d(col, 1));
+        ASSERT_EQ(&l1dc(1), &l2d(col, 1));
       }
     }
 
     if (d && m > 13) {
       Kokkos::View<double*, LD, Space> l1dd =
           Kokkos::subview(l2d, col, std::pair<unsigned, unsigned>(2, 13));
-      ASSERT_TRUE(&l1dd(0) == &l2d(col, 2));
-      ASSERT_TRUE(&l1dd(1) == &l2d(col, 3));
+      ASSERT_EQ(&l1dd(0), &l2d(col, 2));
+      ASSERT_EQ(&l1dd(1), &l2d(col, 3));
     }
   }
 }
@@ -326,8 +326,8 @@ void test_left_0(bool constr) {
   using view_static_8_type =
       Kokkos::View<int[2][3][4][5][2][3][4][5], Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_static_8_type x_static_8("x_static_left_8");
 
     ASSERT_TRUE(x_static_8.span_is_contiguous());
@@ -337,7 +337,7 @@ void test_left_0(bool constr) {
 
     ASSERT_TRUE(x0.span_is_contiguous());
     ASSERT_EQ(x0.span(), 1);
-    ASSERT_TRUE(&x0() == &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1;
     make_subview(constr, x1, x_static_8, Kokkos::pair<int, int>(0, 2), 1, 2, 3,
@@ -345,8 +345,8 @@ void test_left_0(bool constr) {
 
     ASSERT_TRUE(x1.span_is_contiguous());
     ASSERT_EQ(x1.span(), 2);
-    ASSERT_TRUE(&x1(0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x1(1) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(0), &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(1), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x_deg1;
     make_subview(constr, x_deg1, x_static_8, Kokkos::pair<int, int>(0, 0), 1, 2,
@@ -369,10 +369,10 @@ void test_left_0(bool constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!x2.span_is_contiguous());
-    ASSERT_TRUE(&x2(0, 0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(0, 1) == &x_static_8(0, 1, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 0), &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 0), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 1), &x_static_8(0, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 1), &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
 
     // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2;
@@ -380,10 +380,10 @@ void test_left_0(bool constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!sx2.span_is_contiguous());
-    ASSERT_TRUE(&sx2(0, 0) == &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4;
     make_subview(constr, sx4, x_static_8, 0,
@@ -402,9 +402,8 @@ void test_left_0(bool constr) {
       for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1)
         for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2)
           for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) {
-            ASSERT_TRUE(&sx4(i0, i1, i2, i3) == &x_static_8(0, 0 + i0, 1,
-                                                            1 + i1, 1, 0 + i2,
-                                                            2, 2 + i3));
+            ASSERT_EQ(&sx4(i0, i1, i2, i3),
+                      &x_static_8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3));
           }
   }
 }
@@ -420,8 +419,8 @@ void test_left_1(bool use_constr) {
   using view_type =
       Kokkos::View<int*** * [2][3][4][5], Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type x8("x_left_8", 2, 3, 4, 5);
 
     ASSERT_TRUE(x8.span_is_contiguous());
@@ -430,15 +429,15 @@ void test_left_1(bool use_constr) {
     make_subview(use_constr, x0, x8, 0, 0, 0, 0, 0, 0, 0, 0);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &x8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1;
     make_subview(use_constr, x1, x8, Kokkos::pair<int, int>(0, 2), 1, 2, 3, 0,
                  1, 2, 3);
 
     ASSERT_TRUE(x1.span_is_contiguous());
-    ASSERT_TRUE(&x1(0) == &x8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x1(1) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(0), &x8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(1), &x8(1, 1, 2, 3, 0, 1, 2, 3));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1_deg1;
     make_subview(use_constr, x1_deg1, x8, Kokkos::pair<int, int>(0, 0), 1, 2, 3,
@@ -461,10 +460,10 @@ void test_left_1(bool use_constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!x2.span_is_contiguous());
-    ASSERT_TRUE(&x2(0, 0) == &x8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(0, 1) == &x8(0, 1, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 0), &x8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 0), &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 1), &x8(0, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 1), &x8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_deg2;
     make_subview(use_constr, x2_deg2, x8, Kokkos::pair<int, int>(2, 2), 2, 3, 4,
@@ -477,10 +476,10 @@ void test_left_1(bool use_constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!sx2.span_is_contiguous());
-    ASSERT_TRUE(&sx2(0, 0) == &x8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2_deg;
     make_subview(use_constr, sx2, x8, 1, Kokkos::pair<int, int>(0, 0), 2, 3,
@@ -520,8 +519,8 @@ template <class Space>
 void test_left_2() {
   using view_type = Kokkos::View<int****, Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type x4("x4", 2, 3, 4, 5);
 
     ASSERT_TRUE(x4.span_is_contiguous());
@@ -530,35 +529,35 @@ void test_left_2() {
         Kokkos::subview(x4, 0, 0, 0, 0);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &x4(0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x4(0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1 =
         Kokkos::subview(x4, Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(x1.span_is_contiguous());
-    ASSERT_TRUE(&x1(0) == &x4(0, 1, 2, 3));
-    ASSERT_TRUE(&x1(1) == &x4(1, 1, 2, 3));
+    ASSERT_EQ(&x1(0), &x4(0, 1, 2, 3));
+    ASSERT_EQ(&x1(1), &x4(1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2 = Kokkos::subview(
         x4, Kokkos::pair<int, int>(0, 2), 1, Kokkos::pair<int, int>(1, 3), 2);
 
     ASSERT_TRUE(!x2.span_is_contiguous());
-    ASSERT_TRUE(&x2(0, 0) == &x4(0, 1, 1, 2));
-    ASSERT_TRUE(&x2(1, 0) == &x4(1, 1, 1, 2));
-    ASSERT_TRUE(&x2(0, 1) == &x4(0, 1, 2, 2));
-    ASSERT_TRUE(&x2(1, 1) == &x4(1, 1, 2, 2));
+    ASSERT_EQ(&x2(0, 0), &x4(0, 1, 1, 2));
+    ASSERT_EQ(&x2(1, 0), &x4(1, 1, 1, 2));
+    ASSERT_EQ(&x2(0, 1), &x4(0, 1, 2, 2));
+    ASSERT_EQ(&x2(1, 1), &x4(1, 1, 2, 2));
 
     // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2 = Kokkos::subview(
         x4, 1, Kokkos::pair<int, int>(0, 2), 2, Kokkos::pair<int, int>(1, 4));
 
     ASSERT_TRUE(!sx2.span_is_contiguous());
-    ASSERT_TRUE(&sx2(0, 0) == &x4(1, 0, 2, 1));
-    ASSERT_TRUE(&sx2(1, 0) == &x4(1, 1, 2, 1));
-    ASSERT_TRUE(&sx2(0, 1) == &x4(1, 0, 2, 2));
-    ASSERT_TRUE(&sx2(1, 1) == &x4(1, 1, 2, 2));
-    ASSERT_TRUE(&sx2(0, 2) == &x4(1, 0, 2, 3));
-    ASSERT_TRUE(&sx2(1, 2) == &x4(1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x4(1, 0, 2, 1));
+    ASSERT_EQ(&sx2(1, 0), &x4(1, 1, 2, 1));
+    ASSERT_EQ(&sx2(0, 1), &x4(1, 0, 2, 2));
+    ASSERT_EQ(&sx2(1, 1), &x4(1, 1, 2, 2));
+    ASSERT_EQ(&sx2(0, 2), &x4(1, 0, 2, 3));
+    ASSERT_EQ(&sx2(1, 2), &x4(1, 1, 2, 3));
 
     Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4 =
         Kokkos::subview(x4, Kokkos::pair<int, int>(1, 2) /* of [2] */
@@ -586,8 +585,8 @@ template <class Space>
 void test_left_3() {
   using view_type = Kokkos::View<int**, Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type xm("x4", 10, 5);
 
     ASSERT_TRUE(xm.span_is_contiguous());
@@ -595,14 +594,14 @@ void test_left_3() {
     Kokkos::View<int, Kokkos::LayoutLeft, Space> x0 = Kokkos::subview(xm, 5, 3);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &xm(5, 3));
+    ASSERT_EQ(&x0(), &xm(5, 3));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1 =
         Kokkos::subview(xm, Kokkos::ALL, 3);
 
     ASSERT_TRUE(x1.span_is_contiguous());
     for (int i = 0; i < int(xm.extent(0)); ++i) {
-      ASSERT_TRUE(&x1(i) == &xm(i, 3));
+      ASSERT_EQ(&x1(i), &xm(i, 3));
     }
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2 =
@@ -611,7 +610,7 @@ void test_left_3() {
     ASSERT_TRUE(!x2.span_is_contiguous());
     for (int j = 0; j < int(x2.extent(1)); ++j)
       for (int i = 0; i < int(x2.extent(0)); ++i) {
-        ASSERT_TRUE(&x2(i, j) == &xm(1 + i, j));
+        ASSERT_EQ(&x2(i, j), &xm(1 + i, j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2c =
@@ -620,20 +619,20 @@ void test_left_3() {
     ASSERT_TRUE(x2c.span_is_contiguous());
     for (int j = 0; j < int(x2c.extent(1)); ++j)
       for (int i = 0; i < int(x2c.extent(0)); ++i) {
-        ASSERT_TRUE(&x2c(i, j) == &xm(i, 2 + j));
+        ASSERT_EQ(&x2c(i, j), &xm(i, 2 + j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_n1 =
         Kokkos::subview(xm, std::pair<int, int>(1, 1), Kokkos::ALL);
 
-    ASSERT_TRUE(x2_n1.extent(0) == 0);
-    ASSERT_TRUE(x2_n1.extent(1) == xm.extent(1));
+    ASSERT_EQ(x2_n1.extent(0), 0);
+    ASSERT_EQ(x2_n1.extent(1), xm.extent(1));
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_n2 =
         Kokkos::subview(xm, Kokkos::ALL, std::pair<int, int>(1, 1));
 
-    ASSERT_TRUE(x2_n2.extent(0) == xm.extent(0));
-    ASSERT_TRUE(x2_n2.extent(1) == 0);
+    ASSERT_EQ(x2_n2.extent(0), xm.extent(0));
+    ASSERT_EQ(x2_n2.extent(1), 0);
   }
 }
 
@@ -644,46 +643,46 @@ void test_right_0(bool use_constr) {
   using view_static_8_type =
       Kokkos::View<int[2][3][4][5][2][3][4][5], Kokkos::LayoutRight, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_static_8_type x_static_8("x_static_right_8");
 
     Kokkos::View<int, Kokkos::LayoutRight, Space> x0;
     make_subview(use_constr, x0, x_static_8, 0, 0, 0, 0, 0, 0, 0, 0);
 
-    ASSERT_TRUE(&x0() == &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1;
     make_subview(use_constr, x1, x_static_8, 0, 1, 2, 3, 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(x1.extent(0) == 2);
-    ASSERT_TRUE(&x1(0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 1));
-    ASSERT_TRUE(&x1(1) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 2));
+    ASSERT_EQ(x1.extent(0), 2);
+    ASSERT_EQ(&x1(0), &x_static_8(0, 1, 2, 3, 0, 1, 2, 1));
+    ASSERT_EQ(&x1(1), &x_static_8(0, 1, 2, 3, 0, 1, 2, 2));
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2;
     make_subview(use_constr, x2, x_static_8, 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3), 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(x2.extent(0) == 2);
-    ASSERT_TRUE(x2.extent(1) == 2);
-    ASSERT_TRUE(&x2(0, 0) == &x_static_8(0, 1, 2, 1, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(1, 0) == &x_static_8(0, 1, 2, 2, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(0, 1) == &x_static_8(0, 1, 2, 1, 0, 1, 2, 2));
-    ASSERT_TRUE(&x2(1, 1) == &x_static_8(0, 1, 2, 2, 0, 1, 2, 2));
+    ASSERT_EQ(x2.extent(0), 2);
+    ASSERT_EQ(x2.extent(1), 2);
+    ASSERT_EQ(&x2(0, 0), &x_static_8(0, 1, 2, 1, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(1, 0), &x_static_8(0, 1, 2, 2, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(0, 1), &x_static_8(0, 1, 2, 1, 0, 1, 2, 2));
+    ASSERT_EQ(&x2(1, 1), &x_static_8(0, 1, 2, 2, 0, 1, 2, 2));
 
     // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 =
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2;
     make_subview(use_constr, sx2, x_static_8, 1, Kokkos::pair<int, int>(0, 2),
                  2, 3, Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
-    ASSERT_TRUE(sx2.extent(0) == 2);
-    ASSERT_TRUE(sx2.extent(1) == 2);
-    ASSERT_TRUE(&sx2(0, 0) == &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(sx2.extent(0), 2);
+    ASSERT_EQ(sx2.extent(1), 2);
+    ASSERT_EQ(&sx2(0, 0), &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4;
     make_subview(use_constr, sx4, x_static_8, 0,
@@ -696,17 +695,16 @@ void test_right_0(bool use_constr) {
                  2, Kokkos::pair<int, int>(2, 4) /* of [5] */
     );
 
-    ASSERT_TRUE(sx4.extent(0) == 2);
-    ASSERT_TRUE(sx4.extent(1) == 2);
-    ASSERT_TRUE(sx4.extent(2) == 2);
-    ASSERT_TRUE(sx4.extent(3) == 2);
+    ASSERT_EQ(sx4.extent(0), 2);
+    ASSERT_EQ(sx4.extent(1), 2);
+    ASSERT_EQ(sx4.extent(2), 2);
+    ASSERT_EQ(sx4.extent(3), 2);
     for (int i0 = 0; i0 < (int)sx4.extent(0); ++i0)
       for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1)
         for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2)
           for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) {
-            ASSERT_TRUE(&sx4(i0, i1, i2, i3) == &x_static_8(0, 0 + i0, 1,
-                                                            1 + i1, 1, 0 + i2,
-                                                            2, 2 + i3));
+            ASSERT_EQ(&sx4(i0, i1, i2, i3),
+                      &x_static_8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3));
           }
   }
 }
@@ -722,21 +720,21 @@ void test_right_1(bool use_constr) {
   using view_type =
       Kokkos::View<int*** * [2][3][4][5], Kokkos::LayoutRight, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type x8("x_right_8", 2, 3, 4, 5);
 
     Kokkos::View<int, Kokkos::LayoutRight, Space> x0;
     make_subview(use_constr, x0, x8, 0, 0, 0, 0, 0, 0, 0, 0);
 
-    ASSERT_TRUE(&x0() == &x8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1;
     make_subview(use_constr, x1, x8, 0, 1, 2, 3, 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(&x1(0) == &x8(0, 1, 2, 3, 0, 1, 2, 1));
-    ASSERT_TRUE(&x1(1) == &x8(0, 1, 2, 3, 0, 1, 2, 2));
+    ASSERT_EQ(&x1(0), &x8(0, 1, 2, 3, 0, 1, 2, 1));
+    ASSERT_EQ(&x1(1), &x8(0, 1, 2, 3, 0, 1, 2, 2));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1_deg1;
     make_subview(use_constr, x1_deg1, x8, 0, 1, 2, 3, 0, 1, 2,
@@ -747,10 +745,10 @@ void test_right_1(bool use_constr) {
     make_subview(use_constr, x2, x8, 0, 1, 2, Kokkos::pair<int, int>(1, 3), 0,
                  1, 2, Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(&x2(0, 0) == &x8(0, 1, 2, 1, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(1, 0) == &x8(0, 1, 2, 2, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(0, 1) == &x8(0, 1, 2, 1, 0, 1, 2, 2));
-    ASSERT_TRUE(&x2(1, 1) == &x8(0, 1, 2, 2, 0, 1, 2, 2));
+    ASSERT_EQ(&x2(0, 0), &x8(0, 1, 2, 1, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(1, 0), &x8(0, 1, 2, 2, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(0, 1), &x8(0, 1, 2, 1, 0, 1, 2, 2));
+    ASSERT_EQ(&x2(1, 1), &x8(0, 1, 2, 2, 0, 1, 2, 2));
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_deg2;
     make_subview(use_constr, x2_deg2, x8, 0, 1, 2, Kokkos::pair<int, int>(1, 3),
@@ -762,10 +760,10 @@ void test_right_1(bool use_constr) {
     make_subview(use_constr, sx2, x8, 1, Kokkos::pair<int, int>(0, 2), 2, 3,
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
-    ASSERT_TRUE(&sx2(0, 0) == &x8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2_deg;
     make_subview(use_constr, sx2_deg, x8, 1, Kokkos::pair<int, int>(0, 2), 2, 3,
@@ -803,8 +801,8 @@ template <class Space>
 void test_right_3() {
   using view_type = Kokkos::View<int**, Kokkos::LayoutRight, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type xm("x4", 10, 5);
 
     ASSERT_TRUE(xm.span_is_contiguous());
@@ -813,14 +811,14 @@ void test_right_3() {
         Kokkos::subview(xm, 5, 3);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &xm(5, 3));
+    ASSERT_EQ(&x0(), &xm(5, 3));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1 =
         Kokkos::subview(xm, 3, Kokkos::ALL);
 
     ASSERT_TRUE(x1.span_is_contiguous());
     for (int i = 0; i < int(xm.extent(1)); ++i) {
-      ASSERT_TRUE(&x1(i) == &xm(3, i));
+      ASSERT_EQ(&x1(i), &xm(3, i));
     }
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2c =
@@ -829,7 +827,7 @@ void test_right_3() {
     ASSERT_TRUE(x2c.span_is_contiguous());
     for (int j = 0; j < int(x2c.extent(1)); ++j)
       for (int i = 0; i < int(x2c.extent(0)); ++i) {
-        ASSERT_TRUE(&x2c(i, j) == &xm(1 + i, j));
+        ASSERT_EQ(&x2c(i, j), &xm(1 + i, j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2 =
@@ -838,20 +836,20 @@ void test_right_3() {
     ASSERT_TRUE(!x2.span_is_contiguous());
     for (int j = 0; j < int(x2.extent(1)); ++j)
       for (int i = 0; i < int(x2.extent(0)); ++i) {
-        ASSERT_TRUE(&x2(i, j) == &xm(i, 2 + j));
+        ASSERT_EQ(&x2(i, j), &xm(i, 2 + j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_n1 =
         Kokkos::subview(xm, std::pair<int, int>(1, 1), Kokkos::ALL);
 
-    ASSERT_TRUE(x2_n1.extent(0) == 0);
-    ASSERT_TRUE(x2_n1.extent(1) == xm.extent(1));
+    ASSERT_EQ(x2_n1.extent(0), 0);
+    ASSERT_EQ(x2_n1.extent(1), xm.extent(1));
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_n2 =
         Kokkos::subview(xm, Kokkos::ALL, std::pair<int, int>(1, 1));
 
-    ASSERT_TRUE(x2_n2.extent(0) == xm.extent(0));
-    ASSERT_TRUE(x2_n2.extent(1) == 0);
+    ASSERT_EQ(x2_n2.extent(0), xm.extent(0));
+    ASSERT_EQ(x2_n2.extent(1), 0);
   }
 }
 
@@ -979,7 +977,7 @@ struct CheckSubviewCorrectness_1D_1D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_1D_1D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1005,7 +1003,7 @@ struct CheckSubviewCorrectness_1D_2D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_1D_2D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1033,7 +1031,7 @@ struct CheckSubviewCorrectness_2D_3D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_2D_3D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1068,7 +1066,7 @@ struct CheckSubviewCorrectness_3D_3D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_3D_3D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1107,7 +1105,7 @@ struct CheckSubviewCorrectness_3D_4D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_3D_4D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1165,7 +1163,7 @@ struct CheckSubviewCorrectness_3D_5D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_3D_5D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/unit_test/TestView_64bit.hpp b/lib/kokkos/core/unit_test/TestView_64bit.hpp
index 50626718b5..174a07ac1d 100644
--- a/lib/kokkos/core/unit_test/TestView_64bit.hpp
+++ b/lib/kokkos/core/unit_test/TestView_64bit.hpp
@@ -49,9 +49,9 @@ namespace Test {
 template <class Device>
 void test_64bit() {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-  // FIXME_SYCL The SYCL CUDA backend throws an error
+  // We are running out of device memory on Intel GPUs
 #ifdef KOKKOS_ENABLE_SYCL
-  int64_t N = 1000000000;
+  int64_t N = 4000000000;
 #else
   int64_t N = 5000000000;
 #endif
@@ -60,7 +60,7 @@ void test_64bit() {
     Kokkos::parallel_reduce(
         Kokkos::RangePolicy<typename Device::execution_space,
                             Kokkos::IndexType<int64_t>>(0, N),
-        KOKKOS_LAMBDA(const int64_t& /*i*/, int64_t& lsum) { lsum += 1; }, sum);
+        KOKKOS_LAMBDA(const int64_t&, int64_t& lsum) { lsum += 1; }, sum);
     ASSERT_EQ(N, sum);
   }
   {
@@ -111,7 +111,12 @@ void test_64bit() {
     ASSERT_EQ(N0 * N1, sum);
   }
   {
-    int N0    = 1024 * 1024 * 1500;
+// We are running out of device memory on Intel GPUs
+#ifdef KOKKOS_ENABLE_SYCL
+    int64_t N0 = 1024 * 1024 * 900;
+#else
+    int N0 = 1024 * 1024 * 1500;
+#endif
     int64_t P = 1713091;
     Kokkos::View<int*, Device> a("A", N0);
     Kokkos::parallel_for(
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
similarity index 89%
rename from lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp
rename to lib/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
index 316a2b5d0f..0287829fd6 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp
+++ b/lib/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
@@ -42,5 +42,12 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_a.hpp>
+#ifndef KOKKOS_TEST_SYCL_HOST_USM_SPACE_HPP
+#define KOKKOS_TEST_SYCL_HOST_USM_SPACE_HPP
+
+#include <gtest/gtest.h>
+
+#define TEST_CATEGORY sycl_host_usm
+#define TEST_EXECSPACE Kokkos::Experimental::SYCLHostUSMSpace
+
+#endif
diff --git a/lib/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp b/lib/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp
similarity index 100%
rename from lib/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp
rename to lib/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp
deleted file mode 100644
index bab29610a3..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_d.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp
deleted file mode 100644
index fd227186d5..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_e.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp
deleted file mode 100644
index 669761df97..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewCopy_a.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp
deleted file mode 100644
index d367fd7e05..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewCopy_b.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
deleted file mode 100644
index 01b284b2f5..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewMapping_a.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
deleted file mode 100644
index e15228b1d7..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewMapping_b.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
deleted file mode 100644
index 52bbd42f29..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewMapping_subview.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp
deleted file mode 100644
index 4aeac8f13f..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_a.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp
deleted file mode 100644
index e5cb010342..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_b.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp
deleted file mode 100644
index a52fcb833e..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_c.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp
deleted file mode 100644
index e345cd9667..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_d.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp
deleted file mode 100644
index 61547df4f5..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_e.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp
deleted file mode 100644
index 75a769bb94..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewCopy_a.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp
deleted file mode 100644
index 7d09f5c9f3..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewCopy_b.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
deleted file mode 100644
index ea03f43bd6..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewMapping_a.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
deleted file mode 100644
index 1f754e8f49..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewMapping_b.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp b/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
deleted file mode 100644
index 4af7057d2a..0000000000
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewMapping_subview.hpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
index ee7181e118..d09d4edfda 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
@@ -60,7 +60,7 @@ __global__ void offset(int* p) {
 // Cuda.
 TEST(cuda, raw_cuda_interop) {
   int* p;
-  CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100));
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
 
@@ -70,11 +70,11 @@ TEST(cuda, raw_cuda_interop) {
   Kokkos::finalize();
 
   offset<<<100, 64>>>(p);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
 
   std::array<int, 100> h_p;
   cudaMemcpy(h_p.data(), p, sizeof(int) * 100, cudaMemcpyDefault);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
@@ -83,6 +83,6 @@ TEST(cuda, raw_cuda_interop) {
   }
 
   ASSERT_EQ(sum, sum_expect);
-  CUDA_SAFE_CALL(cudaFree(p));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p));
 }
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
index 526b985c00..13388b4c54 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
@@ -99,12 +99,12 @@ TEST(cuda, raw_cuda_streams) {
   }
   Kokkos::finalize();
   offset_streams<<<100, 64, 0, stream>>>(p);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
   cudaStreamDestroy(stream);
 
   int h_p[100];
   cudaMemcpy(h_p, p, sizeof(int) * 100, cudaMemcpyDefault);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
index 646b379086..2fa61d4312 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
@@ -181,37 +181,33 @@ TEST(cuda, space_access) {
   //--------------------------------------
 
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                        Kokkos::HostSpace>::accessible,
+      !Kokkos::SpaceAccessibility<Kokkos::Cuda, Kokkos::HostSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                                 Kokkos::CudaSpace>::accessible,
+  static_assert(
+      Kokkos::SpaceAccessibility<Kokkos::Cuda, Kokkos::CudaSpace>::accessible,
+      "");
+
+  static_assert(Kokkos::SpaceAccessibility<Kokkos::Cuda,
+                                           Kokkos::CudaUVMSpace>::accessible,
                 "");
 
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                       Kokkos::CudaUVMSpace>::accessible,
+      Kokkos::SpaceAccessibility<Kokkos::Cuda,
+                                 Kokkos::CudaHostPinnedSpace>::accessible,
       "");
 
-  static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                       Kokkos::CudaHostPinnedSpace>::accessible,
-      "");
+  static_assert(!Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                            Kokkos::CudaSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                           Kokkos::CudaUVMSpace>::accessible,
+                "");
 
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,
-                                        Kokkos::CudaSpace>::accessible,
-      "");
-
-  static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,
-                                       Kokkos::CudaUVMSpace>::accessible,
-      "");
-
-  static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,
-                                       Kokkos::CudaHostPinnedSpace>::accessible,
+      Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 Kokkos::CudaHostPinnedSpace>::accessible,
       "");
 
   static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
@@ -235,23 +231,23 @@ TEST(cuda, space_access) {
                                             Kokkos::CudaUVMSpace>>::value,
                 "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
-                    Kokkos::Impl::HostMirror<Kokkos::Cuda>::Space,
-                    Kokkos::HostSpace>::accessible,
-                "");
+  static_assert(
+      Kokkos::SpaceAccessibility<Kokkos::Impl::HostMirror<Kokkos::Cuda>::Space,
+                                 Kokkos::HostSpace>::accessible,
+      "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
                     Kokkos::HostSpace>::accessible,
                 "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space,
                     Kokkos::HostSpace>::accessible,
                 "");
 
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<
+      Kokkos::SpaceAccessibility<
           Kokkos::Impl::HostMirror<Kokkos::CudaHostPinnedSpace>::Space,
           Kokkos::HostSpace>::accessible,
       "");
@@ -265,8 +261,8 @@ TEST(cuda, space_access) {
 
 TEST(cuda, uvm) {
   if (Kokkos::CudaUVMSpace::available()) {
-    int *uvm_ptr = (int *)Kokkos::kokkos_malloc<Kokkos::CudaUVMSpace>(
-        "uvm_ptr", sizeof(int));
+    int *uvm_ptr = static_cast<int *>(
+        Kokkos::kokkos_malloc<Kokkos::CudaUVMSpace>("uvm_ptr", sizeof(int)));
 
     *uvm_ptr = 42;
 
diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
index 5dcbe566e2..6d6ff0a67b 100644
--- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
+++ b/lib/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
@@ -59,17 +59,17 @@ TEST(TEST_CATEGORY, host_space_access) {
   using mirror_space =
       Kokkos::Impl::HostMirror<Kokkos::DefaultExecutionSpace>::Space;
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<host_exec_space,
-                                                 Kokkos::HostSpace>::accessible,
+  static_assert(Kokkos::SpaceAccessibility<host_exec_space,
+                                           Kokkos::HostSpace>::accessible,
                 "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<device_space,
-                                                 Kokkos::HostSpace>::accessible,
-                "");
+  static_assert(
+      Kokkos::SpaceAccessibility<device_space, Kokkos::HostSpace>::accessible,
+      "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<mirror_space,
-                                                 Kokkos::HostSpace>::accessible,
-                "");
+  static_assert(
+      Kokkos::SpaceAccessibility<mirror_space, Kokkos::HostSpace>::accessible,
+      "");
 }
 
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
index bcd49e69bd..c74090fff9 100644
--- a/lib/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
+++ b/lib/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
@@ -54,12 +54,13 @@
 namespace Test {
 
 TEST(defaultdevicetype, malloc) {
-  int* data = (int*)Kokkos::kokkos_malloc(100 * sizeof(int));
-  ASSERT_NO_THROW(data = (int*)Kokkos::kokkos_realloc(data, 120 * sizeof(int)));
+  int* data = static_cast<int*>(Kokkos::kokkos_malloc(100 * sizeof(int)));
+  ASSERT_NO_THROW(data = static_cast<int*>(
+                      Kokkos::kokkos_realloc(data, 120 * sizeof(int))));
   Kokkos::kokkos_free(data);
 
-  int* data2 = (int*)Kokkos::kokkos_malloc(0);
-  ASSERT_TRUE(data2 == nullptr);
+  int* data2 = static_cast<int*>(Kokkos::kokkos_malloc(0));
+  ASSERT_EQ(data2, nullptr);
   Kokkos::kokkos_free(data2);
 }
 
diff --git a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp
deleted file mode 100644
index 02157836b3..0000000000
--- a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_a.hpp>
diff --git a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp
deleted file mode 100644
index 80e2fe3f93..0000000000
--- a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_b.hpp>
diff --git a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp b/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp
deleted file mode 100644
index 9694e33ca0..0000000000
--- a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_c.hpp>
diff --git a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp b/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp
deleted file mode 100644
index 0d773494ac..0000000000
--- a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_d.hpp>
diff --git a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp b/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp
deleted file mode 100644
index cbbbc810b0..0000000000
--- a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_e.hpp>
diff --git a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp b/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp
deleted file mode 100644
index 444a3e6e95..0000000000
--- a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewCopy_a.hpp>
diff --git a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp b/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp
deleted file mode 100644
index f1f90e7acf..0000000000
--- a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewCopy_b.hpp>
diff --git a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp b/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp
deleted file mode 100644
index 5e83121e34..0000000000
--- a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewMapping_a.hpp>
diff --git a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp b/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp
deleted file mode 100644
index c024143d6c..0000000000
--- a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewMapping_b.hpp>
diff --git a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp b/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp
deleted file mode 100644
index dcd6c1dc43..0000000000
--- a/lib/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewMapping_subview.hpp>
diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
index 0a243e0e8e..854f916ba3 100644
--- a/lib/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
+++ b/lib/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
@@ -66,8 +66,8 @@ struct TestAsyncLauncher {
 
 TEST(hip, async_launcher) {
   size_t *flag;
-  HIP_SAFE_CALL(hipMalloc(&flag, sizeof(size_t)));
-  HIP_SAFE_CALL(hipMemset(flag, 0, sizeof(size_t)));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&flag, sizeof(size_t)));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(flag, 0, sizeof(size_t)));
   // launch # of cycles * 1000 kernels w/ distinct values
   auto space        = Kokkos::Experimental::HIP();
   auto instance     = space.impl_internal_space_instance();
@@ -80,10 +80,10 @@ TEST(hip, async_launcher) {
   // the sum below should fail
   instance->fence();
   size_t h_flag;
-  HIP_SAFE_CALL(
+  KOKKOS_IMPL_HIP_SAFE_CALL(
       hipMemcpy(&h_flag, flag, sizeof(size_t), hipMemcpyHostToDevice));
   ASSERT_EQ(h_flag, (nkernels * (nkernels - 1)) / 2);
-  HIP_SAFE_CALL(hipFree(flag));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(flag));
 }
 
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_BlocksizeDeduction.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_BlocksizeDeduction.cpp
new file mode 100644
index 0000000000..f382e5b568
--- /dev/null
+++ b/lib/kokkos/core/unit_test/hip/TestHIP_BlocksizeDeduction.cpp
@@ -0,0 +1,99 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestHIP_Category.hpp>
+
+namespace Test {
+
+struct TestNone {
+  Kokkos::View<size_t*, TEST_EXECSPACE> view;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const { view(i) = i; }
+
+  TestNone() { view = Kokkos::View<size_t*, TEST_EXECSPACE>("dummy", 1); }
+};
+
+struct TestSpiller {
+  Kokkos::View<size_t*, TEST_EXECSPACE> view;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    size_t array[1000] = {0};
+    // and update flag
+    size_t value = 0;
+    for (int ii = i; ii < 1000; ++ii) {
+      array[ii] = value;
+      value += ii;
+    }
+    for (int ii = i; ii < 1000; ++ii) {
+      value *= array[ii];
+    }
+    Kokkos::atomic_add(&view[0], value);
+  }
+
+  TestSpiller() { view = Kokkos::View<size_t*, TEST_EXECSPACE>("dummy", 1); }
+};
+
+TEST(hip, preferred_blocksize_deduction) {
+  using execution_space =
+      typename Kokkos::Impl::FunctorPolicyExecutionSpace<TestSpiller,
+                                                         void>::execution_space;
+  using policy = Kokkos::RangePolicy<execution_space>;
+
+  {
+    using DriverType = Kokkos::Impl::ParallelFor<TestNone, policy>;
+    ASSERT_TRUE(Kokkos::Experimental::Impl::HIPParallelLaunch<
+                    DriverType>::get_scratch_size() == 0);
+  }
+
+  {
+    using DriverType = Kokkos::Impl::ParallelFor<TestSpiller, policy>;
+    ASSERT_TRUE(Kokkos::Experimental::Impl::HIPParallelLaunch<
+                    DriverType>::get_scratch_size() > 0);
+  }
+}
+
+}  // namespace Test
diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
index 3a76ca148c..73d08abca9 100644
--- a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
+++ b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
@@ -60,7 +60,7 @@ __global__ void offset(int* p) {
 // HIP.
 TEST(hip, raw_hip_interop) {
   int* p;
-  HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
 
@@ -70,11 +70,12 @@ TEST(hip, raw_hip_interop) {
   Kokkos::finalize();
 
   offset<<<dim3(100), dim3(100), 0, nullptr>>>(p);
-  HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
 
   std::array<int, 100> h_p;
-  HIP_SAFE_CALL(hipMemcpy(h_p.data(), p, sizeof(int) * 100, hipMemcpyDefault));
-  HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(
+      hipMemcpy(h_p.data(), p, sizeof(int) * 100, hipMemcpyDefault));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
@@ -83,6 +84,6 @@ TEST(hip, raw_hip_interop) {
   }
 
   ASSERT_EQ(sum, sum_expect);
-  HIP_SAFE_CALL(hipFree(p));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(p));
 }
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
index 8e0880ddbd..69ca62df6a 100644
--- a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
+++ b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
@@ -51,11 +51,11 @@ namespace Test {
 // bound in HIP due to an error when computing the block size.
 TEST(hip, raw_hip_streams) {
   hipStream_t stream;
-  HIP_SAFE_CALL(hipStreamCreate(&stream));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
   int* p;
-  HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
   using MemorySpace = typename TEST_EXECSPACE::memory_space;
 
   {
@@ -97,12 +97,13 @@ TEST(hip, raw_hip_streams) {
   }
   Kokkos::finalize();
   offset_streams<<<100, 64, 0, stream>>>(p);
-  HIP_SAFE_CALL(hipDeviceSynchronize());
-  HIP_SAFE_CALL(hipStreamDestroy(stream));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream));
 
   int h_p[100];
-  HIP_SAFE_CALL(hipMemcpy(h_p, p, sizeof(int) * 100, hipMemcpyDefault));
-  HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(
+      hipMemcpy(h_p, p, sizeof(int) * 100, hipMemcpyDefault));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
index ae1de8ea2d..d20ea877ec 100644
--- a/lib/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
+++ b/lib/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
@@ -129,27 +129,26 @@ TEST(hip, space_access) {
 
   //--------------------------------------
 
-  static_assert(
-      !Kokkos::Impl::SpaceAccessibility<Kokkos::Experimental::HIP,
-                                        Kokkos::HostSpace>::accessible,
-      "");
-
-  static_assert(Kokkos::Impl::SpaceAccessibility<
-                    Kokkos::Experimental::HIP,
-                    Kokkos::Experimental::HIPSpace>::accessible,
+  static_assert(!Kokkos::SpaceAccessibility<Kokkos::Experimental::HIP,
+                                            Kokkos::HostSpace>::accessible,
                 "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(
+      Kokkos::SpaceAccessibility<Kokkos::Experimental::HIP,
+                                 Kokkos::Experimental::HIPSpace>::accessible,
+      "");
+
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Experimental::HIP,
                     Kokkos::Experimental::HIPHostPinnedSpace>::accessible,
                 "");
 
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, Kokkos::Experimental::HIPSpace>::accessible,
+      !Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                  Kokkos::Experimental::HIPSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::HostSpace,
                     Kokkos::Experimental::HIPHostPinnedSpace>::accessible,
                 "");
@@ -166,18 +165,18 @@ TEST(hip, space_access) {
                    Kokkos::Experimental::HIPHostPinnedSpace>::value,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<Kokkos::Experimental::HIP>::Space,
                     Kokkos::HostSpace>::accessible,
                 "");
 
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<
+      Kokkos::SpaceAccessibility<
           Kokkos::Impl::HostMirror<Kokkos::Experimental::HIPSpace>::Space,
           Kokkos::HostSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<
                         Kokkos::Experimental::HIPHostPinnedSpace>::Space,
                     Kokkos::HostSpace>::accessible,
diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
index db360a99d3..86b2fab3c7 100644
--- a/lib/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
+++ b/lib/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
@@ -104,7 +104,7 @@ void hip_stream_scratch_test(
   hipStream_t stream[4];
   Kokkos::Experimental::HIP hip[4];
   for (int i = 0; i < K; i++) {
-    HIP_SAFE_CALL(hipStreamCreate(&stream[i]));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream[i]));
     hip[i] = Kokkos::Experimental::HIP(stream[i]);
   }
 // Test that growing scratch size in subsequent calls doesn't crash things
@@ -131,7 +131,7 @@ void hip_stream_scratch_test(
   Kokkos::fence();
   for (int i = 0; i < K; i++) {
     hip[i] = Kokkos::Experimental::HIP();
-    HIP_SAFE_CALL(hipStreamDestroy(stream[i]));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream[i]));
   }
 }
 }  // namespace Impl
diff --git a/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp b/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp
index 419486d7a8..4d5ca46ba6 100644
--- a/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp
+++ b/lib/kokkos/core/unit_test/incremental/Test01_execspace.hpp
@@ -88,7 +88,7 @@ struct TestIncrExecSpace {
     ExecSpace().fence();
 
     auto concurrency = ExecSpace().concurrency();
-    ASSERT_TRUE(concurrency > 0);
+    ASSERT_GT(concurrency, 0);
 
     int in_parallel = ExecSpace::in_parallel();
     ASSERT_FALSE(in_parallel);
@@ -107,5 +107,7 @@ TEST(TEST_CATEGORY, IncrTest_01_execspace) {
   ASSERT_TRUE(Kokkos::is_execution_space<TEST_EXECSPACE>::value);
   ASSERT_FALSE(Kokkos::is_execution_space<
                TestIncrExecSpaceTypedef<TEST_EXECSPACE>>::value);
+  TestIncrExecSpace<TEST_EXECSPACE> test;
+  test.testit();
 }
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp b/lib/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp
index ff4fb6a89f..d40cb4dbe7 100644
--- a/lib/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp
+++ b/lib/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp
@@ -78,7 +78,7 @@ struct TestIncrAtomic {
   }
 };
 
-TEST(TEST_CATEGORY, IncrTest_01_AtomicExchange) {
+TEST(TEST_CATEGORY, IncrTest_02_AtomicExchange) {
   TestIncrAtomic test;
   test.testExchange();
 }
diff --git a/lib/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp b/lib/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp
index 4adf9e058f..4192d4abe8 100644
--- a/lib/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp
+++ b/lib/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp
@@ -94,16 +94,16 @@ struct TestMDRangePolicy {
   using int_index = Kokkos::IndexType<int>;
 
   // An MDRangePolicy for 2 nested loops
-  using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<2>, int_index>;
+  using MDPolicyType_2D =
+      typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, int_index>;
 
   // An MDRangePolicy for 3 nested loops
-  using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<3>, int_index>;
+  using MDPolicyType_3D =
+      typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, int_index>;
 
   // An MDRangePolicy for 4 nested loops
-  using MDPolicyType_4D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<4>, int_index>;
+  using MDPolicyType_4D =
+      typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, int_index>;
 
   // Device and Host Data structure pointer
   value_type *deviceData, *hostData;
diff --git a/lib/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp b/lib/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp
index 5166f5a9f0..6e8fc07b8d 100644
--- a/lib/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp
+++ b/lib/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp
@@ -61,17 +61,17 @@ const int M      = 10;
 template <class ExecSpace>
 struct TestMDRangePolicy {
   // 2D View
-  using View_2D      = typename Kokkos::View<value_type **, ExecSpace>;
+  using View_2D      = Kokkos::View<value_type **, ExecSpace>;
   using Host_View_2D = typename View_2D::HostMirror;
   Host_View_2D hostDataView_2D;
 
   // 3D View
-  using View_3D      = typename Kokkos::View<value_type ***, ExecSpace>;
+  using View_3D      = Kokkos::View<value_type ***, ExecSpace>;
   using Host_View_3D = typename View_3D::HostMirror;
   Host_View_3D hostDataView_3D;
 
   // 4D View
-  using View_4D      = typename Kokkos::View<value_type ****, ExecSpace>;
+  using View_4D      = Kokkos::View<value_type ****, ExecSpace>;
   using Host_View_4D = typename View_4D::HostMirror;
   Host_View_4D hostDataView_4D;
 
@@ -83,16 +83,16 @@ struct TestMDRangePolicy {
   using int_index = Kokkos::IndexType<int>;
 
   // An MDRangePolicy for 2 nested loops
-  using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<2>, int_index>;
+  using MDPolicyType_2D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, int_index>;
 
   // An MDRangePolicy for 3 nested loops
-  using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<3>, int_index>;
+  using MDPolicyType_3D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, int_index>;
 
   // An MDRangePolicy for 4 nested loops
-  using MDPolicyType_4D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<4>, int_index>;
+  using MDPolicyType_4D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, int_index>;
 
   // compare and equal
   void compare_equal_2D() {
diff --git a/lib/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp b/lib/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
index 5bf1860d8e..ab1cd90d4b 100644
--- a/lib/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
+++ b/lib/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
@@ -74,9 +74,15 @@ struct ThreadScratch {
     for (int i = 0; i < sY; ++i) v_S(i) = 0;
 
     Kokkos::parallel_for(Kokkos::TeamThreadRange(team, sX), [&](const int m) {
+    // FIXME_SYCL This deadlocks in the subgroup_barrier when running on CUDA
+    // devices.
+#ifdef KOKKOS_ENABLE_SYCL
+      for (int k = 0; k < sY; ++k) v_S(k) += sX * sY * n + sY * m + k;
+#else
       Kokkos::parallel_for(
           Kokkos::ThreadVectorRange(team, sY),
           [&](const int k) { v_S(k) += sX * sY * n + sY * m + k; });
+#endif
     });
 
     team.team_barrier();
@@ -93,7 +99,7 @@ struct ThreadScratch {
     int scratchSize = scratch_t::shmem_size(sY);
     // So this works with deprecated code enabled:
     policy_t policy =
-        policy_t(pN, Kokkos::AUTO)
+        policy_t(pN, Kokkos::AUTO, 1)
             .set_scratch_size(scratch_level, Kokkos::PerThread(scratchSize));
 
     int max_team_size = policy.team_size_max(*this, Kokkos::ParallelForTag());
diff --git a/lib/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp b/lib/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
index b34f652e76..d81822d0da 100644
--- a/lib/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
+++ b/lib/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
@@ -68,7 +68,7 @@ struct TeamScratch {
 
     Kokkos::parallel_for(
         "Team",
-        policy_t(pN, Kokkos::AUTO)
+        policy_t(pN, Kokkos::AUTO, 1)
             .set_scratch_size(scratch_level, Kokkos::PerTeam(scratchSize)),
         KOKKOS_LAMBDA(const team_t &team) {
           // Allocate and use scratch pad memory
@@ -77,11 +77,20 @@ struct TeamScratch {
 
           Kokkos::parallel_for(
               Kokkos::TeamThreadRange(team, sX), [&](const int m) {
+      // FIXME_SYCL This deadlocks in the subgroup_barrier
+      // when running on CUDA devices.
+#ifdef KOKKOS_ENABLE_SYCL
+                for (int k = 0; k < sY; ++k) {
+                  v_S(m, k) =
+                      v_S.extent(0) * v_S.extent(1) * n + v_S.extent(1) * m + k;
+                }
+#else
                 Kokkos::parallel_for(
                     Kokkos::ThreadVectorRange(team, sY), [&](const int k) {
                       v_S(m, k) = v_S.extent(0) * v_S.extent(1) * n +
                                   v_S.extent(1) * m + k;
                     });
+#endif
               });
 
           team.team_barrier();
diff --git a/lib/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp b/lib/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
index d227e834dc..7d53b9fb20 100644
--- a/lib/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
+++ b/lib/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
@@ -82,20 +82,20 @@ struct MyComplex {
 template <class ExecSpace>
 struct TestMDRangeReduce {
   // 1D  View of double
-  using View_1D = typename Kokkos::View<value_type*, ExecSpace>;
+  using View_1D = Kokkos::View<value_type*, ExecSpace>;
 
   // 2D  View of double
-  using View_2D = typename Kokkos::View<value_type**, ExecSpace>;
+  using View_2D = Kokkos::View<value_type**, ExecSpace>;
 
   // Index Type for the iterator
   using int_index = Kokkos::IndexType<int>;
 
   // An MDRangePolicy for 2 nested loops
-  using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<2>, int_index>;
+  using MDPolicyType_2D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, int_index>;
 
   //  1D - complex View
-  using Complex_View_1D = typename Kokkos::View<MyComplex*, ExecSpace>;
+  using Complex_View_1D = Kokkos::View<MyComplex*, ExecSpace>;
 
   // Reduction when ExecPolicy = MDRangePolicy and ReducerArgument =
   // scalar/1-element view
@@ -176,7 +176,11 @@ struct TestMDRangeReduce {
 TEST(TEST_CATEGORY, incr_14_MDrangeReduce) {
   TestMDRangeReduce<TEST_EXECSPACE> test;
   test.reduce_MDRange();
+// FIXME_OPENMPTARGET: custom reductions are not yet supported in the
+// OpenMPTarget backend.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
   test.reduce_custom();
+#endif
 }
 
 }  // namespace Test
diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
index 018855963d..d145d69d9e 100644
--- a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
+++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
@@ -52,13 +52,16 @@ namespace Test {
 // Test whether allocations survive Kokkos initialize/finalize if done via Raw
 // SYCL.
 TEST(sycl, raw_sycl_interop) {
-  sycl::default_selector device_selector;
-  sycl::queue queue(device_selector);
-  constexpr int n = 100;
-  int* p          = sycl::malloc_device<int>(n, queue);
-
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
+
+  Kokkos::Experimental::SYCL default_space;
+  sycl::context default_context = default_space.sycl_context();
+
+  sycl::default_selector device_selector;
+  sycl::queue queue(default_context, device_selector);
+  constexpr int n = 100;
+  int* p          = sycl::malloc_device<int>(n, queue);
   {
     TEST_EXECSPACE space(queue);
     Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, n);
diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp
new file mode 100644
index 0000000000..91fdaac6e0
--- /dev/null
+++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp
@@ -0,0 +1,356 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestSYCL_Category.hpp>
+
+namespace Test {
+
+TEST(sycl, space_access) {
+  static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
+                                                Kokkos::HostSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  //--------------------------------------
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                                       Kokkos::HostSpace>::assignable,
+      "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                                       Kokkos::HostSpace>::accessible,
+      "");
+
+  //--------------------------------------
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                                       Kokkos::HostSpace>::assignable,
+      "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                                       Kokkos::HostSpace>::accessible,
+      "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  //--------------------------------------
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                                       Kokkos::HostSpace>::assignable,
+      "");
+
+  static_assert(
+      Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                                      Kokkos::HostSpace>::accessible,
+      "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  //--------------------------------------
+
+  static_assert(!Kokkos::SpaceAccessibility<Kokkos::Experimental::SYCL,
+                                            Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Experimental::SYCL,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Experimental::SYCL,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Experimental::SYCL,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(!Kokkos::SpaceAccessibility<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(
+      std::is_same<Kokkos::Impl::HostMirror<
+                       Kokkos::Experimental::SYCLDeviceUSMSpace>::Space,
+                   Kokkos::HostSpace>::value,
+      "");
+
+  static_assert(
+      std::is_same<
+          Kokkos::Impl::HostMirror<
+              Kokkos::Experimental::SYCLSharedUSMSpace>::Space,
+          Kokkos::Device<Kokkos::HostSpace::execution_space,
+                         Kokkos::Experimental::SYCLSharedUSMSpace>>::value,
+      "");
+
+  static_assert(
+      Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                                      Kokkos::HostSpace>::accessible,
+      "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(std::is_same<Kokkos::Impl::HostMirror<
+                                 Kokkos::Experimental::SYCLHostUSMSpace>::Space,
+                             Kokkos::Experimental::SYCLHostUSMSpace>::value,
+                "");
+
+  static_assert(
+      std::is_same<
+          Kokkos::Device<Kokkos::HostSpace::execution_space,
+                         Kokkos::Experimental::SYCLSharedUSMSpace>,
+          Kokkos::Device<Kokkos::HostSpace::execution_space,
+                         Kokkos::Experimental::SYCLSharedUSMSpace>>::value,
+      "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<Kokkos::Experimental::SYCL>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<
+                        Kokkos::Experimental::SYCLDeviceUSMSpace>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<
+                        Kokkos::Experimental::SYCLSharedUSMSpace>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<
+                        Kokkos::Experimental::SYCLHostUSMSpace>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+}
+
+TEST(sycl, uvm) {
+  int *uvm_ptr = static_cast<int *>(
+      Kokkos::kokkos_malloc<Kokkos::Experimental::SYCLSharedUSMSpace>(
+          "uvm_ptr", sizeof(int)));
+
+  *uvm_ptr = 42;
+
+  Kokkos::Experimental::SYCL().fence();
+  Kokkos::parallel_for(
+      Kokkos::RangePolicy<Kokkos::Experimental::SYCL>(0, 1),
+      KOKKOS_LAMBDA(int) {
+        if (*uvm_ptr == 42) {
+          *uvm_ptr = 2 * 42;
+        }
+      });
+  Kokkos::Experimental::SYCL().fence();
+
+  EXPECT_EQ(*uvm_ptr, int(2 * 42));
+
+  Kokkos::kokkos_free<Kokkos::Experimental::SYCLSharedUSMSpace>(uvm_ptr);
+}
+
+template <class MemSpace, class ExecSpace>
+struct TestViewSYCLAccessible {
+  enum { N = 1000 };
+
+  using V = Kokkos::View<double *, MemSpace>;
+
+  V m_base;
+
+  struct TagInit {};
+  struct TagTest {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagInit &, const int i) const { m_base[i] = i + 1; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagTest &, const int i, long &error_count) const {
+    if (m_base[i] != i + 1) ++error_count;
+  }
+
+  TestViewSYCLAccessible() : m_base("base", N) {}
+
+  static void run() {
+    TestViewSYCLAccessible self;
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<typename MemSpace::execution_space, TagInit>(0, N),
+        self);
+    typename MemSpace::execution_space().fence();
+
+    // Next access is a different execution space, must complete prior kernel.
+    long error_count = -1;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, TagTest>(0, N), self,
+                            error_count);
+    EXPECT_EQ(error_count, 0);
+  }
+};
+
+TEST(sycl, impl_view_accessible) {
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                         Kokkos::Experimental::SYCL>::run();
+
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::Experimental::SYCL>::run();
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::HostSpace::execution_space>::run();
+
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::Experimental::SYCL>::run();
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::HostSpace::execution_space>::run();
+}
+
+}  // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp
similarity index 96%
rename from lib/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
rename to lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp
index 6602d7396a..95a7b68088 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
+++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp
@@ -1,3 +1,4 @@
+
 /*
 //@HEADER
 // ************************************************************************
@@ -42,5 +43,5 @@
 //@HEADER
 */
 
-#include <TestCudaUVM_Category.hpp>
-#include <TestSharedAlloc.hpp>
+#include <TestSYCL_Category.hpp>
+#include <TestTaskScheduler.hpp>
diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
new file mode 100644
index 0000000000..ab0d09880f
--- /dev/null
+++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
@@ -0,0 +1,154 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <TestSYCL_Category.hpp>
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace Impl {
+
+struct SYCLQueueScratchTestFunctor {
+  using team_t = Kokkos::TeamPolicy<Kokkos::Experimental::SYCL>::member_type;
+  using scratch_t =
+      Kokkos::View<int64_t*, Kokkos::Experimental::SYCL::scratch_memory_space>;
+
+  Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace,
+               Kokkos::MemoryTraits<Kokkos::Atomic>>
+      counter;
+  int N, M;
+  SYCLQueueScratchTestFunctor(
+      Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter_,
+      int N_, int M_)
+      : counter(counter_), N(N_), M(M_) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const team_t& team) const {
+    scratch_t scr(team.team_scratch(1), M);
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M),
+                         [&](int i) { scr[i] = 0; });
+    team.team_barrier();
+    for (int i = 0; i < N; i++) {
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M),
+                           [&](int j) { scr[j] += 1; });
+    }
+    team.team_barrier();
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M), [&](int i) {
+      if (scr[i] != N) counter()++;
+    });
+  }
+};
+
+void sycl_queue_scratch_test_one(
+    int N, int T, int M_base,
+    Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter,
+    Kokkos::Experimental::SYCL sycl, int tid) {
+  int M = M_base + tid * 5;
+  Kokkos::TeamPolicy<Kokkos::Experimental::SYCL> p(sycl, T, 64);
+  using scratch_t =
+      Kokkos::View<int64_t*, Kokkos::Experimental::SYCL::scratch_memory_space>;
+
+  int bytes = scratch_t::shmem_size(M);
+
+  for (int r = 0; r < 15; r++) {
+    Kokkos::parallel_for("Run", p.set_scratch_size(1, Kokkos::PerTeam(bytes)),
+                         SYCLQueueScratchTestFunctor(counter, N, M));
+  }
+}
+
+void sycl_queue_scratch_test(
+    int N, int T, int M_base,
+    Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter) {
+  constexpr int K = 4;
+  Kokkos::Experimental::SYCL default_space;
+  sycl::context default_context = default_space.sycl_context();
+
+  sycl::default_selector device_selector;
+  sycl::queue queue(default_context, device_selector);
+
+  std::array<Kokkos::Experimental::SYCL, K> sycl;
+  for (int i = 0; i < K; i++) {
+    sycl[i] = Kokkos::Experimental::SYCL(
+        sycl::queue(default_context, device_selector));
+  }
+
+  // Test that growing scratch size in subsequent calls doesn't crash things
+#if defined(KOKKOS_ENABLE_OPENMP)
+#pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    // Limit how many threads submit
+    if (tid < 4) {
+      sycl_queue_scratch_test_one(N, T, M_base, counter, sycl[tid], tid);
+    }
+  }
+#else
+  for (int tid = 0; tid < K; tid++) {
+    sycl_queue_scratch_test_one(N, T, M_base, counter, sycl[tid], tid);
+  }
+#endif
+  // Test that if everything is large enough, multiple launches with different
+  // scratch sizes don't step on each other
+  for (int tid = K - 1; tid >= 0; tid--) {
+    sycl_queue_scratch_test_one(N, T, M_base, counter, sycl[tid], tid);
+  }
+
+  Kokkos::fence();
+}
+}  // namespace Impl
+
+TEST(sycl, team_scratch_1_queues) {
+  int N      = 1000000;
+  int T      = 10;
+  int M_base = 150;
+
+  Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter("C");
+
+  Impl::sycl_queue_scratch_test(N, T, M_base, counter);
+
+  int64_t result;
+  Kokkos::deep_copy(result, counter);
+  ASSERT_EQ(0, result);
+}
+}  // namespace Test
diff --git a/lib/kokkos/core/unit_test/tools/TestBuiltinTuners.cpp b/lib/kokkos/core/unit_test/tools/TestBuiltinTuners.cpp
new file mode 100644
index 0000000000..870621c1e0
--- /dev/null
+++ b/lib/kokkos/core/unit_test/tools/TestBuiltinTuners.cpp
@@ -0,0 +1,123 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#include <Kokkos_Core.hpp>
+using ExecSpace  = Kokkos::DefaultHostExecutionSpace;
+using TeamMember = Kokkos::TeamPolicy<ExecSpace>::member_type;
+struct TestTeamFunctor {
+  KOKKOS_FUNCTION void operator()(TeamMember) const {}
+};
+struct TestMDFunctor {
+  KOKKOS_FUNCTION void operator()(const int, const int) const {}
+};
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  {
+    Kokkos::TeamPolicy<ExecSpace> teamp(1, Kokkos::AUTO, Kokkos::AUTO);
+    Kokkos::MDRangePolicy<Kokkos::Rank<2>> mdp({0, 0}, {1, 1});
+    Kokkos::Tools::Experimental::TeamSizeTuner team_tune_this(
+        "team_tuner", teamp, TestTeamFunctor{}, Kokkos::ParallelForTag{},
+        Kokkos::Tools::Impl::Impl::SimpleTeamSizeCalculator{});
+
+    Kokkos::Tools::Experimental::MDRangeTuner<2> md_tune_this(
+        "md_tuner", mdp, TestMDFunctor{}, Kokkos::ParallelForTag{},
+        Kokkos::Tools::Impl::Impl::SimpleTeamSizeCalculator{});
+
+    std::vector<int> options{1, 2, 3, 4, 5};
+
+    auto new_team_tuner = team_tune_this.combine("options", options);
+    auto new_md_tuner   = md_tune_this.combine("options", options);
+    using namespace Kokkos::Tools::Experimental;
+    VariableInfo info;
+    info.category      = StatisticalCategory::kokkos_value_categorical;
+    info.valueQuantity = CandidateValueType::kokkos_value_unbounded;
+    info.type          = ValueType::kokkos_value_string;
+    size_t input       = declare_input_type("kernel", info);
+    VariableValue team_kernel_value = make_variable_value(input, "abs");
+    VariableValue md_kernel_value   = make_variable_value(input, "abs");
+    size_t kernel_context           = get_new_context_id();
+    begin_context(kernel_context);
+    set_input_values(kernel_context, 1, &team_kernel_value);
+    for (int x = 0; x < 10000; ++x) {
+      auto config = new_md_tuner.begin();
+      int option  = std::get<0>(config);
+      (void)option;
+      int tile_x = std::get<1>(config);
+      int tile_y = std::get<2>(config);
+      Kokkos::parallel_for("mdrange",
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>(
+                               {0, 0}, {1, 1}, {tile_x, tile_y}),
+                           TestMDFunctor{});
+      new_md_tuner.end();
+    }
+    end_context(kernel_context);
+    begin_context(kernel_context);
+    set_input_values(kernel_context, 1, &md_kernel_value);
+
+    /**
+     * Note that 0.0 is basically a floating point index into
+     * the outermost index in this, which is the options vector
+     * above. The At 0.0, this will be the first element (1).
+     * At 0.9 this will be the last element (5)
+     */
+    auto begin_point = new_team_tuner.get_point(0.0, 0.0, 0.0);
+    assert(std::get<0>(begin_point) == 1);
+    (void)begin_point;  // to avoid warnings in some compilers
+    auto end_point = new_team_tuner.get_point(0.9, 0.0, 0.0);
+    (void)end_point;  // to avoid warnings in some compilers
+    assert(std::get<0>(end_point) == 5);
+    for (int x = 0; x < 10000; ++x) {
+      auto config = new_team_tuner.begin();
+      int option  = std::get<0>(config);
+      (void)option;
+      int team   = std::get<1>(config);
+      int vector = std::get<2>(config);
+      Kokkos::parallel_for("mdrange",
+                           Kokkos::TeamPolicy<ExecSpace>(1, team, vector),
+                           TestTeamFunctor{});
+      new_team_tuner.end();
+    }
+    end_context(kernel_context);
+  }
+  Kokkos::finalize();
+}
diff --git a/lib/kokkos/core/unit_test/tools/TestCategoricalTuner.cpp b/lib/kokkos/core/unit_test/tools/TestCategoricalTuner.cpp
new file mode 100644
index 0000000000..2177556d39
--- /dev/null
+++ b/lib/kokkos/core/unit_test/tools/TestCategoricalTuner.cpp
@@ -0,0 +1,86 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+// This file tests the categorical tuner
+
+#include <Kokkos_Core.hpp>
+#include <unistd.h>
+struct point {
+  float x;
+  float y;
+  float z;
+};
+void do_computation(const point& test_point) {
+  usleep(((unsigned int)test_point.x) * 100);
+}
+using namespace Kokkos::Tools::Experimental;
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  {
+    VariableInfo info;
+    info.category              = StatisticalCategory::kokkos_value_categorical;
+    info.valueQuantity         = CandidateValueType::kokkos_value_unbounded;
+    info.type                  = ValueType::kokkos_value_string;
+    size_t input               = declare_input_type("kernel", info);
+    VariableValue kernel_value = make_variable_value(input, "abs");
+    size_t kernel_context      = get_new_context_id();
+    begin_context(kernel_context);
+    set_input_values(kernel_context, 1, &kernel_value);
+
+    std::vector<point> points;
+    points.push_back({1.0, 1.0, 1.0});
+    points.push_back({10.0, 10.0, 10.0});
+    points.push_back({0.0, 0.0, 0.0});
+    auto tuner =
+        Kokkos::Tools::Experimental::make_categorical_tuner("points", points);
+    for (decltype(points)::size_type x = 0; x < 3000; ++x) {
+      point test_point = tuner.begin();
+      do_computation(test_point);
+      tuner.end();
+    }
+
+    end_context(kernel_context);
+  }
+  Kokkos::finalize();
+}
diff --git a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/tools/TestEventCorrectness.cpp
similarity index 94%
rename from lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp
rename to lib/kokkos/core/unit_test/tools/TestEventCorrectness.cpp
index 5eed2ca0d7..ac0b4d2619 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp
+++ b/lib/kokkos/core/unit_test/tools/TestEventCorrectness.cpp
@@ -42,5 +42,8 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_b.hpp>
+#include <iostream>
+#include "Kokkos_Core.hpp"
+
+#include <tools/TestEventCorrectness.hpp>
+#include "../UnitTestMainInit.cpp"
diff --git a/lib/kokkos/core/unit_test/tools/TestEventCorrectness.hpp b/lib/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
new file mode 100644
index 0000000000..430677a335
--- /dev/null
+++ b/lib/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
@@ -0,0 +1,284 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#include <iostream>
+#include <gtest/gtest.h>
+#include "Kokkos_Core.hpp"
+
+#include <impl/Kokkos_Stacktrace.hpp>
+#include <vector>
+#include <algorithm>
+namespace Kokkos {
+class Serial;
+class OpenMP;
+class Cuda;
+class Threads;
+namespace Experimental {
+class SYCL;
+class HIP;
+class OpenMPTarget;
+class HPX;
+}  // namespace Experimental
+}  // namespace Kokkos
+namespace Test {
+struct FencePayload {
+  std::string name;
+  enum distinguishable_devices { yes, no };
+  distinguishable_devices distinguishable;
+  uint32_t dev_id;
+};
+
+std::vector<FencePayload> found_payloads;
+template <typename Lambda>
+void expect_fence_events(std::vector<FencePayload>& expected, Lambda lam) {
+  found_payloads = {};
+  Kokkos::Tools::Experimental::set_begin_fence_callback(
+      [](const char* name, const uint32_t dev_id, uint64_t*) {
+        found_payloads.push_back(
+            FencePayload{std::string(name),
+                         FencePayload::distinguishable_devices::no, dev_id});
+      });
+  Kokkos::Tools::Experimental::set_begin_parallel_for_callback(
+      [](const char* name, const uint32_t dev_id, uint64_t*) {
+        found_payloads.push_back(
+            FencePayload{std::string(name),
+                         FencePayload::distinguishable_devices::no, dev_id});
+      });
+  lam();
+  for (auto& entry : expected) {
+    std::cout << "Ref: " << entry.dev_id << std::endl;
+    std::cout << "Ref: " << entry.name << std::endl;
+    auto search = std::find_if(
+        found_payloads.begin(), found_payloads.end(),
+        [&](const auto& found_entry) {
+          auto name_match =
+              (found_entry.name.find(entry.name) != std::string::npos);
+          auto id_match = (entry.dev_id == found_entry.dev_id);
+          std::cout << found_entry.dev_id << std::endl;
+          std::cout << found_entry.name << std::endl;
+          if (!name_match) {
+            std::cout << "Miss on name\n";
+          }
+          if (!id_match) {
+            std::cout << "Miss on id\n";
+          }
+          return (name_match && id_match);
+        });
+    auto found = (search != found_payloads.end());
+    ASSERT_TRUE(found);
+  }
+  Kokkos::Tools::Experimental::set_begin_fence_callback(
+      [](const char*, const uint32_t, uint64_t*) {});
+  Kokkos::Tools::Experimental::set_begin_parallel_for_callback(
+      [](const char*, const uint32_t, uint64_t*) {});
+}
+
+template <class>
+struct increment {
+  constexpr static const int size = 0;
+};
+int num_instances = 1;
+struct TestFunctor {
+  KOKKOS_FUNCTION void operator()(const int) const {}
+};
+template <typename Lambda>
+void test_wrapper(const Lambda& lambda) {
+  if (!std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Serial>::value) {
+    lambda();
+  }
+}
+/**
+ * Test that fencing an instance with a name yields a fence
+ * event of that name, and the correct device ID
+ */
+TEST(defaultdevicetype, test_named_instance_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{
+
+        {"named_instance", FencePayload::distinguishable_devices::no,
+         root + num_instances}};
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex;
+      ex.fence("named_instance");
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+/**
+ * Test that fencing an instance without a name yields a fence
+ * event of a correct name, and the correct device ID
+ */
+TEST(defaultdevicetype, test_unnamed_instance_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{
+
+        {"Unnamed Instance Fence", FencePayload::distinguishable_devices::no,
+         root + num_instances}};
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex;
+      ex.fence();
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+
+/**
+ * Test that invoking a global fence with a name yields a fence
+ * event of a correct name, and fences the root of the default device
+ */
+TEST(defaultdevicetype, test_named_global_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+
+    std::vector<FencePayload> expected{
+
+        {"test global fence", FencePayload::distinguishable_devices::no, root}};
+    expect_fence_events(expected,
+                        [=]() { Kokkos::fence("test global fence"); });
+  });
+}
+
+/**
+ * Test that invoking a global fence with no name yields a fence
+ * event of a correct name, and fences the root of the default device
+ */
+TEST(defaultdevicetype, test_unnamed_global_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+
+    std::vector<FencePayload> expected{
+
+        {"Unnamed Global Fence", FencePayload::distinguishable_devices::no,
+         root}};
+    expect_fence_events(expected, [=]() { Kokkos::fence(); });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+/**
+ * Test that creating two default instances and fencing both yields
+ * fence on the same device ID, as these should yield the same instance
+ */
+TEST(defaultdevicetype, test_multiple_default_instances) {
+  test_wrapper([&]() {
+    std::vector<FencePayload> expected{};
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex1;
+      Kokkos::DefaultExecutionSpace ex2;
+      ex1.fence("named_instance_fence_one");
+      ex2.fence("named_instance_fence_two");
+    });
+    ASSERT_TRUE(found_payloads[0].dev_id == found_payloads[1].dev_id);
+  });
+}
+
+/**
+ * Test that fencing and kernels yield events on the correct device ID's
+ */
+TEST(defaultdevicetype, test_kernel_sequence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{
+
+        {"named_instance", FencePayload::distinguishable_devices::no,
+         root + num_instances},
+        {"test_kernel", FencePayload::distinguishable_devices::no,
+         root + num_instances}
+
+    };
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex;
+      TestFunctor tf;
+      ex.fence("named_instance");
+      Kokkos::parallel_for(
+          "test_kernel",
+          Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(ex, 0, 1), tf);
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+#ifdef KOKKOS_ENABLE_CUDA
+/**
+ * CUDA ONLY: test that creating instances from streams leads to events
+ * on different device ID's
+ */
+TEST(defaultdevicetype, test_streams) {
+  test_wrapper([&]() {
+    // auto root = Kokkos::Tools::Experimental::device_id_root<
+    //    Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{};
+    expect_fence_events(expected, [=]() {
+      cudaStream_t s1, s2;
+      cudaStreamCreate(&s1);
+      cudaStreamCreate(&s2);
+      Kokkos::Cuda default_space;
+      Kokkos::Cuda space_s1(s1);
+      Kokkos::Cuda space_s2(s2);
+      default_space.fence();
+      space_s1.fence();
+      space_s2.fence();
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+    found_payloads.erase(
+        std::remove_if(found_payloads.begin(), found_payloads.end(),
+                       [&](const auto& entry) {
+                         return (
+                             entry.name.find("Fence on space initialization") !=
+                             std::string::npos);
+                       }),
+        found_payloads.end());
+    ASSERT_TRUE(found_payloads[0].dev_id != found_payloads[1].dev_id);
+    ASSERT_TRUE(found_payloads[2].dev_id != found_payloads[1].dev_id);
+    ASSERT_TRUE(found_payloads[2].dev_id != found_payloads[0].dev_id);
+  });
+}
+
+#endif
+
+}  // namespace Test
diff --git a/lib/kokkos/example/query_device/query_device.cpp b/lib/kokkos/example/query_device/query_device.cpp
index a563b06b28..9c4e9a8c83 100644
--- a/lib/kokkos/example/query_device/query_device.cpp
+++ b/lib/kokkos/example/query_device/query_device.cpp
@@ -47,7 +47,8 @@
 
 #include <Kokkos_Macros.hpp>
 
-#if defined(KOKKOS_ENABLE_MPI)
+//#define USE_MPI
+#if defined(USE_MPI)
 #include <mpi.h>
 #endif
 
@@ -61,7 +62,7 @@ int main(int argc, char** argv) {
 
   (void)argc;
   (void)argv;
-#if defined(KOKKOS_ENABLE_MPI)
+#if defined(USE_MPI)
 
   MPI_Init(&argc, &argv);
 
@@ -72,7 +73,7 @@ int main(int argc, char** argv) {
   msg << "MPI rank(" << mpi_rank << ") ";
 
 #endif
-
+  Kokkos::initialize(argc, argv);
   msg << "{" << std::endl;
 
   if (Kokkos::hwloc::available()) {
@@ -82,15 +83,13 @@ int main(int argc, char** argv) {
         << std::endl;
   }
 
-#if defined(KOKKOS_ENABLE_CUDA)
-  Kokkos::Cuda::print_configuration(msg);
-#endif
+  Kokkos::print_configuration(msg);
 
   msg << "}" << std::endl;
 
   std::cout << msg.str();
-
-#if defined(KOKKOS_ENABLE_MPI)
+  Kokkos::finalize();
+#if defined(USE_MPI)
 
   MPI_Finalize();
 
diff --git a/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp b/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
index 07b99087d4..5ac7f4fbb0 100644
--- a/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
+++ b/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
@@ -107,8 +107,8 @@ int main(int argc, char* argv[]) {
 
   // ViewType aliases for Rank<2>, Rank<3> for example usage
   using ScalarType  = double;
-  using ViewType_2D = typename Kokkos::View<ScalarType**>;
-  using ViewType_3D = typename Kokkos::View<ScalarType***>;
+  using ViewType_2D = Kokkos::View<ScalarType**>;
+  using ViewType_3D = Kokkos::View<ScalarType***>;
 
   /////////////////////////////////////////////////////////////////////////////
   // Explanation of MDRangePolicy usage, template parameters, constructor
@@ -160,8 +160,7 @@ int main(int argc, char* argv[]) {
   long incorrect_count_2d = 0;
   {
     // Rank<2> Case: Rank is provided, all other parameters are default
-    using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-        Kokkos::Experimental::Rank<2> >;
+    using MDPolicyType_2D = Kokkos::MDRangePolicy<Kokkos::Rank<2> >;
 
     // Construct 2D MDRangePolicy: lower and upper bounds provided, tile dims
     // defaulted
@@ -185,9 +184,8 @@ int main(int argc, char* argv[]) {
   long incorrect_count_3d = 0;
   {
     // Rank<3> Case: Rank, inner iterate pattern, outer iterate pattern provided
-    using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy<
-        Kokkos::Experimental::Rank<3, Kokkos::Experimental::Iterate::Left,
-                                   Kokkos::Experimental::Iterate::Left> >;
+    using MDPolicyType_3D = Kokkos::MDRangePolicy<
+        Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left> >;
 
     // Construct 3D MDRangePolicy: lower, upper bounds, tile dims provided
     MDPolicyType_3D mdpolicy_3d({{0, 0, 0}}, {{n, n, n}}, {{4, 4, 4}});
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
index 597d1e3056..75eca5403f 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 
 // These two View types are both 2-D arrays of double.  However, they
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
index 00bfeea36b..0544e572e7 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
index 20e5c5a284..52af4bd3b5 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
@@ -49,7 +49,7 @@
 // the mesh.
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 
 using mesh_type = Kokkos::View<double***, Kokkos::LayoutRight>;
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
index 3c0fcd085c..622b24b931 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
index a906ba1447..596b25aaad 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
index c582fa1704..c03515479d 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
@@ -46,7 +46,7 @@
 #include <cstdio>
 #include <typeinfo>
 #include <cmath>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 struct FillDevice {
   double value;
diff --git a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
index 9c5f2d62fc..602122b61f 100644
--- a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
+++ b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
@@ -45,7 +45,7 @@
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdlib>
 
 using DefaultHostType = Kokkos::HostSpace::execution_space;
@@ -74,7 +74,7 @@ using DefaultHostType = Kokkos::HostSpace::execution_space;
 template <class GeneratorPool>
 struct generate_random {
   // Output View for the random numbers
-  Kokkos::View<uint64_t*> vals;
+  Kokkos::View<uint64_t**> vals;
 
   // The GeneratorPool
   GeneratorPool rand_pool;
@@ -82,7 +82,7 @@ struct generate_random {
   int samples;
 
   // Initialize all members
-  generate_random(Kokkos::View<uint64_t*> vals_, GeneratorPool rand_pool_,
+  generate_random(Kokkos::View<uint64_t**> vals_, GeneratorPool rand_pool_,
                   int samples_)
       : vals(vals_), rand_pool(rand_pool_), samples(samples_) {}
 
@@ -94,8 +94,7 @@ struct generate_random {
     // Draw samples numbers from the pool as urand64 between 0 and
     // rand_pool.MAX_URAND64 Note there are function calls to get other type of
     // scalars, and also to specify Ranges or get a normal distributed float.
-    for (int k = 0; k < samples; k++)
-      vals(i * samples + k) = rand_gen.urand64();
+    for (int k = 0; k < samples; k++) vals(i, k) = rand_gen.urand64();
 
     // Give the state back, which will allow another thread to acquire it
     rand_pool.free_state(rand_gen);
@@ -103,11 +102,11 @@ struct generate_random {
 };
 
 int main(int argc, char* args[]) {
+  Kokkos::initialize(argc, args);
   if (argc != 3) {
     printf("Please pass two integers on the command line\n");
   } else {
     // Initialize Kokkos
-    Kokkos::initialize(argc, args);
     int size    = std::stoi(args[1]);
     int samples = std::stoi(args[2]);
 
@@ -117,7 +116,7 @@ int main(int argc, char* args[]) {
     // pool.
     Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
     Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857);
-    Kokkos::DualView<uint64_t*> vals("Vals", size * samples);
+    Kokkos::DualView<uint64_t**> vals("Vals", size, samples);
 
     // Run some performance comparisons
     Kokkos::Timer timer;
@@ -151,8 +150,7 @@ int main(int argc, char* args[]) {
            1.0e-9 * samples * size / time_1024);
 
     Kokkos::deep_copy(vals.h_view, vals.d_view);
-
-    Kokkos::finalize();
   }
+  Kokkos::finalize();
   return 0;
 }
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
index d360108925..cc20a497b2 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash
index c601e0ee16..5e33f59218 100755
--- a/lib/kokkos/generate_makefile.bash
+++ b/lib/kokkos/generate_makefile.bash
@@ -162,6 +162,7 @@ display_help_text() {
       echo "                 VEGA900         = AMD GPU MI25 GFX900"
       echo "                 VEGA906         = AMD GPU MI50/MI60 GFX906"
       echo "                 VEGA908         = AMD GPU MI100 GFX908"
+      echo "                 VEGA90A         = "
       echo "               [ARM]"
       echo "                 ARMV80          = ARMv8.0 Compatible CPU"
       echo "                 ARMV81          = ARMv8.1 Compatible CPU"
@@ -478,5 +479,5 @@ if [[ ${COMPILER} == *clang* ]]; then
    fi
 fi
 
-echo cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH}
-cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH}
+echo cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH}
+cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH}
diff --git a/lib/kokkos/master_history.txt b/lib/kokkos/master_history.txt
index be8a5e7da5..0c32469da3 100644
--- a/lib/kokkos/master_history.txt
+++ b/lib/kokkos/master_history.txt
@@ -25,3 +25,4 @@ tag:  3.3.00     date: 12:16:2020    master: 734f577a    release: 1535ba5c
 tag:  3.3.01     date: 01:06:2021    master: 6d65b5a3    release: 4d23839c
 tag:  3.4.00     date: 04:26:2021    master: 1fb0c284    release: 5d7738d6
 tag:  3.4.01     date: 05:20:2021    master: 4b97a22f    release: 410b15c8
+tag:  3.5.00     date: 11:19:2021    master: c28a8b03    release: 21b879e4
diff --git a/lib/plumed/Install.py b/lib/plumed/Install.py
index 548e51a5bc..5cd3f776cb 100644
--- a/lib/plumed/Install.py
+++ b/lib/plumed/Install.py
@@ -51,9 +51,11 @@ checksums = { \
         '2.6.0' : '204d2edae58d9b10ba3ad460cad64191', \
         '2.6.1' : '89a9a450fc6025299fe16af235957163', \
         '2.6.3' : 'a9f8028fd74528c2024781ea1fdefeee', \
+        '2.6.5' : 'b67356f027e5c2747823b0422c3b0ec2', \
         '2.7.0' : '95f29dd0c067577f11972ff90dfc7d12', \
         '2.7.1' : '4eac6a462ec84dfe0cec96c82421b8e8', \
         '2.7.2' : 'cfa0b4dd90a81c25d3302e8d97bfeaea', \
+        '2.7.3' : 'f00cc82edfefe6bb3df934911dbe32fb', \
         }
 
 # parse and process arguments
diff --git a/potentials/CC.Lebedeva b/potentials/CC.Lebedeva
index bc2db03f6b..bc70500687 100644
--- a/potentials/CC.Lebedeva
+++ b/potentials/CC.Lebedeva
@@ -1,12 +1,13 @@
-# DATE: 2018-11-28  UNITS: metal  CONTRIBUTOR: Zbigniew Koziol softquake@gmail.com CITATION: Z. Koziol et al.: https://arxiv.org/abs/1803.05162
+# DATE: 2021-11-04  UNITS: metal  CONTRIBUTOR: Zbigniew Koziol softquake@gmail.com CITATION: Z. Koziol et al.: https://arxiv.org/abs/1803.05162
 #
-# Lebedeva Potential. https://doi.org/10.1016/j.physe.2011.07.018
+# Lebedeva potential: https://doi.org/10.1039/C0CP02614J and https://doi.org/10.1016/j.physe.2011.07.018
 
 # Parameters must be in this order as here, otherwise their values may be changed.
+# Energies here are given in meV.
 # The last one, S, is convenient for scaling the potential amplitude. S is a multiplication factor for A, B, C
 #        A       B           C       z0     alpha   D1      D2       lambda1   lambda2     S
-# These are values according to Levedeva et al
-#C  C   10.510  11.6523.34  35.883  3.34    4.16   -0.86232 0.10049  0.48703   0.46445     1.0
+# These are values according to Lebedeva et al.: https://doi.org/10.1016/j.cplett.2012.03.082
+C  C   10.510  11.652  29.5  3.34    4.16   -0.86232 0.10049  0.48703   0.46445     1.0
 #
 # These are values by Z. Koziol et al.: https://arxiv.org/abs/1803.05162
-C  C    14.558  21.204      1.8     3.198   4.16   -0.862   0.10049  0.6       0.4         1.0
+C1  C1    14.558  21.204      1.8     3.198   4.16   -0.862   0.10049  0.6       0.4         1.0
diff --git a/potentials/tmd.sw.mod b/potentials/tmd.sw.mod
new file mode 100644
index 0000000000..b88390ee26
--- /dev/null
+++ b/potentials/tmd.sw.mod
@@ -0,0 +1,143 @@
+# DATE: 2018-03-26 UNITS: metal CONTRIBUTOR: Jin-Wu Jiang jwjiang5918@hotmail.com
+# CITATION: J.-W. Jiang, Acta Mech. Solida. Sin 32, 17 (2019).
+# The Stillinger-Weber parameters, for transition-metal dichalcogenide (TMD) lateral heterostructures.
+# M = Mo, W; X = S, Se, Te
+
+# these entries are in LAMMPS "metal" units:
+#   epsilon = eV; sigma = Angstroms
+#   other quantities are unitless
+
+# format of a single entry (one or more lines):
+#   element 1, element 2, element 3, 
+#   epsilon, sigma, a, lambda, gamma, costheta0, A, B, p, q, tol
+
+# M-X-X terms
+Mo S  S     1.000   1.252   2.523  67.883   1.000   0.143   6.918   7.223  4  0 0.0
+Mo Se Se    1.000   0.913   3.672  32.526   1.000   0.143   5.737  27.084  4  0 0.0
+Mo Te Te    1.000   0.880   4.097  23.705   1.000   0.143   5.086  40.810  4  0 0.0
+W  S  S     1.000   0.889   3.558  37.687   1.000   0.143   5.664  24.525  4  0 0.0
+W  Se Se    1.000   0.706   4.689  25.607   1.000   0.143   5.476  65.662  4  0 0.0
+W  Te Te    1.000   0.778   4.632  21.313   1.000   0.143   4.326  62.148  4  0 0.0
+# X-M-M terms
+S  Mo Mo    1.000   1.252   2.523  62.449   1.000   0.143   6.918   7.223  4  0 0.0
+S  W  W     1.000   0.889   3.558  33.553   1.000   0.143   5.664  24.525  4  0 0.0
+Se Mo Mo    1.000   0.913   3.672  27.079   1.000   0.143   5.737  27.084  4  0 0.0
+Se W  W     1.000   0.706   4.689  23.218   1.000   0.143   5.476  65.662  4  0 0.0
+Te Mo Mo    1.000   0.880   4.097  20.029   1.000   0.143   5.086  40.810  4  0 0.0
+Te W  W     1.000   0.778   4.632  17.370   1.000   0.143   4.326  62.148  4  0 0.0
+# M-X1-X2 terms
+Mo S  Se    1.000   0.000   0.000  46.989   1.000   0.143   0.000   0.000  4  0 0.0
+Mo S  Te    1.000   0.000   0.000  40.114   1.000   0.143   0.000   0.000  4  0 0.0
+Mo Se S     1.000   0.000   0.000  46.989   1.000   0.143   0.000   0.000  4  0 0.0
+Mo Se Te    1.000   0.000   0.000  27.767   1.000   0.143   0.000   0.000  4  0 0.0
+Mo Te S     1.000   0.000   0.000  40.114   1.000   0.143   0.000   0.000  4  0 0.0
+Mo Te Se    1.000   0.000   0.000  27.767   1.000   0.143   0.000   0.000  4  0 0.0
+W  S  Se    1.000   0.000   0.000  31.065   1.000   0.143   0.000   0.000  4  0 0.0
+W  S  Te    1.000   0.000   0.000  28.341   1.000   0.143   0.000   0.000  4  0 0.0
+W  Se S     1.000   0.000   0.000  31.065   1.000   0.143   0.000   0.000  4  0 0.0
+W  Se Te    1.000   0.000   0.000  23.362   1.000   0.143   0.000   0.000  4  0 0.0
+W  Te S     1.000   0.000   0.000  28.341   1.000   0.143   0.000   0.000  4  0 0.0
+W  Te Se    1.000   0.000   0.000  23.362   1.000   0.143   0.000   0.000  4  0 0.0
+# X-M1-M2 terms
+S  Mo W     1.000   0.000   0.000  45.775   1.000   0.143   0.000   0.000  4  0 0.0
+S  W  Mo    1.000   0.000   0.000  45.775   1.000   0.143   0.000   0.000  4  0 0.0
+Se Mo W     1.000   0.000   0.000  25.074   1.000   0.143   0.000   0.000  4  0 0.0
+Se W  Mo    1.000   0.000   0.000  25.074   1.000   0.143   0.000   0.000  4  0 0.0
+Te Mo W     1.000   0.000   0.000  18.652   1.000   0.143   0.000   0.000  4  0 0.0
+Te W  Mo    1.000   0.000   0.000  18.652   1.000   0.143   0.000   0.000  4  0 0.0
+# zero terms
+Mo Mo Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo Mo W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo Mo S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo Mo Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo Mo Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo W  Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo W  W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo W  S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo W  Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo W  Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo S  Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo S  W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo Se Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo Se W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo Te Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Mo Te W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  Mo Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  Mo W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  Mo S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  Mo Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  Mo Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  W  Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  W  W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  W  S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  W  Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  W  Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  S  Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  S  W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  Se Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  Se W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  Te Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+W  Te W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Mo S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Mo Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Mo Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  W  S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  W  Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  W  Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  S  Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  S  W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  S  S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  S  Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  S  Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Se Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Se W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Se S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Se Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Se Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Te Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Te W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Te S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Te Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+S  Te Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Mo S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Mo Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Mo Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se W  S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se W  Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se W  Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se S  Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se S  W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se S  S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se S  Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se S  Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Se Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Se W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Se S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Se Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Se Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Te Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Te W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Te S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Te Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Se Te Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Mo S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Mo Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Mo Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te W  S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te W  Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te W  Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te S  Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te S  W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te S  S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te S  Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te S  Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Se Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Se W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Se S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Se Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Se Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Te Mo    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Te W     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Te S     0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Te Se    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
+Te Te Te    0.000   1.000   1.000   1.000   1.000   0.143   1.000   1.000  4  0 0.0
diff --git a/src/.gitignore b/src/.gitignore
index 0c6c893234..d604d1857c 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -408,6 +408,8 @@
 /bond_fene.h
 /bond_fene_expand.cpp
 /bond_fene_expand.h
+/bond_fene_nm.cpp
+/bond_fene_nm.h
 /bond_gaussian.cpp
 /bond_gaussian.h
 /bond_gromos.cpp
@@ -442,6 +444,8 @@
 /commgrid.h
 /compute_ackland_atom.cpp
 /compute_ackland_atom.h
+/compute_ave_sphere_atom.cpp
+/compute_ave_sphere_atom.h
 /compute_basal_atom.cpp
 /compute_basal_atom.h
 /compute_body_local.cpp
@@ -1180,6 +1184,8 @@
 /pair_nm_cut_coul_cut.h
 /pair_nm_cut_coul_long.cpp
 /pair_nm_cut_coul_long.h
+/pait_nm_cut_split.cpp
+/pait_nm_cut_split.h
 /pair_oxdna_*.cpp
 /pair_oxdna_*.h
 /pair_oxdna2_*.cpp
@@ -1221,6 +1227,8 @@
 /pair_sph_taitwater_morris.h
 /pair_sw.cpp
 /pair_sw.h
+/pair_sw_mod.cpp
+/pair_sw_mod.h
 /pair_tersoff.cpp
 /pair_tersoff.h
 /pair_tersoff_mod.cpp
diff --git a/src/CLASS2/angle_class2.cpp b/src/CLASS2/angle_class2.cpp
index 82e31440f5..c37c5a8f65 100644
--- a/src/CLASS2/angle_class2.cpp
+++ b/src/CLASS2/angle_class2.cpp
@@ -169,6 +169,8 @@ void AngleClass2::compute(int eflag, int vflag)
 
     // force & energy for bond-angle term
 
+    dr1 = r1 - ba_r1[type];
+    dr2 = r2 - ba_r2[type];
     aa1 = s * dr1 * ba_k1[type];
     aa2 = s * dr2 * ba_k2[type];
 
@@ -459,6 +461,9 @@ double AngleClass2::single(int type, int i1, int i2, int i3)
   double dr2 = r2 - bb_r2[type];
   energy += bb_k[type]*dr1*dr2;
 
+  dr1 = r1 - ba_r1[type];
+  dr2 = r2 - ba_r2[type];
   energy += ba_k1[type]*dr1*dtheta + ba_k2[type]*dr2*dtheta;
+
   return energy;
 }
diff --git a/src/COMPRESS/dump_xyz_gz.cpp b/src/COMPRESS/dump_xyz_gz.cpp
index 9d38c4673c..06561c6d9a 100644
--- a/src/COMPRESS/dump_xyz_gz.cpp
+++ b/src/COMPRESS/dump_xyz_gz.cpp
@@ -88,11 +88,16 @@ void DumpXYZGZ::openfile()
   if (multifile) delete[] filecurrent;
 }
 
+/* ---------------------------------------------------------------------- */
+
 void DumpXYZGZ::write_header(bigint ndump)
 {
   if (me == 0) {
-    std::string header = fmt::format("{}\n", ndump);
-    header += fmt::format("Atoms. Timestep: {}\n", update->ntimestep);
+    auto header = fmt::format("{}\n", ndump);
+    if (time_flag) {
+      double tcurrent = update->atime + (update->ntimestep-update->atimestep) + update->dt;
+      header += fmt::format(" Atoms. Timestep: {} Time: {:.6f}\n", update->ntimestep, tcurrent);
+    } else header += fmt::format(" Atoms. Timestep: {}\n", update->ntimestep);
     writer.write(header.c_str(), header.length());
   }
 }
diff --git a/src/COMPRESS/dump_xyz_zstd.cpp b/src/COMPRESS/dump_xyz_zstd.cpp
index bcbdc08a24..c4748b4ac3 100644
--- a/src/COMPRESS/dump_xyz_zstd.cpp
+++ b/src/COMPRESS/dump_xyz_zstd.cpp
@@ -96,11 +96,16 @@ void DumpXYZZstd::openfile()
   if (multifile) delete[] filecurrent;
 }
 
+/* ---------------------------------------------------------------------- */
+
 void DumpXYZZstd::write_header(bigint ndump)
 {
   if (me == 0) {
-    std::string header = fmt::format("{}\n", ndump);
-    header += fmt::format("Atoms. Timestep: {}\n", update->ntimestep);
+    auto header = fmt::format("{}\n", ndump);
+    if (time_flag) {
+      double tcurrent = update->atime + (update->ntimestep-update->atimestep) + update->dt;
+      header += fmt::format(" Atoms. Timestep: {} Time: {:.6f}\n", update->ntimestep, tcurrent);
+    } else header += fmt::format(" Atoms. Timestep: {}\n", update->ntimestep);
     writer.write(header.c_str(), header.length());
   }
 }
diff --git a/src/DRUDE/compute_temp_drude.cpp b/src/DRUDE/compute_temp_drude.cpp
index 940a7e5328..96344751a7 100644
--- a/src/DRUDE/compute_temp_drude.cpp
+++ b/src/DRUDE/compute_temp_drude.cpp
@@ -89,9 +89,7 @@ void ComputeTempDrude::dof_compute()
   int dim = domain->dimension;
   int *drudetype = fix_drude->drudetype;
 
-  fix_dof = 0;
-  for (int i = 0; i < modify->nfix; i++)
-    fix_dof += modify->fix[i]->dof(igroup);
+  adjust_dof_fix();
 
   bigint dof_core_loc = 0, dof_drude_loc = 0;
   for (int i = 0; i < nlocal; i++) {
diff --git a/src/DRUDE/compute_temp_drude.h b/src/DRUDE/compute_temp_drude.h
index cef5d8664e..e1cd54edc8 100644
--- a/src/DRUDE/compute_temp_drude.h
+++ b/src/DRUDE/compute_temp_drude.h
@@ -35,7 +35,6 @@ class ComputeTempDrude : public Compute {
   int modify_param(int, char **);
 
  private:
-  int fix_dof;
   class FixDrude *fix_drude;
   char *id_temp;
   class Compute *temperature;
diff --git a/src/Depend.sh b/src/Depend.sh
index af88f24bb4..a8e17e0546 100755
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -77,6 +77,10 @@ if (test $1 = "DPD-BASIC") then
   depend INTEL
 fi
 
+if (test $1 = "EXTRA-COMPUTE") then
+  depend KOKKOS
+fi
+
 if (test $1 = "EXTRA-MOLECULE") then
   depend GPU
   depend OPENMP
diff --git a/src/EXTRA-COMPUTE/compute_ave_sphere_atom.cpp b/src/EXTRA-COMPUTE/compute_ave_sphere_atom.cpp
new file mode 100644
index 0000000000..14a4c364a1
--- /dev/null
+++ b/src/EXTRA-COMPUTE/compute_ave_sphere_atom.cpp
@@ -0,0 +1,278 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_ave_sphere_atom.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "group.h"
+#include "memory.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor.h"
+#include "pair.h"
+#include "update.h"
+#include "math_const.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+
+/* ---------------------------------------------------------------------- */
+
+ComputeAveSphereAtom::ComputeAveSphereAtom(LAMMPS *lmp, int narg, char **arg) :
+  Compute(lmp, narg, arg),
+  result(nullptr)
+{
+  if (narg < 3 || narg > 5) error->all(FLERR,"Illegal compute ave/sphere/atom command");
+
+  // process optional args
+
+  cutoff = 0.0;
+
+  int iarg = 3;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"cutoff") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal compute ave/sphere/atom command");
+      cutoff = utils::numeric(FLERR,arg[iarg+1],false,lmp);
+      if (cutoff <= 0.0) error->all(FLERR,"Illegal compute ave/sphere/atom command");
+      iarg += 2;
+    } else error->all(FLERR,"Illegal compute ave/sphere/atom command");
+  }
+
+  peratom_flag = 1;
+  size_peratom_cols = 2;
+  comm_forward = 3;
+
+  nmax = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeAveSphereAtom::~ComputeAveSphereAtom()
+{
+  if (copymode) return;
+
+  memory->destroy(result);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeAveSphereAtom::init()
+{
+  if (!force->pair && cutoff == 0.0)
+    error->all(FLERR,"Compute ave/sphere/atom requires a cutoff be specified "
+               "or a pair style be defined");
+
+  double skin = neighbor->skin;
+  if (cutoff != 0.0) {
+    double cutghost;            // as computed by Neighbor and Comm
+    if (force->pair)
+      cutghost = MAX(force->pair->cutforce+skin,comm->cutghostuser);
+    else
+      cutghost = comm->cutghostuser;
+
+    if (cutoff > cutghost)
+      error->all(FLERR,"Compute ave/sphere/atom cutoff exceeds ghost atom range - "
+                 "use comm_modify cutoff command");
+  }
+
+  int cutflag = 1;
+  if (force->pair) {
+    if (cutoff == 0.0) {
+      cutoff = force->pair->cutforce;
+    }
+    if (cutoff <= force->pair->cutforce+skin) cutflag = 0;
+  }
+
+  cutsq = cutoff*cutoff;
+  sphere_vol = 4.0/3.0*MY_PI*cutsq*cutoff;
+
+  // need an occasional full neighbor list
+
+  int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->pair = 0;
+  neighbor->requests[irequest]->compute = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->occasional = 1;
+  if (cutflag) {
+    neighbor->requests[irequest]->cut = 1;
+    neighbor->requests[irequest]->cutoff = cutoff;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeAveSphereAtom::init_list(int /*id*/, NeighList *ptr)
+{
+  list = ptr;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeAveSphereAtom::compute_peratom()
+{
+  int i,j,ii,jj,inum,jnum;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  int count;
+  double vsum[3],vavg[3],vnet[3];
+
+  invoked_peratom = update->ntimestep;
+
+  // grow result array if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(result);
+    nmax = atom->nmax;
+    memory->create(result,nmax,2,"ave/sphere/atom:result");
+    array_atom = result;
+  }
+
+  // need velocities of ghost atoms
+
+  comm->forward_comm_compute(this);
+
+  // invoke full neighbor list (will copy or build if necessary)
+
+  neighbor->build_one(list);
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // compute properties for each atom in group
+  // use full neighbor list to count atoms less than cutoff
+
+  double **x = atom->x;
+  double **v = atom->v;
+  int *mask = atom->mask;
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+
+    if (mask[i] & groupbit) {
+      xtmp = x[i][0];
+      ytmp = x[i][1];
+      ztmp = x[i][2];
+      jlist = firstneigh[i];
+      jnum = numneigh[i];
+
+      // i atom contribution
+
+      count = 1;
+      vsum[0] = v[i][0];
+      vsum[1] = v[i][1];
+      vsum[2] = v[i][2];
+
+      for (jj = 0; jj < jnum; jj++) {
+        j = jlist[jj];
+        j &= NEIGHMASK;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+        if (rsq < cutsq) {
+          count++;
+          vsum[0] += v[j][0];
+          vsum[1] += v[j][1];
+          vsum[2] += v[j][2];
+        }
+      }
+
+      vavg[0] = vsum[0]/count;
+      vavg[1] = vsum[1]/count;
+      vavg[2] = vsum[2]/count;
+
+      // i atom contribution
+
+      count = 1;
+      vnet[0] = v[i][0] - vavg[0];
+      vnet[1] = v[i][1] - vavg[1];
+      vnet[2] = v[i][2] - vavg[2];
+      double ke_sum = vnet[0]*vnet[0] + vnet[1]*vnet[1] + vnet[2]*vnet[2];
+
+      for (jj = 0; jj < jnum; jj++) {
+        j = jlist[jj];
+        j &= NEIGHMASK;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+        if (rsq < cutsq) {
+          count++;
+          vnet[0] = v[j][0] - vavg[0];
+          vnet[1] = v[j][1] - vavg[1];
+          vnet[2] = v[j][2] - vavg[2];
+          ke_sum += vnet[0]*vnet[0] + vnet[1]*vnet[1] + vnet[2]*vnet[2];
+        }
+      }
+      double density = count/sphere_vol;
+      double temp = ke_sum/3.0/count;
+      result[i][0] = density;
+      result[i][1] = temp;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int ComputeAveSphereAtom::pack_forward_comm(int n, int *list, double *buf,
+                                        int /*pbc_flag*/, int * /*pbc*/)
+{
+  double **v = atom->v;
+
+  int i,m=0;
+  for (i = 0; i < n; ++i) {
+    buf[m++] = v[list[i]][0];
+    buf[m++] = v[list[i]][1];
+    buf[m++] = v[list[i]][2];
+  }
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeAveSphereAtom::unpack_forward_comm(int n, int first, double *buf)
+{
+  double **v = atom->v;
+
+  int i,last,m=0;
+  last = first + n;
+  for (i = first; i < last; ++i) {
+    v[i][0] = buf[m++];
+    v[i][1] = buf[m++];
+    v[i][2] = buf[m++];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of local atom-based array
+------------------------------------------------------------------------- */
+
+double ComputeAveSphereAtom::memory_usage()
+{
+  double bytes = (double)2*nmax * sizeof(double);
+  return bytes;
+}
diff --git a/src/EXTRA-COMPUTE/compute_ave_sphere_atom.h b/src/EXTRA-COMPUTE/compute_ave_sphere_atom.h
new file mode 100644
index 0000000000..9b5e38750b
--- /dev/null
+++ b/src/EXTRA-COMPUTE/compute_ave_sphere_atom.h
@@ -0,0 +1,67 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(ave/sphere/atom,ComputeAveSphereAtom)
+
+#else
+
+#ifndef LMP_COMPUTE_AVE_SPHERE_ATOM_H
+#define LMP_COMPUTE_AVE_SPHERE_ATOM_H
+
+#include "compute.h"
+
+namespace LAMMPS_NS {
+
+class ComputeAveSphereAtom : public Compute {
+ public:
+  ComputeAveSphereAtom(class LAMMPS *, int, char **);
+  virtual ~ComputeAveSphereAtom();
+  virtual void init();
+  void init_list(int, class NeighList *);
+  virtual void compute_peratom();
+  int pack_forward_comm(int, int *, double *, int, int *);
+  void unpack_forward_comm(int, int, double *);
+  double memory_usage();
+
+ protected:
+  int nmax;
+  double cutoff,cutsq,sphere_vol;
+  class NeighList *list;
+
+  double **result;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Compute ave/sphere/atom requires a cutoff be specified or a pair style be defined
+
+Self-explanatory.
+
+E: Compute ave/sphere/atom cutoff exceeds ghost atom range - use comm_modify cutoff command
+
+Self-explanatory.
+
+*/
diff --git a/src/EXTRA-COMPUTE/compute_temp_rotate.h b/src/EXTRA-COMPUTE/compute_temp_rotate.h
index afa14000b8..af218e756c 100644
--- a/src/EXTRA-COMPUTE/compute_temp_rotate.h
+++ b/src/EXTRA-COMPUTE/compute_temp_rotate.h
@@ -43,7 +43,6 @@ class ComputeTempRotate : public Compute {
   double memory_usage();
 
  private:
-  int fix_dof;
   double tfactor, masstotal;
   double **vbiasall;    // stored velocity bias for all atoms
   int maxbias;          // size of vbiasall array
diff --git a/src/EXTRA-DUMP/dump_dcd.cpp b/src/EXTRA-DUMP/dump_dcd.cpp
index ec3973448a..3aca5e8a98 100644
--- a/src/EXTRA-DUMP/dump_dcd.cpp
+++ b/src/EXTRA-DUMP/dump_dcd.cpp
@@ -100,13 +100,17 @@ void DumpDCD::init_style()
   if (sort_flag == 0 || sortcol != 0)
     error->all(FLERR,"Dump dcd requires sorting by atom ID");
 
-  // check that dump frequency has not changed and is not a variable
-  // but only when not being called from the "write_dump" command.
+  // check that dump modify settings are compatible with dcd
+  // but only when not being called from the "write_dump" command
 
   if (strcmp(id,"WRITE_DUMP") != 0) {
     int idump;
     for (idump = 0; idump < output->ndump; idump++)
       if (strcmp(id,output->dump[idump]->id) == 0) break;
+
+    if (output->mode_dump[idump] == 1)
+      error->all(FLERR,"Cannot use every/time setting for dump dcd");
+
     if (output->every_dump[idump] == 0)
       error->all(FLERR,"Cannot use variable every setting for dump dcd");
 
diff --git a/src/EXTRA-DUMP/dump_xtc.cpp b/src/EXTRA-DUMP/dump_xtc.cpp
index 94846671cd..41b78ab64c 100644
--- a/src/EXTRA-DUMP/dump_xtc.cpp
+++ b/src/EXTRA-DUMP/dump_xtc.cpp
@@ -121,17 +121,24 @@ void DumpXTC::init_style()
 
   if (flush_flag) error->all(FLERR,"Cannot set dump_modify flush for dump xtc");
 
-  // check that dump frequency has not changed and is not a variable
+  // check that dump modify settings are compatible with xtc
+  // but only when not being called from the "write_dump" command
 
-  int idump;
-  for (idump = 0; idump < output->ndump; idump++)
-    if (strcmp(id,output->dump[idump]->id) == 0) break;
-  if (output->every_dump[idump] == 0)
-    error->all(FLERR,"Cannot use variable every setting for dump xtc");
+  if (strcmp(id,"WRITE_DUMP") != 0) {
+    int idump;
+    for (idump = 0; idump < output->ndump; idump++)
+      if (strcmp(id,output->dump[idump]->id) == 0) break;
 
-  if (nevery_save == 0) nevery_save = output->every_dump[idump];
-  else if (nevery_save != output->every_dump[idump])
-    error->all(FLERR,"Cannot change dump_modify every for dump xtc");
+    if (output->mode_dump[idump] == 1)
+      error->all(FLERR,"Cannot use every/time setting for dump xtc");
+
+    if (output->every_dump[idump] == 0)
+      error->all(FLERR,"Cannot use every variable setting for dump xtc");
+
+    if (nevery_save == 0) nevery_save = output->every_dump[idump];
+    else if (nevery_save != output->every_dump[idump])
+      error->all(FLERR,"Cannot change dump_modify every for dump xtc");
+  }
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/EXTRA-FIX/fix_ave_correlate_long.cpp b/src/EXTRA-FIX/fix_ave_correlate_long.cpp
index 1746c7f6f5..5dc3f01aad 100644
--- a/src/EXTRA-FIX/fix_ave_correlate_long.cpp
+++ b/src/EXTRA-FIX/fix_ave_correlate_long.cpp
@@ -70,6 +70,7 @@ FixAveCorrelateLong::FixAveCorrelateLong(LAMMPS * lmp, int narg, char **arg):
 
   restart_global = 1;
   global_freq = nfreq;
+  time_depend = 1;
 
   // parse values until one isn't recognized
 
@@ -400,11 +401,8 @@ void FixAveCorrelateLong::end_of_step()
   double scalar;
 
   // skip if not step which requires doing something
-  // error check if timestep was reset in an invalid manner
 
   bigint ntimestep = update->ntimestep;
-  if (ntimestep < nvalid_last || ntimestep > nvalid)
-    error->all(FLERR,"Invalid timestep reset for fix ave/correlate/long");
   if (ntimestep != nvalid) return;
   nvalid_last = nvalid;
 
diff --git a/src/EXTRA-MOLECULE/bond_fene_nm.cpp b/src/EXTRA-MOLECULE/bond_fene_nm.cpp
new file mode 100644
index 0000000000..147a63512a
--- /dev/null
+++ b/src/EXTRA-MOLECULE/bond_fene_nm.cpp
@@ -0,0 +1,280 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "bond_fene_nm.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "math_const.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+BondFENENM::BondFENENM(LAMMPS *lmp) : BondFENE(lmp) {}
+
+/* ---------------------------------------------------------------------- */
+
+BondFENENM::~BondFENENM()
+{
+  if (allocated && !copymode) {
+    memory->destroy(nn);
+    memory->destroy(mm);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondFENENM::compute(int eflag, int vflag)
+{
+  int i1, i2, n, type;
+  double delx, dely, delz, ebond, fbond;
+  double rsq, r0sq, rlogarg, sr6;
+  double r;
+
+  ebond = sr6 = 0.0;
+  ev_init(eflag, vflag);
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int **bondlist = neighbor->bondlist;
+  int nbondlist = neighbor->nbondlist;
+  int nlocal = atom->nlocal;
+  int newton_bond = force->newton_bond;
+
+  for (n = 0; n < nbondlist; n++) {
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    // force from log term
+
+    rsq = delx * delx + dely * dely + delz * delz;
+    r0sq = r0[type] * r0[type];
+    rlogarg = 1.0 - rsq / r0sq;
+
+    // if r -> r0, then rlogarg < 0.0 which is an error
+    // issue a warning and reset rlogarg = epsilon
+    // if r > 2*r0 something serious is wrong, abort
+    // change cutuff from .1 to .02 so only bond lengths > 1.485 give the warning
+    // and crash the run if rlogarg < -.21 rather than < -3
+    // Don't print out warnings, only errors
+    if (rlogarg < .02) {
+      error->warning(FLERR, "fene/nm/split bond too long: {} {} {} {}", update->ntimestep,
+                     atom->tag[i1], atom->tag[i2], sqrt(rsq));
+      if (rlogarg <= -.21) error->one(FLERR, "Bad FENE bond");
+      rlogarg = 0.02;
+    }
+
+    fbond = -k[type] / rlogarg;
+    // force from n-m term
+    if (rsq < sigma[type]*sigma[type]) {
+      r = sqrt(rsq);
+      fbond += epsilon[type] * (nn[type] * mm[type] / (nn[type] - mm[type])) *
+          (pow(sigma[type] / r, nn[type]) - pow(sigma[type] / r, mm[type])) / rsq;
+    }
+
+    // energy
+
+    if (eflag) {
+      ebond = -0.5 * k[type] * r0sq * log(rlogarg);
+      if (rsq < sigma[type]*sigma[type])
+        ebond += (epsilon[type] / (nn[type] - mm[type])) *
+            (mm[type] * pow(sigma[type] / r, nn[type]) - nn[type] * pow(sigma[type] / r, mm[type]));
+    }
+    // apply force to each of 2 atoms
+    if (newton_bond || i1 < nlocal) {
+      f[i1][0] += delx * fbond;
+      f[i1][1] += dely * fbond;
+      f[i1][2] += delz * fbond;
+    }
+
+    if (newton_bond || i2 < nlocal) {
+      f[i2][0] -= delx * fbond;
+      f[i2][1] -= dely * fbond;
+      f[i2][2] -= delz * fbond;
+    }
+
+    if (evflag) ev_tally(i1, i2, nlocal, newton_bond, ebond, fbond, delx, dely, delz);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondFENENM::allocate()
+{
+  BondFENE::allocate();
+  int n = atom->nbondtypes + 1;
+  memory->create(nn, n, "bond:nn");
+  memory->create(mm, n, "bond:mm");
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one type
+------------------------------------------------------------------------- */
+
+void BondFENENM::coeff(int narg, char **arg)
+{
+  if (narg != 7) error->all(FLERR, "Incorrect args for bond coefficients");
+  if (!allocated) allocate();
+
+  int ilo, ihi;
+  utils::bounds(FLERR, arg[0], 1, atom->nbondtypes, ilo, ihi, error);
+
+  double k_one = utils::numeric(FLERR, arg[1], false, lmp);
+  double r0_one = utils::numeric(FLERR, arg[2], false, lmp);
+  double epsilon_one = utils::numeric(FLERR, arg[3], false, lmp);
+  double sigma_one = utils::numeric(FLERR, arg[4], false, lmp);
+  double nn_one = utils::numeric(FLERR, arg[5], false, lmp);
+  double mm_one = utils::numeric(FLERR, arg[6], false, lmp);
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    k[i] = k_one;
+    r0[i] = r0_one;
+    epsilon[i] = epsilon_one;
+    sigma[i] = sigma_one;
+    nn[i] = nn_one;
+    mm[i] = mm_one;
+    setflag[i] = 1;
+    count++;
+  }
+
+  if (count == 0) error->all(FLERR, "Incorrect args for bond coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   check if special_bond settings are valid
+------------------------------------------------------------------------- */
+
+void BondFENENM::init_style()
+{
+  // special bonds should be 0 1 1
+
+  if (force->special_lj[1] != 0.0 || force->special_lj[2] != 1.0 || force->special_lj[3] != 1.0) {
+    if (comm->me == 0) error->warning(FLERR, "Use special bonds = 0,1,1 with bond style fene");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double BondFENENM::equilibrium_distance(int i)
+{
+  return 0.97 * sigma[i];
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void BondFENENM::write_restart(FILE *fp)
+{
+  fwrite(&k[1], sizeof(double), atom->nbondtypes, fp);
+  fwrite(&r0[1], sizeof(double), atom->nbondtypes, fp);
+  fwrite(&epsilon[1], sizeof(double), atom->nbondtypes, fp);
+  fwrite(&sigma[1], sizeof(double), atom->nbondtypes, fp);
+  fwrite(&nn[1], sizeof(double), atom->nbondtypes, fp);
+  fwrite(&mm[1], sizeof(double), atom->nbondtypes, fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void BondFENENM::read_restart(FILE *fp)
+{
+  allocate();
+
+  if (comm->me == 0) {
+    utils::sfread(FLERR, &k[1], sizeof(double), atom->nbondtypes, fp, nullptr, error);
+    utils::sfread(FLERR, &r0[1], sizeof(double), atom->nbondtypes, fp, nullptr, error);
+    utils::sfread(FLERR, &epsilon[1], sizeof(double), atom->nbondtypes, fp, nullptr, error);
+    utils::sfread(FLERR, &sigma[1], sizeof(double), atom->nbondtypes, fp, nullptr, error);
+    utils::sfread(FLERR, &nn[1], sizeof(double), atom->nbondtypes, fp, nullptr, error);
+    utils::sfread(FLERR, &mm[1], sizeof(double), atom->nbondtypes, fp, nullptr, error);
+  }
+  MPI_Bcast(&k[1], atom->nbondtypes, MPI_DOUBLE, 0, world);
+  MPI_Bcast(&r0[1], atom->nbondtypes, MPI_DOUBLE, 0, world);
+  MPI_Bcast(&epsilon[1], atom->nbondtypes, MPI_DOUBLE, 0, world);
+  MPI_Bcast(&sigma[1], atom->nbondtypes, MPI_DOUBLE, 0, world);
+  MPI_Bcast(&nn[1], atom->nbondtypes, MPI_DOUBLE, 0, world);
+  MPI_Bcast(&mm[1], atom->nbondtypes, MPI_DOUBLE, 0, world);
+
+  for (int i = 1; i <= atom->nbondtypes; i++) setflag[i] = 1;
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to data file
+------------------------------------------------------------------------- */
+
+void BondFENENM::write_data(FILE *fp)
+{
+  for (int i = 1; i <= atom->nbondtypes; i++)
+    fprintf(fp, "%d %g %g %g %g %g %g\n", i, k[i], r0[i], epsilon[i], sigma[i], nn[i], mm[i]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+double BondFENENM::single(int type, double rsq, int /*i*/, int /*j*/, double &fforce)
+{
+  double r0sq = r0[type] * r0[type];
+  double rlogarg = 1.0 - rsq / r0sq;
+  double r;
+  // if r -> r0, then rlogarg < 0.0 which is an error
+  // issue a warning and reset rlogarg = epsilon
+  // if r > 2*r0 something serious is wrong, abort
+
+  // change cutuff from .1 to .02 so only bond lengths > 1.485 give the warning
+  // and crash the run if rlogarg < -.21 rather than < -3
+  // Don't print out warnings, only errors
+  if (rlogarg < 0.02) {
+    error->warning(FLERR, "FENE bond too long: {} {:.8}", update->ntimestep, sqrt(rsq));
+    if (rlogarg <= -.21) error->one(FLERR, "Bad FENE bond");
+    rlogarg = 0.02;
+  }
+
+  double eng = -0.5 * k[type] * r0sq * log(rlogarg);
+  fforce = -k[type] / rlogarg;
+
+  if (rsq < sigma[type]*sigma[type]) {
+    r = sqrt(rsq);
+    fforce += epsilon[type] * (nn[type] * mm[type] / (nn[type] - mm[type])) *
+        (pow(sigma[type] / r, nn[type]) - pow(sigma[type] / r, mm[type])) / rsq;
+    eng += (epsilon[type] / (nn[type] - mm[type])) *
+        (mm[type] * pow(sigma[type] / r, nn[type]) - nn[type] * pow(sigma[type] / r, mm[type]));
+  }
+
+  return eng;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void *BondFENENM::extract(const char *str, int &dim)
+{
+  dim = 1;
+  if (strcmp(str, "kappa") == 0) return (void *) k;
+  if (strcmp(str, "r0") == 0) return (void *) r0;
+  return nullptr;
+}
diff --git a/src/EXTRA-MOLECULE/bond_fene_nm.h b/src/EXTRA-MOLECULE/bond_fene_nm.h
new file mode 100644
index 0000000000..f00394c6d8
--- /dev/null
+++ b/src/EXTRA-MOLECULE/bond_fene_nm.h
@@ -0,0 +1,76 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+// clang-format off
+BondStyle(fene/nm,BondFENENM);
+// clang-format on
+#else
+
+#ifndef LMP_BOND_FENE_NM_H
+#define LMP_BOND_FENE_NM_H
+
+#include "bond_fene.h"
+
+namespace LAMMPS_NS {
+class BondFENENM : public BondFENE {
+ public:
+  BondFENENM(class LAMMPS *);
+  virtual ~BondFENENM();
+  virtual void compute(int, int);
+  virtual void coeff(int, char **);
+  void init_style();
+  double equilibrium_distance(int);
+  virtual void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_data(FILE *);
+  double single(int, double, int, int, double &);
+  virtual void *extract(const char *, int &);
+
+ protected:
+  double TWO_1_3;
+  double *nn, *mm;
+
+  virtual void allocate();
+};
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+W: FENE bond too long: %ld %d %d %g
+
+A FENE bond has stretched dangerously far.  It's interaction strength
+will be truncated to attempt to prevent the bond from blowing up.
+
+E: Bad FENE bond
+
+Two atoms in a FENE bond have become so far apart that the bond cannot
+be computed.
+
+E: Incorrect args for bond coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+W: Use special bonds = 0,1,1 with bond style fene
+
+Most FENE models need this setting for the special_bonds command.
+
+W: FENE bond too long: %ld %g
+
+A FENE bond has stretched dangerously far.  It's interaction strength
+will be truncated to attempt to prevent the bond from blowing up.
+
+*/
diff --git a/src/EXTRA-MOLECULE/dihedral_table_cut.cpp b/src/EXTRA-MOLECULE/dihedral_table_cut.cpp
index b132104bdd..522eff8626 100644
--- a/src/EXTRA-MOLECULE/dihedral_table_cut.cpp
+++ b/src/EXTRA-MOLECULE/dihedral_table_cut.cpp
@@ -82,20 +82,14 @@ enum { //GSL status return codes.
   GSL_EBADLEN  = 19
 };
 
-
-
 // cyc_splintD(): Evaluate the deriviative of a cyclic spline at position x,
 //           with n control points at xa[], ya[], with parameters y2a[].
 //           The xa[] must be monotonically increasing and their
 //           range should not exceed period (ie xa[n-1] < xa[0] + period).
 //           x must lie in the range:  [(xa[n-1]-period), (xa[0]+period)]
 //           "period" is typically 2*PI.
-static double cyc_splintD(double const *xa,
-                          double const *ya,
-                          double const *y2a,
-                          int n,
-                          double period,
-                          double x)
+static double cyc_splintD(double const *xa, double const *ya, double const *y2a,
+                          int n, double period, double x)
 {
   int klo = -1;
   int khi = n; // (not n-1)
@@ -490,8 +484,7 @@ void DihedralTableCut::coeff(int narg, char **arg)
   if (tb->ninput < 2)
     error->all(FLERR,"Invalid dihedral table length: {}",arg[5]);
   else if ((tb->ninput == 2) && (tabstyle == SPLINE))
-    error->all(FLERR,"Invalid dihedral spline table length: {} "
-                                 "(Try linear)",arg[5]);
+    error->all(FLERR,"Invalid dihedral spline table length: {} (Try linear)",arg[5]);
 
   // check for monotonicity
   for (int i=0; i < tb->ninput-1; i++) {
@@ -509,12 +502,10 @@ void DihedralTableCut::coeff(int narg, char **arg)
   double phihi = tb->phifile[tb->ninput-1];
   if (tb->use_degrees) {
     if ((phihi - philo) >= 360)
-      error->all(FLERR,"Dihedral table angle range must be < 360 "
-                                   "degrees ({})",arg[5]);
+      error->all(FLERR,"Dihedral table angle range must be < 360 degrees ({})",arg[5]);
   } else {
     if ((phihi - philo) >= MY_2PI)
-      error->all(FLERR,"Dihedral table angle range must be < 2*PI "
-                                   "radians ({})",arg[5]);
+      error->all(FLERR,"Dihedral table angle range must be < 2*PI radians ({})",arg[5]);
   }
 
   // convert phi from degrees to radians
@@ -532,9 +523,9 @@ void DihedralTableCut::coeff(int narg, char **arg)
   // We also want the angles to be sorted in increasing order.
   // This messy code fixes these problems with the user's data:
   {
-    double *phifile_tmp = new double [tb->ninput];  //temporary arrays
-    double *ffile_tmp = new double [tb->ninput];  //used for sorting
-    double *efile_tmp = new double [tb->ninput];
+    double *phifile_tmp = new double[tb->ninput];  //temporary arrays
+    double *ffile_tmp = new double[tb->ninput];  //used for sorting
+    double *efile_tmp = new double[tb->ninput];
 
     // After re-imaging, does the range of angles cross the 0 or 2*PI boundary?
     // If so, find the discontinuity:
diff --git a/src/EXTRA-PAIR/pair_nm_cut_split.cpp b/src/EXTRA-PAIR/pair_nm_cut_split.cpp
new file mode 100644
index 0000000000..ca6feb1549
--- /dev/null
+++ b/src/EXTRA-PAIR/pair_nm_cut_split.cpp
@@ -0,0 +1,160 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing Author: Julien Devemy (ICCF), Robert S. Hoy (USF), Joseph D. Dietz (USF)
+------------------------------------------------------------------------- */
+
+#include "pair_nm_cut_split.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "neigh_list.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using MathSpecial::powint;
+
+/* ---------------------------------------------------------------------- */
+PairNMCutSplit::PairNMCutSplit(LAMMPS *lmp) : PairNMCut(lmp)
+{
+  writedata = 1;
+}
+
+void PairNMCutSplit::compute(int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,factor_lj;
+  double r,forcenm,rminv,rninv;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+  ev_init(eflag,vflag);
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_lj = force->special_lj;
+  int newton_pair = force->newton_pair;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r = sqrt(rsq);
+
+        // r < r0 --> use generalized LJ
+        if (rsq < r0[itype][jtype]*r0[itype][jtype]) {
+          forcenm = e0nm[itype][jtype]*nm[itype][jtype]*
+            (r0n[itype][jtype]/pow(r,nn[itype][jtype])
+             -r0m[itype][jtype]/pow(r,mm[itype][jtype]));
+        }
+        // r > r0 --> use standard LJ (m = 6 n = 12)
+        else forcenm =(e0[itype][jtype]/6.0)*72.0*(4.0/powint(r,12)-2.0/powint(r,6));
+
+        fpair = factor_lj*forcenm*r2inv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (eflag) {
+          // r < r0 --> use generalized LJ
+          if (rsq < r0[itype][jtype]*r0[itype][jtype]) {
+            rminv = pow(r2inv,mm[itype][jtype]/2.0);
+            rninv = pow(r2inv,nn[itype][jtype]/2.0);
+
+            evdwl = e0nm[itype][jtype]*(mm[itype][jtype]*r0n[itype][jtype]*rninv -
+                                        nn[itype][jtype]*r0m[itype][jtype]*rminv) -
+              offset[itype][jtype];
+          }
+          // r > r0 --> use standard LJ (m = 6 n = 12)
+          else evdwl = (e0[itype][jtype]/6.0)*(24.0*powint(r2inv,6) - 24.0*pow(r2inv,3));
+          evdwl *= factor_lj;
+        }
+        if (evflag) ev_tally(i,j,nlocal,newton_pair,evdwl,0.0,fpair,delx,dely,delz);
+      }
+    }
+  }
+
+  if (vflag_fdotr) virial_fdotr_compute();
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairNMCutSplit::single(int /*i*/, int /*j*/, int itype, int jtype, double rsq, double /*factor_coul*/, double factor_lj, double &fforce)
+{
+  double r2inv,rminv,rninv,r,forcenm,phinm;
+
+  r2inv = 1.0/rsq;
+  r = sqrt(rsq);
+  rminv = pow(r2inv,mm[itype][jtype]/2.0);
+  rninv = pow(r2inv,nn[itype][jtype]/2.0);
+  // r < r_0, use generalized LJ
+  if (rsq < r0[itype][jtype]*r0[itype][jtype]) {  // note the addition of the r0 factor
+     forcenm = e0nm[itype][jtype]*nm[itype][jtype]*
+      (r0n[itype][jtype]/pow(r,nn[itype][jtype])-r0m[itype][jtype]/pow(r,mm[itype][jtype]));
+      phinm = e0nm[itype][jtype]*(mm[itype][jtype]*r0n[itype][jtype]*rninv
+      -nn[itype][jtype]*r0m[itype][jtype]*rminv)-offset[itype][jtype];
+
+  }
+  // r > r_0 --> use standard LJ (m = 6 n = 12)
+  else {
+    forcenm = (e0[itype][jtype]/6.0)*72.0*(4.0/powint(r,12)-2.0/powint(r,6));
+    phinm = (e0[itype][jtype]/6.0)*(24.0*powint(r2inv,6)-24.0*powint(r2inv,3));
+  }
+
+  fforce = factor_lj*forcenm*r2inv;
+  return factor_lj*phinm;
+}
+
diff --git a/src/EXTRA-PAIR/pair_nm_cut_split.h b/src/EXTRA-PAIR/pair_nm_cut_split.h
new file mode 100644
index 0000000000..a05634ada2
--- /dev/null
+++ b/src/EXTRA-PAIR/pair_nm_cut_split.h
@@ -0,0 +1,53 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(nm/cut/split,PairNMCutSplit);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_NM_CUT_SPLIT_H
+#define LMP_PAIR_NM_CUT_SPLIT_H
+
+#include "pair_nm_cut.h"
+namespace LAMMPS_NS {
+
+class PairNMCutSplit : public PairNMCut {
+ public:
+  PairNMCutSplit(class LAMMPS *);
+  double single(int, int, int, int, double, double, double, double &);
+  virtual void compute(int, int);
+};
+}    // namespace LAMMPS_NS
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+*/
diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh
index a87d2165d9..f13e87d317 100755
--- a/src/GPU/Install.sh
+++ b/src/GPU/Install.sh
@@ -114,7 +114,7 @@ action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h
 action pair_lj_cut_gpu.cpp
 action pair_lj_cut_gpu.h
 action pair_lj_smooth_gpu.cpp pair_lj_smooth.cpp
-action pair_lj_smooth_gpu.h pair_lj_smooth.h
+action pair_lj_smooth_gpu.h pair_lj_smooth.cpp
 action pair_lj_expand_gpu.cpp
 action pair_lj_expand_gpu.h
 action pair_lj_expand_coul_long_gpu.cpp pair_lj_expand_coul_long.cpp
diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp
index 7180a04873..808d435016 100644
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@@ -206,14 +206,16 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
   #endif
 
   // set newton pair flag
-  // require newtonflag = 0 since currently required by all GPU pair styles
-
-  if (newtonflag == 1) error->all(FLERR,"Illegal package gpu command");
 
   force->newton_pair = newtonflag;
   if (force->newton_pair || force->newton_bond) force->newton = 1;
   else force->newton = 0;
 
+  // require newton pair off if _particle_split < 1
+
+  if (force->newton_pair == 1 && _particle_split < 1)
+    error->all(FLERR,"Cannot use newton pair on for split less than 1 for now");
+
   if (pair_only_flag) {
     lmp->suffixp = lmp->suffix;
     lmp->suffix = nullptr;
@@ -335,7 +337,6 @@ void FixGPU::post_force(int /* vflag */)
   force->pair->virial[4] += lvirial[4];
   force->pair->virial[5] += lvirial[5];
 
-  if (force->pair->vflag_fdotr) force->pair->virial_fdotr_compute();
   timer->stamp(Timer::PAIR);
 }
 
diff --git a/src/GPU/fix_nve_gpu.cpp b/src/GPU/fix_nve_gpu.cpp
index 6612b8f65d..9392953398 100644
--- a/src/GPU/fix_nve_gpu.cpp
+++ b/src/GPU/fix_nve_gpu.cpp
@@ -37,7 +37,7 @@ using namespace FixConst;
 FixNVEGPU::FixNVEGPU(LAMMPS *lmp, int narg, char **arg) :
   FixNVE(lmp, narg, arg)
 {
-  _dtfm = 0;
+  _dtfm = nullptr;
   _nlocal_max = 0;
 }
 
@@ -57,7 +57,11 @@ void FixNVEGPU::setup(int vflag)
     _respa_on = 1;
   else
     _respa_on = 0;
-  if (atom->ntypes > 1) reset_dt();
+
+  // ensure that _dtfm array is initialized if the group is not "all"
+  // or there is more than one atom type as that re-ordeted array is used for
+  // per-type/per-atom masses and group membership detection.
+  if ((igroup != 0) || (atom->ntypes > 1)) reset_dt();
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/GPU/pair_beck_gpu.cpp b/src/GPU/pair_beck_gpu.cpp
index e3dfda428f..a9695a0b73 100644
--- a/src/GPU/pair_beck_gpu.cpp
+++ b/src/GPU/pair_beck_gpu.cpp
@@ -132,8 +132,6 @@ void PairBeckGPU::compute(int eflag, int vflag)
 
 void PairBeckGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style beck/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_born_coul_long_cs_gpu.cpp b/src/GPU/pair_born_coul_long_cs_gpu.cpp
index 219f8a81eb..c358aa0945 100644
--- a/src/GPU/pair_born_coul_long_cs_gpu.cpp
+++ b/src/GPU/pair_born_coul_long_cs_gpu.cpp
@@ -157,8 +157,6 @@ void PairBornCoulLongCSGPU::init_style()
 {
   if (!atom->q_flag)
     error->all(FLERR, "Pair style born/coul/long/cs/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR, "Pair style born/coul/long/cs/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_born_coul_long_gpu.cpp b/src/GPU/pair_born_coul_long_gpu.cpp
index 6ae3eda6e2..bb4359696f 100644
--- a/src/GPU/pair_born_coul_long_gpu.cpp
+++ b/src/GPU/pair_born_coul_long_gpu.cpp
@@ -152,8 +152,6 @@ void PairBornCoulLongGPU::init_style()
 {
   if (!atom->q_flag)
     error->all(FLERR, "Pair style born/coul/long/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR, "Pair style born/coul/long/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp
index 178fc180d4..f7d8ed5faa 100644
--- a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp
+++ b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp
@@ -147,8 +147,6 @@ void PairBornCoulWolfCSGPU::init_style()
 {
   if (!atom->q_flag)
     error->all(FLERR, "Pair style born/coul/wolf/cs/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR, "Pair style born/coul/wolf/cs/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_born_coul_wolf_gpu.cpp b/src/GPU/pair_born_coul_wolf_gpu.cpp
index 474161a1ed..17a5165e04 100644
--- a/src/GPU/pair_born_coul_wolf_gpu.cpp
+++ b/src/GPU/pair_born_coul_wolf_gpu.cpp
@@ -145,8 +145,6 @@ void PairBornCoulWolfGPU::init_style()
 {
   if (!atom->q_flag)
     error->all(FLERR, "Pair style born/coul/wolf/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR, "Pair style born/coul/wolf/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_born_gpu.cpp b/src/GPU/pair_born_gpu.cpp
index e7f4cd8bda..436dfef6fb 100644
--- a/src/GPU/pair_born_gpu.cpp
+++ b/src/GPU/pair_born_gpu.cpp
@@ -134,8 +134,6 @@ void PairBornGPU::compute(int eflag, int vflag)
 
 void PairBornGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style born/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_buck_coul_cut_gpu.cpp b/src/GPU/pair_buck_coul_cut_gpu.cpp
index d23907896d..3c3ae40370 100644
--- a/src/GPU/pair_buck_coul_cut_gpu.cpp
+++ b/src/GPU/pair_buck_coul_cut_gpu.cpp
@@ -139,8 +139,6 @@ void PairBuckCoulCutGPU::init_style()
 {
   if (!atom->q_flag)
     error->all(FLERR, "Pair style buck/coul/cut/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR, "Pair style buck/coul/cut/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_buck_coul_long_gpu.cpp b/src/GPU/pair_buck_coul_long_gpu.cpp
index dfa3467a2f..96ac39adae 100644
--- a/src/GPU/pair_buck_coul_long_gpu.cpp
+++ b/src/GPU/pair_buck_coul_long_gpu.cpp
@@ -148,8 +148,6 @@ void PairBuckCoulLongGPU::init_style()
 {
   if (!atom->q_flag)
     error->all(FLERR, "Pair style buck/coul/long/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR, "Pair style buck/coul/long/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_buck_gpu.cpp b/src/GPU/pair_buck_gpu.cpp
index fa1e842743..7af0455a84 100644
--- a/src/GPU/pair_buck_gpu.cpp
+++ b/src/GPU/pair_buck_gpu.cpp
@@ -132,8 +132,6 @@ void PairBuckGPU::compute(int eflag, int vflag)
 
 void PairBuckGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style buck/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_colloid_gpu.cpp b/src/GPU/pair_colloid_gpu.cpp
index 250c56a2f3..e3e266f487 100644
--- a/src/GPU/pair_colloid_gpu.cpp
+++ b/src/GPU/pair_colloid_gpu.cpp
@@ -133,8 +133,6 @@ void PairColloidGPU::compute(int eflag, int vflag)
 
 void PairColloidGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style colloid/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_coul_cut_gpu.cpp b/src/GPU/pair_coul_cut_gpu.cpp
index 001eaee01c..a0a6add94f 100644
--- a/src/GPU/pair_coul_cut_gpu.cpp
+++ b/src/GPU/pair_coul_cut_gpu.cpp
@@ -137,8 +137,6 @@ void PairCoulCutGPU::init_style()
   if (!atom->q_flag)
     error->all(FLERR,"Pair style coul/cut/gpu requires atom attribute q");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style coul/cut/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_coul_debye_gpu.cpp b/src/GPU/pair_coul_debye_gpu.cpp
index a28b1115f7..2684a2fd22 100644
--- a/src/GPU/pair_coul_debye_gpu.cpp
+++ b/src/GPU/pair_coul_debye_gpu.cpp
@@ -138,8 +138,6 @@ void PairCoulDebyeGPU::init_style()
   if (!atom->q_flag)
     error->all(FLERR,"Pair style coul/debye/gpu requires atom attribute q");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style coul/debye/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_coul_dsf_gpu.cpp b/src/GPU/pair_coul_dsf_gpu.cpp
index 47e3443f2e..b9e64674aa 100644
--- a/src/GPU/pair_coul_dsf_gpu.cpp
+++ b/src/GPU/pair_coul_dsf_gpu.cpp
@@ -149,8 +149,6 @@ void PairCoulDSFGPU::init_style()
   if (!atom->q_flag)
     error->all(FLERR,"Pair style coul/dsf/gpu requires atom attribute q");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style coul/dsf/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_coul_long_cs_gpu.cpp b/src/GPU/pair_coul_long_cs_gpu.cpp
index 84df90915b..1a57cada93 100644
--- a/src/GPU/pair_coul_long_cs_gpu.cpp
+++ b/src/GPU/pair_coul_long_cs_gpu.cpp
@@ -152,8 +152,6 @@ void PairCoulLongCSGPU::init_style()
 
   if (!atom->q_flag)
     error->all(FLERR,"Pair style coul/long/cs/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style coul/long/cs/gpu requires newton pair off");
 
   // Call init_one calculation make sure scale is correct
   for (int i = 1; i <= atom->ntypes; i++) {
diff --git a/src/GPU/pair_coul_long_gpu.cpp b/src/GPU/pair_coul_long_gpu.cpp
index 54bd3299fa..28351ca0c1 100644
--- a/src/GPU/pair_coul_long_gpu.cpp
+++ b/src/GPU/pair_coul_long_gpu.cpp
@@ -147,8 +147,6 @@ void PairCoulLongGPU::init_style()
 
   if (!atom->q_flag)
     error->all(FLERR,"Pair style coul/long/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style coul/long/gpu requires newton pair off");
 
   // Call init_one calculation make sure scale is correct
   for (int i = 1; i <= atom->ntypes; i++) {
diff --git a/src/GPU/pair_dpd_gpu.cpp b/src/GPU/pair_dpd_gpu.cpp
index e17cbd91e9..bd1227123b 100644
--- a/src/GPU/pair_dpd_gpu.cpp
+++ b/src/GPU/pair_dpd_gpu.cpp
@@ -279,8 +279,6 @@ void PairDPDGPU::compute(int eflag, int vflag)
 
 void PairDPDGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style dpd/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_dpd_tstat_gpu.cpp b/src/GPU/pair_dpd_tstat_gpu.cpp
index 12390f5d68..c1d07c42a5 100644
--- a/src/GPU/pair_dpd_tstat_gpu.cpp
+++ b/src/GPU/pair_dpd_tstat_gpu.cpp
@@ -298,8 +298,6 @@ void PairDPDTstatGPU::compute(int eflag, int vflag)
 
 void PairDPDTstatGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style dpd/tstat/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_eam_alloy_gpu.cpp b/src/GPU/pair_eam_alloy_gpu.cpp
index fcfbc69dc7..ae2e74c1e9 100644
--- a/src/GPU/pair_eam_alloy_gpu.cpp
+++ b/src/GPU/pair_eam_alloy_gpu.cpp
@@ -157,8 +157,6 @@ void PairEAMAlloyGPU::compute(int eflag, int vflag)
 
 void PairEAMAlloyGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style eam/alloy/gpu requires newton pair off");
 
   // convert read-in file(s) to arrays and spline them
 
diff --git a/src/GPU/pair_eam_fs_gpu.cpp b/src/GPU/pair_eam_fs_gpu.cpp
index c84fd328d6..4875f4236e 100644
--- a/src/GPU/pair_eam_fs_gpu.cpp
+++ b/src/GPU/pair_eam_fs_gpu.cpp
@@ -156,8 +156,6 @@ void PairEAMFSGPU::compute(int eflag, int vflag)
 
 void PairEAMFSGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style eam/fs/gpu requires newton pair off");
 
   // convert read-in file(s) to arrays and spline them
 
diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp
index f12262278c..ae5812c0c0 100644
--- a/src/GPU/pair_eam_gpu.cpp
+++ b/src/GPU/pair_eam_gpu.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -155,9 +154,6 @@ void PairEAMGPU::compute(int eflag, int vflag)
 
 void PairEAMGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style eam/gpu requires newton pair off");
-
   // convert read-in file(s) to arrays and spline them
 
   file2array();
diff --git a/src/GPU/pair_gauss_gpu.cpp b/src/GPU/pair_gauss_gpu.cpp
index d1ddb1939d..e06050b2fd 100644
--- a/src/GPU/pair_gauss_gpu.cpp
+++ b/src/GPU/pair_gauss_gpu.cpp
@@ -130,8 +130,6 @@ void PairGaussGPU::compute(int eflag, int vflag)
 
 void PairGaussGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style gauss/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp
index b17bb2f240..1a2c67337f 100644
--- a/src/GPU/pair_gayberne_gpu.cpp
+++ b/src/GPU/pair_gayberne_gpu.cpp
@@ -160,8 +160,6 @@ void PairGayBerneGPU::init_style()
   avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid");
   if (!avec)
     error->all(FLERR,"Pair gayberne/gpu requires atom style ellipsoid");
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style gayberne/gpu requires newton pair off");
   if (!atom->ellipsoid_flag)
     error->all(FLERR,"Pair gayberne/gpu requires atom style ellipsoid");
 
diff --git a/src/GPU/pair_lj96_cut_gpu.cpp b/src/GPU/pair_lj96_cut_gpu.cpp
index 546c31a94e..ab3cb15b2a 100644
--- a/src/GPU/pair_lj96_cut_gpu.cpp
+++ b/src/GPU/pair_lj96_cut_gpu.cpp
@@ -131,8 +131,6 @@ void PairLJ96CutGPU::init_style()
 {
   cut_respa = nullptr;
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj96/cut/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp
index 75c38b0a42..6b51933128 100644
--- a/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp
+++ b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp
@@ -129,8 +129,6 @@ void PairLJCharmmCoulCharmmGPU::init_style()
 {
   if (!atom->q_flag)
     error->all(FLERR, "Pair style lj/charmm/coul/long/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR, "Pair style lj/charmm/coul/long/gpu requires newton pair off");
 
   // Repeated cutsq calculation in init_one() is required for GPU package
 
diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp
index e3670cc9b8..861ef0c36d 100644
--- a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp
@@ -152,8 +152,6 @@ void PairLJCharmmCoulLongGPU::init_style()
 
   if (!atom->q_flag)
     error->all(FLERR,"Pair style lj/charmm/coul/long/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/charmm/coul/long/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
 
diff --git a/src/GPU/pair_lj_class2_coul_long_gpu.cpp b/src/GPU/pair_lj_class2_coul_long_gpu.cpp
index 73e0c32e17..d1258dde57 100644
--- a/src/GPU/pair_lj_class2_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_class2_coul_long_gpu.cpp
@@ -147,8 +147,6 @@ void PairLJClass2CoulLongGPU::init_style()
 {
   if (!atom->q_flag)
     error->all(FLERR,"Pair style lj/class2/coul/long/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/class2/coul/long/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_class2_gpu.cpp b/src/GPU/pair_lj_class2_gpu.cpp
index 91d18b1ea1..82edfa0809 100644
--- a/src/GPU/pair_lj_class2_gpu.cpp
+++ b/src/GPU/pair_lj_class2_gpu.cpp
@@ -129,8 +129,6 @@ void PairLJClass2GPU::compute(int eflag, int vflag)
 
 void PairLJClass2GPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/class2/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_cubic_gpu.cpp b/src/GPU/pair_lj_cubic_gpu.cpp
index 2d117a05b2..7ec121a6c6 100644
--- a/src/GPU/pair_lj_cubic_gpu.cpp
+++ b/src/GPU/pair_lj_cubic_gpu.cpp
@@ -135,8 +135,6 @@ void PairLJCubicGPU::compute(int eflag, int vflag)
 
 void PairLJCubicGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/cubic/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
index d49568d49c..62740c98b5 100644
--- a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
@@ -139,8 +139,6 @@ void PairLJCutCoulCutGPU::init_style()
   if (!atom->q_flag)
     error->all(FLERR,"Pair style lj/cut/coul/cut/gpu requires atom attribute q");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/cut/coul/cut/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp
index 23a95611e6..24db5ac68c 100644
--- a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp
@@ -141,8 +141,6 @@ void PairLJCutCoulDebyeGPU::init_style()
   if (!atom->q_flag)
     error->all(FLERR,"Pair style lj/cut/coul/debye/gpu requires atom attribute q");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/cut/coul/debye/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp
index 1af26869f9..b6bc14de7d 100644
--- a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp
@@ -150,8 +150,6 @@ void PairLJCutCoulDSFGPU::init_style()
   if (!atom->q_flag)
     error->all(FLERR,"Pair style lj/cut/coul/dsf/gpu requires atom attribute q");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/cut/coul/dsf/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_cut_coul_long_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_gpu.cpp
index a6439cf9cc..c70944229d 100644
--- a/src/GPU/pair_lj_cut_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_long_gpu.cpp
@@ -152,8 +152,6 @@ void PairLJCutCoulLongGPU::init_style()
 
   if (!atom->q_flag)
     error->all(FLERR,"Pair style lj/cut/coul/long/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/cut/coul/long/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp
index 035d218047..8038db1424 100644
--- a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp
@@ -144,8 +144,6 @@ void PairLJCutCoulMSMGPU::init_style()
   if (!atom->q_flag)
     error->all(FLERR,"Pair style lj/cut/coul/cut/gpu requires atom attribute q");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/cut/coul/msm/gpu requires newton pair off");
 
   if (force->kspace->scalar_pressure_flag)
     error->all(FLERR,"Must use 'kspace_modify pressure/scalar no' with GPU MSM Pair styles");
diff --git a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp
index 05916af72f..d2f9cbb720 100644
--- a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp
@@ -143,8 +143,6 @@ void PairLJCutDipoleCutGPU::init_style()
   if (!atom->q_flag || !atom->mu_flag || !atom->torque_flag)
     error->all(FLERR,"Pair dipole/cut/gpu requires atom attributes q, mu, torque");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style dipole/cut/gpu requires newton pair off");
 
   if (strcmp(update->unit_style,"electron") == 0)
     error->all(FLERR,"Cannot (yet) use 'electron' units with dipoles");
diff --git a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp
index a9839cd086..c9830a8627 100644
--- a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp
+++ b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp
@@ -155,8 +155,6 @@ void PairLJCutDipoleLongGPU::init_style()
   if (!atom->q_flag || !atom->mu_flag || !atom->torque_flag)
     error->all(FLERR,"Pair dipole/cut/gpu requires atom attributes q, mu, torque");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style dipole/cut/gpu requires newton pair off");
 
   if (strcmp(update->unit_style,"electron") == 0)
     error->all(FLERR,"Cannot (yet) use 'electron' units with dipoles");
diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp
index 21df58c64d..ec04cb969d 100644
--- a/src/GPU/pair_lj_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_gpu.cpp
@@ -135,8 +135,6 @@ void PairLJCutGPU::init_style()
 {
   cut_respa = nullptr;
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/cut/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp
index c37ccb9231..cfb2f06768 100644
--- a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp
+++ b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp
@@ -168,8 +168,6 @@ void PairLJCutTIP4PLongGPU::init_style()
     error->all(FLERR,"Pair style lj/cut/tip4p/long/gpu requires atom IDs");
   if (!atom->q_flag)
     error->all(FLERR, "Pair style lj/cut/tip4p/long/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/cut/tip4p/long/gpu requires newton pair off");
   if (force->bond == nullptr)
     error->all(FLERR,"Must use a bond style with TIP4P potential");
   if (force->angle == nullptr)
diff --git a/src/GPU/pair_lj_expand_coul_long_gpu.cpp b/src/GPU/pair_lj_expand_coul_long_gpu.cpp
index df37aea219..068012227e 100644
--- a/src/GPU/pair_lj_expand_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_expand_coul_long_gpu.cpp
@@ -152,8 +152,6 @@ void PairLJExpandCoulLongGPU::init_style()
 
   if (!atom->q_flag)
     error->all(FLERR,"Pair style lj/cut/coul/long/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/cut/coul/long/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_expand_gpu.cpp b/src/GPU/pair_lj_expand_gpu.cpp
index b53eb833a0..e2dae0870c 100644
--- a/src/GPU/pair_lj_expand_gpu.cpp
+++ b/src/GPU/pair_lj_expand_gpu.cpp
@@ -132,8 +132,6 @@ void PairLJExpandGPU::compute(int eflag, int vflag)
 
 void PairLJExpandGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/expand/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_gromacs_gpu.cpp b/src/GPU/pair_lj_gromacs_gpu.cpp
index 6d78833642..cdccb51725 100644
--- a/src/GPU/pair_lj_gromacs_gpu.cpp
+++ b/src/GPU/pair_lj_gromacs_gpu.cpp
@@ -134,8 +134,6 @@ void PairLJGromacsGPU::compute(int eflag, int vflag)
 
 void PairLJGromacsGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/gromacs/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_sdk_coul_long_gpu.cpp b/src/GPU/pair_lj_sdk_coul_long_gpu.cpp
index 28d1a44fa8..998cba05a0 100644
--- a/src/GPU/pair_lj_sdk_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_sdk_coul_long_gpu.cpp
@@ -156,8 +156,6 @@ void PairLJSDKCoulLongGPU::init_style()
 {
   if (!atom->q_flag)
     error->all(FLERR,"Pair style lj/sdk/coul/long/gpu requires atom attribute q");
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/sdk/coul/long/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_sdk_gpu.cpp b/src/GPU/pair_lj_sdk_gpu.cpp
index 938ee83e4a..66cfe53009 100644
--- a/src/GPU/pair_lj_sdk_gpu.cpp
+++ b/src/GPU/pair_lj_sdk_gpu.cpp
@@ -137,8 +137,6 @@ void PairLJSDKGPU::compute(int eflag, int vflag)
 
 void PairLJSDKGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style lj/sdk/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp
index 0bb0e66d92..f387dc8715 100644
--- a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp
+++ b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp
@@ -142,8 +142,6 @@ void PairLJSFDipoleSFGPU::init_style()
   if (!atom->q_flag || !atom->mu_flag || !atom->torque_flag)
     error->all(FLERR,"Pair dipole/sf/gpu requires atom attributes q, mu, torque");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style dipole/sf/gpu requires newton pair off");
 
   if (strcmp(update->unit_style,"electron") == 0)
     error->all(FLERR,"Cannot (yet) use 'electron' units with dipoles");
diff --git a/src/GPU/pair_lj_smooth_gpu.cpp b/src/GPU/pair_lj_smooth_gpu.cpp
index 7edf6aa01f..3b438b909e 100644
--- a/src/GPU/pair_lj_smooth_gpu.cpp
+++ b/src/GPU/pair_lj_smooth_gpu.cpp
@@ -130,8 +130,6 @@ void PairLJSmoothGPU::init_style()
 {
   //cut_respa = nullptr;
 
-  if (force->newton_pair) error->all(FLERR, "Pair style lj/smooth/gpu requires newton pair off");
-
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
   double cut;
diff --git a/src/GPU/pair_mie_cut_gpu.cpp b/src/GPU/pair_mie_cut_gpu.cpp
index a059607880..df5438e095 100644
--- a/src/GPU/pair_mie_cut_gpu.cpp
+++ b/src/GPU/pair_mie_cut_gpu.cpp
@@ -132,8 +132,6 @@ void PairMIECutGPU::init_style()
 {
   cut_respa = nullptr;
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style mie/cut/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_morse_gpu.cpp b/src/GPU/pair_morse_gpu.cpp
index 641e5e8654..d7522d1a08 100644
--- a/src/GPU/pair_morse_gpu.cpp
+++ b/src/GPU/pair_morse_gpu.cpp
@@ -128,8 +128,6 @@ void PairMorseGPU::compute(int eflag, int vflag)
 
 void PairMorseGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style morse/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_resquared_gpu.cpp b/src/GPU/pair_resquared_gpu.cpp
index 34a02c69b0..e43025f014 100644
--- a/src/GPU/pair_resquared_gpu.cpp
+++ b/src/GPU/pair_resquared_gpu.cpp
@@ -159,8 +159,6 @@ void PairRESquaredGPU::compute(int eflag, int vflag)
 
 void PairRESquaredGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style resquared/gpu requires newton pair off");
   if (!atom->ellipsoid_flag)
     error->all(FLERR,"Pair resquared/gpu requires atom style ellipsoid");
 
diff --git a/src/GPU/pair_soft_gpu.cpp b/src/GPU/pair_soft_gpu.cpp
index 654e2e603b..b2e491bd83 100644
--- a/src/GPU/pair_soft_gpu.cpp
+++ b/src/GPU/pair_soft_gpu.cpp
@@ -133,8 +133,6 @@ void PairSoftGPU::compute(int eflag, int vflag)
 
 void PairSoftGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style soft/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_sw_gpu.cpp b/src/GPU/pair_sw_gpu.cpp
index d4b322c0a0..69cafd2972 100644
--- a/src/GPU/pair_sw_gpu.cpp
+++ b/src/GPU/pair_sw_gpu.cpp
@@ -148,8 +148,6 @@ void PairSWGPU::init_style()
 
   if (atom->tag_enable == 0)
     error->all(FLERR,"Pair style sw/gpu requires atom IDs");
-  if (force->newton_pair != 0)
-    error->all(FLERR,"Pair style sw/gpu requires newton pair off");
 
   double **c1, **c2, **c3, **c4, **c5, **c6;
   double **ncutsq, **ncut, **sigma, **powerp, **powerq, **sigma_gamma;
diff --git a/src/GPU/pair_table_gpu.cpp b/src/GPU/pair_table_gpu.cpp
index ad1a8bfef9..7a7881993d 100644
--- a/src/GPU/pair_table_gpu.cpp
+++ b/src/GPU/pair_table_gpu.cpp
@@ -132,8 +132,6 @@ void PairTableGPU::compute(int eflag, int vflag)
 
 void PairTableGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style table/gpu requires newton pair off");
 
   int ntypes = atom->ntypes;
 
diff --git a/src/GPU/pair_tersoff_gpu.cpp b/src/GPU/pair_tersoff_gpu.cpp
index 7b3f953840..891b5051e6 100644
--- a/src/GPU/pair_tersoff_gpu.cpp
+++ b/src/GPU/pair_tersoff_gpu.cpp
@@ -154,8 +154,6 @@ void PairTersoffGPU::init_style()
 
   if (atom->tag_enable == 0)
     error->all(FLERR,"Pair style tersoff/gpu requires atom IDs");
-  if (force->newton_pair != 0)
-    error->all(FLERR,"Pair style tersoff/gpu requires newton pair off");
 
   double *lam1, *lam2, *lam3, *powermint;
   double *biga, *bigb, *bigr, *bigd;
diff --git a/src/GPU/pair_tersoff_mod_gpu.cpp b/src/GPU/pair_tersoff_mod_gpu.cpp
index 4f3f769d89..703ce54ad1 100644
--- a/src/GPU/pair_tersoff_mod_gpu.cpp
+++ b/src/GPU/pair_tersoff_mod_gpu.cpp
@@ -148,8 +148,6 @@ void PairTersoffMODGPU::init_style()
 
   if (atom->tag_enable == 0)
     error->all(FLERR,"Pair style tersoff/mod/gpu requires atom IDs");
-  if (force->newton_pair != 0)
-    error->all(FLERR,"Pair style tersoff/mod/gpu requires newton pair off");
 
   double *lam1, *lam2, *lam3, *powermint;
   double *biga, *bigb, *bigr, *bigd;
diff --git a/src/GPU/pair_tersoff_zbl_gpu.cpp b/src/GPU/pair_tersoff_zbl_gpu.cpp
index 7139e07f7d..a272815f94 100644
--- a/src/GPU/pair_tersoff_zbl_gpu.cpp
+++ b/src/GPU/pair_tersoff_zbl_gpu.cpp
@@ -155,8 +155,6 @@ void PairTersoffZBLGPU::init_style()
 
   if (atom->tag_enable == 0)
     error->all(FLERR,"Pair style tersoff/zbl/gpu requires atom IDs");
-  if (force->newton_pair != 0)
-    error->all(FLERR,"Pair style tersoff/zbl/gpu requires newton pair off");
 
   double *lam1, *lam2, *lam3, *powermint;
   double *biga, *bigb, *bigr, *bigd;
diff --git a/src/GPU/pair_ufm_gpu.cpp b/src/GPU/pair_ufm_gpu.cpp
index 71908dd557..c7483c737b 100644
--- a/src/GPU/pair_ufm_gpu.cpp
+++ b/src/GPU/pair_ufm_gpu.cpp
@@ -136,8 +136,6 @@ void PairUFMGPU::init_style()
 {
 //  cut_respa = nullptr;
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style ufm/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_vashishta_gpu.cpp b/src/GPU/pair_vashishta_gpu.cpp
index 277ac036cd..cc3ed487e4 100644
--- a/src/GPU/pair_vashishta_gpu.cpp
+++ b/src/GPU/pair_vashishta_gpu.cpp
@@ -156,8 +156,6 @@ void PairVashishtaGPU::init_style()
 
   if (atom->tag_enable == 0)
     error->all(FLERR,"Pair style vashishta/gpu requires atom IDs");
-  if (force->newton_pair != 0)
-    error->all(FLERR,"Pair style vashishta/gpu requires newton pair off");
 
   double *cutsq, *r0, *gamma, *eta;
   double *lam1inv, *lam4inv, *zizj, *mbigd;
diff --git a/src/GPU/pair_yukawa_colloid_gpu.cpp b/src/GPU/pair_yukawa_colloid_gpu.cpp
index b3f9d2ecfb..a31148b352 100644
--- a/src/GPU/pair_yukawa_colloid_gpu.cpp
+++ b/src/GPU/pair_yukawa_colloid_gpu.cpp
@@ -141,8 +141,6 @@ void PairYukawaColloidGPU::init_style()
   if (!atom->sphere_flag)
     error->all(FLERR,"Pair yukawa/colloid/gpu requires atom style sphere");
 
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style yukawa/colloid/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_yukawa_gpu.cpp b/src/GPU/pair_yukawa_gpu.cpp
index d8c586e270..267052b0e2 100644
--- a/src/GPU/pair_yukawa_gpu.cpp
+++ b/src/GPU/pair_yukawa_gpu.cpp
@@ -130,8 +130,6 @@ void PairYukawaGPU::compute(int eflag, int vflag)
 
 void PairYukawaGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style yukawa/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/GPU/pair_zbl_gpu.cpp b/src/GPU/pair_zbl_gpu.cpp
index 7cdc51d225..28a69c2ddd 100644
--- a/src/GPU/pair_zbl_gpu.cpp
+++ b/src/GPU/pair_zbl_gpu.cpp
@@ -132,8 +132,6 @@ void PairZBLGPU::compute(int eflag, int vflag)
 
 void PairZBLGPU::init_style()
 {
-  if (force->newton_pair)
-    error->all(FLERR,"Pair style zbl/gpu requires newton pair off");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
diff --git a/src/H5MD/dump_h5md.cpp b/src/H5MD/dump_h5md.cpp
index bc9c98caa0..a59de9773c 100644
--- a/src/H5MD/dump_h5md.cpp
+++ b/src/H5MD/dump_h5md.cpp
@@ -181,6 +181,7 @@ DumpH5MD::DumpH5MD(LAMMPS *lmp, int narg, char **arg) : Dump(lmp, narg, arg)
   // allocate global array for atom coords
 
   bigint n = group->count(igroup);
+  if ((bigint) domain->dimension*n > MAXSMALLINT) error->all(FLERR,"Too many atoms for dump h5md");
   natoms = static_cast<int> (n);
 
   if (every_position>=0)
diff --git a/src/H5MD/dump_h5md.h b/src/H5MD/dump_h5md.h
index 28b74649c6..5e3f3f8279 100644
--- a/src/H5MD/dump_h5md.h
+++ b/src/H5MD/dump_h5md.h
@@ -90,6 +90,11 @@ E: Dump h5md requires sorting by atom ID
 
 Use the dump_modify sort command to enable this.
 
+E: Too many atoms for dump h5md
+
+The system size must fit in a 32-bit integer to use this dump
+style.
+
 E: Cannot use variable every setting for dump xtc
 
 The format of this file requires snapshots at regular intervals.
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 04bf84ed31..45fa0654a9 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -88,6 +88,8 @@ action comm_kokkos.cpp
 action comm_kokkos.h
 action comm_tiled_kokkos.cpp
 action comm_tiled_kokkos.h
+action compute_ave_sphere_atom_kokkos.cpp compute_ave_sphere_atom.cpp
+action compute_ave_sphere_atom_kokkos.h compute_ave_sphere_atom.h
 action compute_coord_atom_kokkos.cpp
 action compute_coord_atom_kokkos.h
 action compute_orientorder_atom_kokkos.cpp
diff --git a/src/KOKKOS/angle_class2_kokkos.cpp b/src/KOKKOS/angle_class2_kokkos.cpp
index 146e8b40ea..2a386f6489 100644
--- a/src/KOKKOS/angle_class2_kokkos.cpp
+++ b/src/KOKKOS/angle_class2_kokkos.cpp
@@ -224,8 +224,8 @@ void AngleClass2Kokkos<DeviceType>::operator()(TagAngleClass2Compute<NEWTON_BOND
 
   // force & energy for bond-bond term
 
-  const F_FLOAT dr1 = r1 - d_bb_r1[type];
-  const F_FLOAT dr2 = r2 - d_bb_r2[type];
+  F_FLOAT dr1 = r1 - d_bb_r1[type];
+  F_FLOAT dr2 = r2 - d_bb_r2[type];
   const F_FLOAT tk1 = d_bb_k[type] * dr1;
   const F_FLOAT tk2 = d_bb_k[type] * dr2;
 
@@ -241,6 +241,8 @@ void AngleClass2Kokkos<DeviceType>::operator()(TagAngleClass2Compute<NEWTON_BOND
 
   // force & energy for bond-angle term
 
+  dr1 = r1 - d_ba_r1[type];
+  dr2 = r2 - d_ba_r2[type];
   const F_FLOAT aa1 = s * dr1 * d_ba_k1[type];
   const F_FLOAT aa2 = s * dr2 * d_ba_k2[type];
 
diff --git a/src/KOKKOS/atom_vec_angle_kokkos.cpp b/src/KOKKOS/atom_vec_angle_kokkos.cpp
index c713427aa7..18de4d46cb 100644
--- a/src/KOKKOS/atom_vec_angle_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_angle_kokkos.cpp
@@ -1630,7 +1630,7 @@ void AtomVecAngleKokkos::create_atom(int itype, double *coord)
 ------------------------------------------------------------------------- */
 
 void AtomVecAngleKokkos::data_atom(double *coord, imageint imagetmp,
-                                  char **values)
+                                  const std::vector<std::string> &values)
 {
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
@@ -1663,9 +1663,10 @@ void AtomVecAngleKokkos::data_atom(double *coord, imageint imagetmp,
    initialize other atom quantities for this sub-style
 ------------------------------------------------------------------------- */
 
-int AtomVecAngleKokkos::data_atom_hybrid(int nlocal, char **values)
+int AtomVecAngleKokkos::data_atom_hybrid(int nlocal, const std::vector<std::string> &values,
+                                         int offset)
 {
-  h_molecule(nlocal) = utils::inumeric(FLERR,values[0],true,lmp);
+  h_molecule(nlocal) = utils::inumeric(FLERR,values[offset],true,lmp);
   h_num_bond(nlocal) = 0;
   h_num_angle(nlocal) = 0;
   return 1;
diff --git a/src/KOKKOS/atom_vec_angle_kokkos.h b/src/KOKKOS/atom_vec_angle_kokkos.h
index 9de0c043eb..9bc7753889 100644
--- a/src/KOKKOS/atom_vec_angle_kokkos.h
+++ b/src/KOKKOS/atom_vec_angle_kokkos.h
@@ -51,8 +51,8 @@ class AtomVecAngleKokkos : public AtomVecKokkos {
   int pack_restart(int, double *);
   int unpack_restart(double *);
   void create_atom(int, double *);
-  void data_atom(double *, tagint, char **);
-  int data_atom_hybrid(int, char **);
+  void data_atom(double *, imageint, const std::vector<std::string> &);
+  int data_atom_hybrid(int, const std::vector<std::string> &, int);
   void pack_data(double **);
   int pack_data_hybrid(int, double *);
   void write_data(FILE *, int, double **);
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
index eec2486763..891ebb51c2 100644
--- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@@ -821,8 +821,8 @@ void AtomVecAtomicKokkos::create_atom(int itype, double *coord)
    initialize other atom quantities
 ------------------------------------------------------------------------- */
 
-void AtomVecAtomicKokkos::data_atom(double *coord, tagint imagetmp,
-                                    char **values)
+void AtomVecAtomicKokkos::data_atom(double *coord, imageint imagetmp,
+                                    const std::vector<std::string> &values)
 {
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.h b/src/KOKKOS/atom_vec_atomic_kokkos.h
index 231c95cd3b..1197bc04e7 100644
--- a/src/KOKKOS/atom_vec_atomic_kokkos.h
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.h
@@ -44,7 +44,7 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
   int pack_restart(int, double *);
   int unpack_restart(double *);
   void create_atom(int, double *);
-  void data_atom(double *, tagint, char **);
+  void data_atom(double *, imageint, const std::vector<std::string> &);
   void pack_data(double **);
   void write_data(FILE *, int, double **);
   double memory_usage();
diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp
index 7582a01730..3655d894c9 100644
--- a/src/KOKKOS/atom_vec_bond_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp
@@ -1056,7 +1056,7 @@ void AtomVecBondKokkos::create_atom(int itype, double *coord)
 ------------------------------------------------------------------------- */
 
 void AtomVecBondKokkos::data_atom(double *coord, imageint imagetmp,
-                                  char **values)
+                                  const std::vector<std::string> &values)
 {
   int nlocal = atomKK->nlocal;
   if (nlocal == nmax) grow(0);
@@ -1088,9 +1088,10 @@ void AtomVecBondKokkos::data_atom(double *coord, imageint imagetmp,
    initialize other atom quantities for this sub-style
 ------------------------------------------------------------------------- */
 
-int AtomVecBondKokkos::data_atom_hybrid(int nlocal, char **values)
+int AtomVecBondKokkos::data_atom_hybrid(int nlocal, const std::vector<std::string> &values,
+                                        int offset)
 {
-  h_molecule(nlocal) = utils::inumeric(FLERR,values[0],true,lmp);
+  h_molecule(nlocal) = utils::inumeric(FLERR,values[offset],true,lmp);
   h_num_bond(nlocal) = 0;
   return 1;
 }
diff --git a/src/KOKKOS/atom_vec_bond_kokkos.h b/src/KOKKOS/atom_vec_bond_kokkos.h
index 350aeef2d7..a9bc5a1092 100644
--- a/src/KOKKOS/atom_vec_bond_kokkos.h
+++ b/src/KOKKOS/atom_vec_bond_kokkos.h
@@ -45,8 +45,8 @@ class AtomVecBondKokkos : public AtomVecKokkos {
   int pack_restart(int, double *);
   int unpack_restart(double *);
   void create_atom(int, double *);
-  void data_atom(double *, tagint, char **);
-  int data_atom_hybrid(int, char **);
+  void data_atom(double *, imageint, const std::vector<std::string> &);
+  int data_atom_hybrid(int, const std::vector<std::string> &, int);
   void pack_data(double **);
   int pack_data_hybrid(int, double *);
   void write_data(FILE *, int, double **);
diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp
index 3bb0985afe..7de36ffd5d 100644
--- a/src/KOKKOS/atom_vec_charge_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp
@@ -955,7 +955,7 @@ void AtomVecChargeKokkos::create_atom(int itype, double *coord)
 ------------------------------------------------------------------------- */
 
 void AtomVecChargeKokkos::data_atom(double *coord, imageint imagetmp,
-                                    char **values)
+                                    const std::vector<std::string> &values)
 {
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
@@ -987,9 +987,10 @@ void AtomVecChargeKokkos::data_atom(double *coord, imageint imagetmp,
    initialize other atom quantities for this sub-style
 ------------------------------------------------------------------------- */
 
-int AtomVecChargeKokkos::data_atom_hybrid(int nlocal, char **values)
+int AtomVecChargeKokkos::data_atom_hybrid(int nlocal, const std::vector<std::string> &values,
+                                          int offset)
 {
-  h_q[nlocal] = utils::numeric(FLERR,values[0],true,lmp);
+  h_q[nlocal] = utils::numeric(FLERR,values[offset],true,lmp);
 
   return 1;
 }
diff --git a/src/KOKKOS/atom_vec_charge_kokkos.h b/src/KOKKOS/atom_vec_charge_kokkos.h
index 2e1ba97e8d..866b836350 100644
--- a/src/KOKKOS/atom_vec_charge_kokkos.h
+++ b/src/KOKKOS/atom_vec_charge_kokkos.h
@@ -46,8 +46,8 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
   int pack_restart(int, double *);
   int unpack_restart(double *);
   void create_atom(int, double *);
-  void data_atom(double *, tagint, char **);
-  int data_atom_hybrid(int , char **);
+  void data_atom(double *, imageint, const std::vector<std::string> &);
+  int data_atom_hybrid(int , const std::vector<std::string> &, int);
   void pack_data(double **);
   int pack_data_hybrid(int, double *);
   void write_data(FILE *, int, double **);
diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index e02631f89e..cf7ad9d533 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -1716,8 +1716,8 @@ void AtomVecDPDKokkos::create_atom(int itype, double *coord)
    initialize other atom quantities
 ------------------------------------------------------------------------- */
 
-void AtomVecDPDKokkos::data_atom(double *coord, tagint imagetmp,
-                                    char **values)
+void AtomVecDPDKokkos::data_atom(double *coord, imageint imagetmp,
+                                 const std::vector<std::string> &values)
 {
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
@@ -1759,9 +1759,10 @@ void AtomVecDPDKokkos::data_atom(double *coord, tagint imagetmp,
    initialize other atom quantities for this sub-style
 ------------------------------------------------------------------------- */
 
-int AtomVecDPDKokkos::data_atom_hybrid(int nlocal, char **values)
+int AtomVecDPDKokkos::data_atom_hybrid(int nlocal, const std::vector<std::string> &values,
+                                       int offset)
 {
-  h_dpdTheta(nlocal) = utils::numeric(FLERR,values[0],true,lmp);
+  h_dpdTheta(nlocal) = utils::numeric(FLERR,values[offset],true,lmp);
 
   atomKK->modified(Host,DPDTHETA_MASK);
 
diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.h b/src/KOKKOS/atom_vec_dpd_kokkos.h
index c7d523cb34..2168fa630c 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.h
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.h
@@ -54,8 +54,8 @@ class AtomVecDPDKokkos : public AtomVecKokkos {
   int pack_restart(int, double *);
   int unpack_restart(double *);
   void create_atom(int, double *);
-  void data_atom(double *, tagint, char **);
-  int data_atom_hybrid(int, char **);
+  void data_atom(double *, imageint, const std::vector<std::string> &);
+  int data_atom_hybrid(int, const std::vector<std::string> &, int);
   void pack_data(double **);
   int pack_data_hybrid(int, double *);
   void write_data(FILE *, int, double **);
diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp
index cd5316cc73..b75c33e046 100644
--- a/src/KOKKOS/atom_vec_full_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_full_kokkos.cpp
@@ -1488,7 +1488,7 @@ void AtomVecFullKokkos::create_atom(int itype, double *coord)
 ------------------------------------------------------------------------- */
 
 void AtomVecFullKokkos::data_atom(double *coord, imageint imagetmp,
-                                       char **values)
+                                  const std::vector<std::string> &values)
 {
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
@@ -1525,10 +1525,11 @@ void AtomVecFullKokkos::data_atom(double *coord, imageint imagetmp,
    initialize other atom quantities for this sub-style
 ------------------------------------------------------------------------- */
 
-int AtomVecFullKokkos::data_atom_hybrid(int nlocal, char **values)
+int AtomVecFullKokkos::data_atom_hybrid(int nlocal, const std::vector<std::string> &values,
+                                        int offset)
 {
-  h_molecule(nlocal) = utils::inumeric(FLERR,values[0],true,lmp);
-  h_q(nlocal) = utils::numeric(FLERR,values[1],true,lmp);
+  h_molecule(nlocal) = utils::inumeric(FLERR,values[offset],true,lmp);
+  h_q(nlocal) = utils::numeric(FLERR,values[offset+1],true,lmp);
   h_num_bond(nlocal) = 0;
   h_num_angle(nlocal) = 0;
   h_num_dihedral(nlocal) = 0;
diff --git a/src/KOKKOS/atom_vec_full_kokkos.h b/src/KOKKOS/atom_vec_full_kokkos.h
index c751eb840d..b5ce032c4f 100644
--- a/src/KOKKOS/atom_vec_full_kokkos.h
+++ b/src/KOKKOS/atom_vec_full_kokkos.h
@@ -45,8 +45,8 @@ class AtomVecFullKokkos : public AtomVecKokkos {
   int pack_restart(int, double *);
   int unpack_restart(double *);
   void create_atom(int, double *);
-  void data_atom(double *, tagint, char **);
-  int data_atom_hybrid(int, char **);
+  void data_atom(double *, imageint, const std::vector<std::string> &);
+  int data_atom_hybrid(int, const std::vector<std::string> &, int);
   void pack_data(double **);
   int pack_data_hybrid(int, double *);
   void write_data(FILE *, int, double **);
diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
index 6b9f6852ad..22ce80478c 100644
--- a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
@@ -970,7 +970,8 @@ void AtomVecHybridKokkos::create_atom(int itype, double *coord)
    grow() occurs here so arrays for all sub-styles are grown
 ------------------------------------------------------------------------- */
 
-void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **values)
+void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp,
+                                    const std::vector<std::string> &values)
 {
   atomKK->sync(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
 
@@ -1009,7 +1010,7 @@ void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **val
 
   int m = 5;
   for (int k = 0; k < nstyles; k++)
-    m += styles[k]->data_atom_hybrid(nlocal,&values[m]);
+    m += styles[k]->data_atom_hybrid(nlocal,values,m);
 
   atom->nlocal++;
 }
@@ -1018,21 +1019,21 @@ void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **val
    unpack one line from Velocities section of data file
 ------------------------------------------------------------------------- */
 
-void AtomVecHybridKokkos::data_vel(int m, char **values)
+void AtomVecHybridKokkos::data_vel(int m, const std::vector<std::string> &values)
 {
   atomKK->sync(Host,V_MASK);
 
-  h_v(m,0) = utils::numeric(FLERR,values[0],true,lmp);
-  h_v(m,1) = utils::numeric(FLERR,values[1],true,lmp);
-  h_v(m,2) = utils::numeric(FLERR,values[2],true,lmp);
+  int ivalue = 1;
+  h_v(m,0) = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  h_v(m,1) = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  h_v(m,2) = utils::numeric(FLERR,values[ivalue++],true,lmp);
 
   atomKK->modified(Host,V_MASK);
 
   // each sub-style parses sub-style specific values
 
-  int n = 3;
   for (int k = 0; k < nstyles; k++)
-    n += styles[k]->data_vel_hybrid(m,&values[n]);
+    ivalue += styles[k]->data_vel_hybrid(m,values,ivalue);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.h b/src/KOKKOS/atom_vec_hybrid_kokkos.h
index 4c1907ddc6..567b98695f 100644
--- a/src/KOKKOS/atom_vec_hybrid_kokkos.h
+++ b/src/KOKKOS/atom_vec_hybrid_kokkos.h
@@ -57,9 +57,9 @@ class AtomVecHybridKokkos : public AtomVecKokkos {
   int pack_restart(int, double *);
   int unpack_restart(double *);
   void create_atom(int, double *);
-  void data_atom(double *, imageint, char **);
-  int data_atom_hybrid(int, char **) {return 0;}
-  void data_vel(int, char **);
+  void data_atom(double *, imageint, const std::vector<std::string> &);
+  int data_atom_hybrid(int, const std::vector<std::string> &, int) {return 0;}
+  void data_vel(int, const std::vector<std::string> &);
   void pack_data(double **);
   void write_data(FILE *, int, double **);
   void pack_vel(double **);
diff --git a/src/KOKKOS/atom_vec_kokkos.cpp b/src/KOKKOS/atom_vec_kokkos.cpp
index 1255712a21..931cd8d8f4 100644
--- a/src/KOKKOS/atom_vec_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_kokkos.cpp
@@ -1016,12 +1016,13 @@ void AtomVecKokkos::unpack_reverse(int n, int *list, double *buf)
  *    unpack one line from Velocities section of data file
  *    ------------------------------------------------------------------------- */
 
-void AtomVecKokkos::data_vel(int m, char **values)
+void AtomVecKokkos::data_vel(int m, const std::vector<std::string> &values)
 {
   double **v = atom->v;
-  v[m][0] = utils::numeric(FLERR,values[0],true,lmp);
-  v[m][1] = utils::numeric(FLERR,values[1],true,lmp);
-  v[m][2] = utils::numeric(FLERR,values[2],true,lmp);
+  int ivalue = 1;
+  v[m][0] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  v[m][1] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  v[m][2] = utils::numeric(FLERR,values[ivalue++],true,lmp);
 
   atomKK->modified(Host,V_MASK);
 }
diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h
index c596d80e94..20dd41dd75 100644
--- a/src/KOKKOS/atom_vec_kokkos.h
+++ b/src/KOKKOS/atom_vec_kokkos.h
@@ -44,7 +44,7 @@ class AtomVecKokkos : public AtomVec {
   virtual void unpack_comm_vel(int, int, double *);
   virtual int pack_reverse(int, int, double *);
   virtual void unpack_reverse(int, int *, double *);
-  virtual void data_vel(int, char **);
+  virtual void data_vel(int, const std::vector<std::string> &);
   virtual void pack_vel(double **);
   virtual void write_vel(FILE *, int, double **);
 
diff --git a/src/KOKKOS/atom_vec_molecular_kokkos.cpp b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
index 016e3f80e3..c4e75c1da7 100644
--- a/src/KOKKOS/atom_vec_molecular_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
@@ -1889,7 +1889,7 @@ void AtomVecMolecularKokkos::create_atom(int itype, double *coord)
 ------------------------------------------------------------------------- */
 
 void AtomVecMolecularKokkos::data_atom(double *coord, imageint imagetmp,
-                                       char **values)
+                                       const std::vector<std::string> &values)
 {
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
@@ -1924,9 +1924,10 @@ void AtomVecMolecularKokkos::data_atom(double *coord, imageint imagetmp,
    initialize other atom quantities for this sub-style
 ------------------------------------------------------------------------- */
 
-int AtomVecMolecularKokkos::data_atom_hybrid(int nlocal, char **values)
+int AtomVecMolecularKokkos::data_atom_hybrid(int nlocal, const std::vector<std::string> &values,
+                                             int offset)
 {
-  h_molecule(nlocal) = utils::inumeric(FLERR,values[0],true,lmp);
+  h_molecule(nlocal) = utils::inumeric(FLERR,values[offset],true,lmp);
   h_num_bond(nlocal) = 0;
   h_num_angle(nlocal) = 0;
   h_num_dihedral(nlocal) = 0;
diff --git a/src/KOKKOS/atom_vec_molecular_kokkos.h b/src/KOKKOS/atom_vec_molecular_kokkos.h
index fab833469c..abd04f905b 100644
--- a/src/KOKKOS/atom_vec_molecular_kokkos.h
+++ b/src/KOKKOS/atom_vec_molecular_kokkos.h
@@ -51,8 +51,8 @@ class AtomVecMolecularKokkos : public AtomVecKokkos {
   int pack_restart(int, double *);
   int unpack_restart(double *);
   void create_atom(int, double *);
-  void data_atom(double *, tagint, char **);
-  int data_atom_hybrid(int, char **);
+  void data_atom(double *, imageint, const std::vector<std::string> &);
+  int data_atom_hybrid(int, const std::vector<std::string> &, int);
   void pack_data(double **);
   int pack_data_hybrid(int, double *);
   void write_data(FILE *, int, double **);
diff --git a/src/KOKKOS/atom_vec_sphere_kokkos.cpp b/src/KOKKOS/atom_vec_sphere_kokkos.cpp
index 6b29af824a..0b722e8563 100644
--- a/src/KOKKOS/atom_vec_sphere_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_sphere_kokkos.cpp
@@ -2543,7 +2543,8 @@ void AtomVecSphereKokkos::create_atom(int itype, double *coord)
    initialize other atom quantities
 ------------------------------------------------------------------------- */
 
-void AtomVecSphereKokkos::data_atom(double *coord, imageint imagetmp, char **values)
+void AtomVecSphereKokkos::data_atom(double *coord, imageint imagetmp,
+                                    const std::vector<std::string> &values)
 {
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
@@ -2590,13 +2591,14 @@ void AtomVecSphereKokkos::data_atom(double *coord, imageint imagetmp, char **val
    initialize other atom quantities for this sub-style
 ------------------------------------------------------------------------- */
 
-int AtomVecSphereKokkos::data_atom_hybrid(int nlocal, char **values)
+int AtomVecSphereKokkos::data_atom_hybrid(int nlocal, const std::vector<std::string> &values,
+                                          int offset)
 {
-  radius[nlocal] = 0.5 * utils::numeric(FLERR,values[0],true,lmp);
+  radius[nlocal] = 0.5 * utils::numeric(FLERR,values[offset],true,lmp);
   if (radius[nlocal] < 0.0)
     error->one(FLERR,"Invalid radius in Atoms section of data file");
 
-  double density = utils::numeric(FLERR,values[1],true,lmp);
+  double density = utils::numeric(FLERR,values[offset+1],true,lmp);
   if (density <= 0.0)
     error->one(FLERR,"Invalid density in Atoms section of data file");
 
@@ -2615,15 +2617,16 @@ int AtomVecSphereKokkos::data_atom_hybrid(int nlocal, char **values)
    unpack one line from Velocities section of data file
 ------------------------------------------------------------------------- */
 
-void AtomVecSphereKokkos::data_vel(int m, char **values)
+void AtomVecSphereKokkos::data_vel(int m, const std::vector<std::string> &values)
 {
+  int ivalue = 1;
   atomKK->sync(Host,V_MASK|OMEGA_MASK);
-  h_v(m,0) = utils::numeric(FLERR,values[0],true,lmp);
-  h_v(m,1) = utils::numeric(FLERR,values[1],true,lmp);
-  h_v(m,2) = utils::numeric(FLERR,values[2],true,lmp);
-  h_omega(m,0) = utils::numeric(FLERR,values[3],true,lmp);
-  h_omega(m,1) = utils::numeric(FLERR,values[4],true,lmp);
-  h_omega(m,2) = utils::numeric(FLERR,values[5],true,lmp);
+  h_v(m,0) = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  h_v(m,1) = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  h_v(m,2) = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  h_omega(m,0) = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  h_omega(m,1) = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  h_omega(m,2) = utils::numeric(FLERR,values[ivalue++],true,lmp);
   atomKK->modified(Host,V_MASK|OMEGA_MASK);
 }
 
@@ -2631,12 +2634,13 @@ void AtomVecSphereKokkos::data_vel(int m, char **values)
    unpack hybrid quantities from one line in Velocities section of data file
 ------------------------------------------------------------------------- */
 
-int AtomVecSphereKokkos::data_vel_hybrid(int m, char **values)
+int AtomVecSphereKokkos::data_vel_hybrid(int m, const std::vector<std::string> &values,
+                                         int offset)
 {
   atomKK->sync(Host,OMEGA_MASK);
-  omega[m][0] = utils::numeric(FLERR,values[0],true,lmp);
-  omega[m][1] = utils::numeric(FLERR,values[1],true,lmp);
-  omega[m][2] = utils::numeric(FLERR,values[2],true,lmp);
+  omega[m][0] = utils::numeric(FLERR,values[offset],true,lmp);
+  omega[m][1] = utils::numeric(FLERR,values[offset+1],true,lmp);
+  omega[m][2] = utils::numeric(FLERR,values[offset+2],true,lmp);
   atomKK->modified(Host,OMEGA_MASK);
   return 3;
 }
diff --git a/src/KOKKOS/atom_vec_sphere_kokkos.h b/src/KOKKOS/atom_vec_sphere_kokkos.h
index 72eb7a665e..a9ba4baa24 100644
--- a/src/KOKKOS/atom_vec_sphere_kokkos.h
+++ b/src/KOKKOS/atom_vec_sphere_kokkos.h
@@ -58,10 +58,10 @@ class AtomVecSphereKokkos : public AtomVecKokkos {
   int pack_restart(int, double *);
   int unpack_restart(double *);
   void create_atom(int, double *);
-  void data_atom(double *, imageint, char **);
-  int data_atom_hybrid(int, char **);
-  void data_vel(int, char **);
-  int data_vel_hybrid(int, char **);
+  void data_atom(double *, imageint, const std::vector<std::string> &);
+  int data_atom_hybrid(int, const std::vector<std::string> &, int);
+  void data_vel(int, const std::vector<std::string> &);
+  int data_vel_hybrid(int, const std::vector<std::string> &, int);
   void pack_data(double **);
   int pack_data_hybrid(int, double *);
   void write_data(FILE *, int, double **);
diff --git a/src/KOKKOS/atom_vec_spin_kokkos.cpp b/src/KOKKOS/atom_vec_spin_kokkos.cpp
index c8f0663806..039c08f31c 100644
--- a/src/KOKKOS/atom_vec_spin_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_spin_kokkos.cpp
@@ -1056,7 +1056,7 @@ void AtomVecSpinKokkos::create_atom(int itype, double *coord)
 ------------------------------------------------------------------------- */
 
 void AtomVecSpinKokkos::data_atom(double *coord, imageint imagetmp,
-                                    char **values)
+                                  const std::vector<std::string> &values)
 {
   int nlocal = atom->nlocal;
   if (nlocal == nmax) grow(0);
@@ -1098,12 +1098,13 @@ void AtomVecSpinKokkos::data_atom(double *coord, imageint imagetmp,
    initialize other atom quantities for this sub-style
 ------------------------------------------------------------------------- */
 
-int AtomVecSpinKokkos::data_atom_hybrid(int nlocal, char **values)
+int AtomVecSpinKokkos::data_atom_hybrid(int nlocal, const std::vector<std::string> &values,
+                                        int offset)
 {
-  h_sp(nlocal,3) = utils::numeric(FLERR,values[0],true,lmp);
-  h_sp(nlocal,0) = utils::numeric(FLERR,values[1],true,lmp);
-  h_sp(nlocal,1) = utils::numeric(FLERR,values[2],true,lmp);
-  h_sp(nlocal,2) = utils::numeric(FLERR,values[3],true,lmp);
+  h_sp(nlocal,3) = utils::numeric(FLERR,values[offset],true,lmp);
+  h_sp(nlocal,0) = utils::numeric(FLERR,values[offset+1],true,lmp);
+  h_sp(nlocal,1) = utils::numeric(FLERR,values[offset+2],true,lmp);
+  h_sp(nlocal,2) = utils::numeric(FLERR,values[offset+3],true,lmp);
   double inorm = 1.0/sqrt(sp[nlocal][0]*sp[nlocal][0] +
                           sp[nlocal][1]*sp[nlocal][1] +
                           sp[nlocal][2]*sp[nlocal][2]);
diff --git a/src/KOKKOS/atom_vec_spin_kokkos.h b/src/KOKKOS/atom_vec_spin_kokkos.h
index 6438da9eaa..38d206c007 100644
--- a/src/KOKKOS/atom_vec_spin_kokkos.h
+++ b/src/KOKKOS/atom_vec_spin_kokkos.h
@@ -45,8 +45,8 @@ class AtomVecSpinKokkos : public AtomVecKokkos {
   int pack_restart(int, double *);
   int unpack_restart(double *);
   void create_atom(int, double *);
-  void data_atom(double *, imageint, char **);
-  int data_atom_hybrid(int, char **);
+  void data_atom(double *, imageint, const std::vector<std::string> &);
+  int data_atom_hybrid(int, const std::vector<std::string> &, int);
   void pack_data(double **);
   int pack_data_hybrid(int, double *);
   void write_data(FILE *, int, double **);
diff --git a/src/KOKKOS/bond_fene_kokkos.cpp b/src/KOKKOS/bond_fene_kokkos.cpp
index 96daff4fed..e988262901 100644
--- a/src/KOKKOS/bond_fene_kokkos.cpp
+++ b/src/KOKKOS/bond_fene_kokkos.cpp
@@ -190,9 +190,9 @@ void BondFENEKokkos<DeviceType>::operator()(TagBondFENECompute<NEWTON_BOND,EVFLA
 
   if (rlogarg < 0.1) {
     if (!d_warning_flag())
-      Kokkos::atomic_fetch_add(&d_warning_flag(),1);
+      d_warning_flag() = 1;
     if (rlogarg <= -3.0 && !d_error_flag())
-      Kokkos::atomic_fetch_add(&d_error_flag(),1);
+      d_error_flag() = 1;
     rlogarg = 0.1;
   }
 
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 888a1e105f..44e6a1c419 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -627,7 +627,7 @@ struct BuildExchangeListFunctor {
   KOKKOS_INLINE_FUNCTION
   void operator() (int i) const {
     if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) {
-      const int mysend=Kokkos::atomic_fetch_add(&_nsend(),1);
+      const int mysend = Kokkos::atomic_fetch_add(&_nsend(),1);
       if (mysend < (int)_sendlist.extent(0)) {
         _sendlist(mysend) = i;
         _sendflag(i) = 1;
diff --git a/src/KOKKOS/compute_ave_sphere_atom_kokkos.cpp b/src/KOKKOS/compute_ave_sphere_atom_kokkos.cpp
new file mode 100644
index 0000000000..3f83c24fb6
--- /dev/null
+++ b/src/KOKKOS/compute_ave_sphere_atom_kokkos.cpp
@@ -0,0 +1,209 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_ave_sphere_atom_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "pair.h"
+#include "update.h"
+#include "math_const.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeAveSphereAtomKokkos<DeviceType>::ComputeAveSphereAtomKokkos(LAMMPS *lmp, int narg, char **arg) :
+  ComputeAveSphereAtom(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeAveSphereAtomKokkos<DeviceType>::~ComputeAveSphereAtomKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_result,result);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeAveSphereAtomKokkos<DeviceType>::init()
+{
+  ComputeAveSphereAtom::init();
+
+  // need an occasional full neighbor list
+
+  // irequest = neigh request made by parent class
+
+  int irequest = neighbor->nrequest - 1;
+
+  neighbor->requests[irequest]->
+    kokkos_host = std::is_same<DeviceType,LMPHostType>::value &&
+    !std::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = std::is_same<DeviceType,LMPDeviceType>::value;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeAveSphereAtomKokkos<DeviceType>::compute_peratom()
+{
+  invoked_peratom = update->ntimestep;
+
+  // grow result array if necessary
+
+  if (atom->nmax > nmax) {
+    memoryKK->destroy_kokkos(k_result,result);
+    nmax = atom->nmax;
+    memoryKK->create_kokkos(k_result,result,nmax,2,"ave/sphere/atom:result");
+    d_result = k_result.view<DeviceType>();
+    array_atom = result;
+  }
+
+  // need velocities of ghost atoms
+
+  atomKK->sync(Host,V_MASK);
+  comm->forward_comm_compute(this);
+  atomKK->modified(Host,V_MASK);
+
+  // invoke full neighbor list (will copy or build if necessary)
+
+  neighbor->build_one(list);
+  int inum = list->inum;
+
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  // compute properties for each atom in group
+  // use full neighbor list to count atoms less than cutoff
+
+  atomKK->sync(execution_space,X_MASK|V_MASK|TYPE_MASK|MASK_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  v = atomKK->k_v.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+
+  Kokkos::deep_copy(d_result,0.0);
+
+  copymode = 1;
+  typename Kokkos::RangePolicy<DeviceType, TagComputeAveSphereAtom> policy(0,inum);
+  Kokkos::parallel_for("ComputeAveSphereAtom",policy,*this);
+  copymode = 0;
+
+  k_result.modify<DeviceType>();
+  k_result.sync_host();
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void ComputeAveSphereAtomKokkos<DeviceType>::operator()(TagComputeAveSphereAtom, const int &ii) const
+{
+  const int i = d_ilist[ii];
+  if (mask[i] & groupbit) {
+    const X_FLOAT xtmp = x(i,0);
+    const X_FLOAT ytmp = x(i,1);
+    const X_FLOAT ztmp = x(i,2);
+    const int jnum = d_numneigh[i];
+
+    // i atom contribution
+
+    int count = 1;
+    double vsum[3];
+    vsum[0] = v(i,0);
+    vsum[1] = v(i,1);
+    vsum[2] = v(i,2);
+
+    for (int jj = 0; jj < jnum; jj++) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+
+      const F_FLOAT delx = x(j,0) - xtmp;
+      const F_FLOAT dely = x(j,1) - ytmp;
+      const F_FLOAT delz = x(j,2) - ztmp;
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+      if (rsq < cutsq) {
+        count++;
+        vsum[0] += v(j,0);
+        vsum[1] += v(j,1);
+        vsum[2] += v(j,2);
+      }
+    }
+
+    double vavg[3];
+    vavg[0] = vsum[0]/count;
+    vavg[1] = vsum[1]/count;
+    vavg[2] = vsum[2]/count;
+
+    // i atom contribution
+
+    count = 1;
+    double vnet[3];
+    vnet[0] = v(i,0) - vavg[0];
+    vnet[1] = v(i,1) - vavg[1];
+    vnet[2] = v(i,2) - vavg[2];
+    double ke_sum = vnet[0]*vnet[0] + vnet[1]*vnet[1] + vnet[2]*vnet[2];
+
+    for (int jj = 0; jj < jnum; jj++) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+
+      const F_FLOAT delx = x(j,0) - xtmp;
+      const F_FLOAT dely = x(j,1) - ytmp;
+      const F_FLOAT delz = x(j,2) - ztmp;
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+      if (rsq < cutsq) {
+        count++;
+        vnet[0] = v(j,0) - vavg[0];
+        vnet[1] = v(j,1) - vavg[1];
+        vnet[2] = v(j,2) - vavg[2];
+        ke_sum += vnet[0]*vnet[0] + vnet[1]*vnet[1] + vnet[2]*vnet[2];
+      }
+    }
+    double density = count/sphere_vol;
+    double temp = ke_sum/3.0/count;
+    d_result(i,0) = density;
+    d_result(i,1) = temp;
+  }
+}
+
+namespace LAMMPS_NS {
+template class ComputeAveSphereAtomKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeAveSphereAtomKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/compute_ave_sphere_atom_kokkos.h b/src/KOKKOS/compute_ave_sphere_atom_kokkos.h
new file mode 100644
index 0000000000..42607e5239
--- /dev/null
+++ b/src/KOKKOS/compute_ave_sphere_atom_kokkos.h
@@ -0,0 +1,66 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(ave/sphere/atom/kk,ComputeAveSphereAtomKokkos<LMPDeviceType>)
+ComputeStyle(ave/sphere/atom/kk/device,ComputeAveSphereAtomKokkos<LMPDeviceType>)
+ComputeStyle(ave/sphere/atom/kk/host,ComputeAveSphereAtomKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_COMPUTE_AVE_SPHERE_ATOM_KOKKOS_H
+#define LMP_COMPUTE_AVE_SPHERE_ATOM_KOKKOS_H
+
+#include "compute_ave_sphere_atom.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+struct TagComputeAveSphereAtom{};
+
+template<class DeviceType>
+class ComputeAveSphereAtomKokkos : public ComputeAveSphereAtom {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  ComputeAveSphereAtomKokkos(class LAMMPS *, int, char **);
+  virtual ~ComputeAveSphereAtomKokkos();
+  void init();
+  void compute_peratom();
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeAveSphereAtom, const int&) const;
+
+ private:
+  typename AT::t_x_array_randomread x;
+  typename AT::t_v_array_randomread v;
+  typename ArrayTypes<DeviceType>::t_int_1d mask;
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
+
+  DAT::tdual_float_2d k_result;
+  typename AT::t_float_2d d_result;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/KOKKOS/dihedral_charmm_kokkos.cpp b/src/KOKKOS/dihedral_charmm_kokkos.cpp
index f4b3e8ce61..70a40b2d98 100644
--- a/src/KOKKOS/dihedral_charmm_kokkos.cpp
+++ b/src/KOKKOS/dihedral_charmm_kokkos.cpp
@@ -262,7 +262,7 @@ void DihedralCharmmKokkos<DeviceType>::operator()(TagDihedralCharmmCompute<NEWTO
     // error check
 
   if ((c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) && !d_warning_flag())
-    Kokkos::atomic_fetch_add(&d_warning_flag(),1);
+    d_warning_flag() = 1;
 
   if (c > 1.0) c = 1.0;
   if (c < -1.0) c = -1.0;
diff --git a/src/KOKKOS/dihedral_class2_kokkos.cpp b/src/KOKKOS/dihedral_class2_kokkos.cpp
index 65c6903608..2ecaf3003e 100644
--- a/src/KOKKOS/dihedral_class2_kokkos.cpp
+++ b/src/KOKKOS/dihedral_class2_kokkos.cpp
@@ -280,7 +280,7 @@ void DihedralClass2Kokkos<DeviceType>::operator()(TagDihedralClass2Compute<NEWTO
   // error check
 
   if ((c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) && !d_warning_flag())
-    Kokkos::atomic_fetch_add(&d_warning_flag(),1);
+    d_warning_flag() = 1;
 
   if (c > 1.0) c = 1.0;
   if (c < -1.0) c = -1.0;
diff --git a/src/KOKKOS/dihedral_harmonic_kokkos.cpp b/src/KOKKOS/dihedral_harmonic_kokkos.cpp
index a61a76496a..9beb16f17f 100644
--- a/src/KOKKOS/dihedral_harmonic_kokkos.cpp
+++ b/src/KOKKOS/dihedral_harmonic_kokkos.cpp
@@ -217,7 +217,7 @@ void DihedralHarmonicKokkos<DeviceType>::operator()(TagDihedralHarmonicCompute<N
   // error check
 
   if ((c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) && !d_warning_flag())
-    Kokkos::atomic_fetch_add(&d_warning_flag(),1);
+    d_warning_flag() = 1;
 
   if (c > 1.0) c = 1.0;
   if (c < -1.0) c = -1.0;
diff --git a/src/KOKKOS/dihedral_opls_kokkos.cpp b/src/KOKKOS/dihedral_opls_kokkos.cpp
index 69faf0ee38..e4a4008c23 100644
--- a/src/KOKKOS/dihedral_opls_kokkos.cpp
+++ b/src/KOKKOS/dihedral_opls_kokkos.cpp
@@ -243,7 +243,7 @@ void DihedralOPLSKokkos<DeviceType>::operator()(TagDihedralOPLSCompute<NEWTON_BO
   // error check
 
   if ((c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) && !d_warning_flag())
-    Kokkos::atomic_fetch_add(&d_warning_flag(),1);
+    d_warning_flag() = 1;
 
   if (c > 1.0) c = 1.0;
   if (c < -1.0) c = -1.0;
diff --git a/src/KOKKOS/improper_class2_kokkos.cpp b/src/KOKKOS/improper_class2_kokkos.cpp
index e58ddb49e3..c2da884558 100644
--- a/src/KOKKOS/improper_class2_kokkos.cpp
+++ b/src/KOKKOS/improper_class2_kokkos.cpp
@@ -280,10 +280,10 @@ void ImproperClass2Kokkos<DeviceType>::operator()(TagImproperClass2Compute<NEWTO
 
     /*
     if ((c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) && !d_warning_flag())
-      Kokkos::atomic_fetch_add(&d_warning_flag(),1);
+      d_warning_flag() = 1;
     */
     if ((costheta[0] == -1.0 || costheta[1] == -1.0 || costheta[2] == -1.0) && !d_warning_flag())
-      Kokkos::atomic_fetch_add(&d_warning_flag(),1);
+      d_warning_flag() = 1;
 
     if (c > 1.0) c = 1.0;
     if (c < -1.0) c = -1.0;
diff --git a/src/KOKKOS/improper_harmonic_kokkos.cpp b/src/KOKKOS/improper_harmonic_kokkos.cpp
index 8dfe891f26..77f24a7cf7 100644
--- a/src/KOKKOS/improper_harmonic_kokkos.cpp
+++ b/src/KOKKOS/improper_harmonic_kokkos.cpp
@@ -208,7 +208,7 @@ void ImproperHarmonicKokkos<DeviceType>::operator()(TagImproperHarmonicCompute<N
   // error check
 
   if ((c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) && !d_warning_flag())
-    Kokkos::atomic_fetch_add(&d_warning_flag(),1);
+    d_warning_flag() = 1;
 
   if (c > 1.0) c = 1.0;
   if (c < -1.0) c = -1.0;
diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h
index 4c0dea819c..f0dc026df8 100644
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@@ -236,7 +236,7 @@ typedef Kokkos::CudaHostPinnedSpace LMPPinnedHostType;
 #elif defined(KOKKOS_ENABLE_HIP)
 typedef Kokkos::Experimental::HIPHostPinnedSpace LMPPinnedHostType;
 #elif defined(KOKKOS_ENABLE_SYCL)
-typedef Kokkos::Experimental::SYCLSharedUSMSpace LMPPinnedHostType;
+typedef Kokkos::Experimental::SYCLHostUSMSpace LMPPinnedHostType;
 #elif defined(KOKKOS_ENABLE_OPENMPTARGET)
 typedef Kokkos::Serial LMPPinnedHostType;
 #endif
diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
index d42b88ab79..b91ff7bd41 100644
--- a/src/KOKKOS/nbin_ssa_kokkos.cpp
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -228,12 +228,12 @@ void NBinSSAKokkos<DeviceType>::binIDAtomsItem(const int &i, int &update) const
   binID(i) = ibin;
 
   // Find the bounding box of the local atoms in the bins
-  if (loc[0] < d_lbinxlo()) Kokkos::atomic_fetch_min(&d_lbinxlo(),loc[0]);
-  if (loc[0] >= d_lbinxhi()) Kokkos::atomic_fetch_max(&d_lbinxhi(),loc[0] + 1);
-  if (loc[1] < d_lbinylo()) Kokkos::atomic_fetch_min(&d_lbinylo(),loc[1]);
-  if (loc[1] >= d_lbinyhi()) Kokkos::atomic_fetch_max(&d_lbinyhi(),loc[1] + 1);
-  if (loc[2] < d_lbinzlo()) Kokkos::atomic_fetch_min(&d_lbinzlo(),loc[2]);
-  if (loc[2] >= d_lbinzhi()) Kokkos::atomic_fetch_max(&d_lbinzhi(),loc[2] + 1);
+  if (loc[0] < d_lbinxlo()) Kokkos::atomic_min(&d_lbinxlo(),loc[0]);
+  if (loc[0] >= d_lbinxhi()) Kokkos::atomic_max(&d_lbinxhi(),loc[0] + 1);
+  if (loc[1] < d_lbinylo()) Kokkos::atomic_min(&d_lbinylo(),loc[1]);
+  if (loc[1] >= d_lbinyhi()) Kokkos::atomic_max(&d_lbinyhi(),loc[1] + 1);
+  if (loc[2] < d_lbinzlo()) Kokkos::atomic_min(&d_lbinzlo(),loc[2]);
+  if (loc[2] >= d_lbinzhi()) Kokkos::atomic_max(&d_lbinzhi(),loc[2] + 1);
 
   const int ac = Kokkos::atomic_fetch_add(&(bincount[ibin]), (int)1);
   if (update <= ac) update = ac + 1;
diff --git a/src/KOKKOS/neigh_bond_kokkos.cpp b/src/KOKKOS/neigh_bond_kokkos.cpp
index a4cb6e7d40..c3d5eb1725 100644
--- a/src/KOKKOS/neigh_bond_kokkos.cpp
+++ b/src/KOKKOS/neigh_bond_kokkos.cpp
@@ -291,7 +291,7 @@ void NeighBondKokkos<DeviceType>::operator()(TagNeighBondBondAll, const int &i,
     if (newton_bond || i < atom1) {
       const int nbondlist = Kokkos::atomic_fetch_add(&d_nlist(),1);
       if (nbondlist >= maxbond && !d_fail_flag())
-        Kokkos::atomic_fetch_add(&d_fail_flag(),1);
+        d_fail_flag() = 1;
       if (d_fail_flag()) continue;
       v_bondlist(nbondlist,0) = i;
       v_bondlist(nbondlist,1) = atom1;
@@ -379,7 +379,7 @@ void NeighBondKokkos<DeviceType>::operator()(TagNeighBondBondPartial, const int
     if (newton_bond || i < atom1) {
       const int nbondlist = Kokkos::atomic_fetch_add(&d_nlist(),1);
       if (nbondlist >= maxbond && !d_fail_flag())
-        Kokkos::atomic_fetch_add(&d_fail_flag(),1);
+        d_fail_flag() = 1;
       if (d_fail_flag()) continue;
       v_bondlist(nbondlist,0) = i;
       v_bondlist(nbondlist,1) = atom1;
@@ -495,7 +495,7 @@ void NeighBondKokkos<DeviceType>::operator()(TagNeighBondAngleAll, const int &i,
     if (newton_bond || (i <= atom1 && i <= atom2 && i <= atom3)) {
       const int nanglelist = Kokkos::atomic_fetch_add(&d_nlist(),1);
       if (nanglelist >= maxangle && !d_fail_flag())
-        Kokkos::atomic_fetch_add(&d_fail_flag(),1);
+        d_fail_flag() = 1;
       if (d_fail_flag()) continue;
       v_anglelist(nanglelist,0) = atom1;
       v_anglelist(nanglelist,1) = atom2;
@@ -590,7 +590,7 @@ void NeighBondKokkos<DeviceType>::operator()(TagNeighBondAnglePartial, const int
     if (newton_bond || (i <= atom1 && i <= atom2 && i <= atom3)) {
       const int nanglelist = Kokkos::atomic_fetch_add(&d_nlist(),1);
       if (nanglelist >= maxangle && !d_fail_flag())
-        Kokkos::atomic_fetch_add(&d_fail_flag(),1);
+        d_fail_flag() = 1;
       if (d_fail_flag()) continue;
       v_anglelist(nanglelist,0) = atom1;
       v_anglelist(nanglelist,1) = atom2;
@@ -725,7 +725,7 @@ void NeighBondKokkos<DeviceType>::operator()(TagNeighBondDihedralAll, const int
         (i <= atom1 && i <= atom2 && i <= atom3 && i <= atom4)) {
       const int ndihedrallist = Kokkos::atomic_fetch_add(&d_nlist(),1);
       if (ndihedrallist >= maxdihedral && !d_fail_flag())
-        Kokkos::atomic_fetch_add(&d_fail_flag(),1);
+        d_fail_flag() = 1;
       if (d_fail_flag()) continue;
       v_dihedrallist(ndihedrallist,0) = atom1;
       v_dihedrallist(ndihedrallist,1) = atom2;
@@ -825,7 +825,7 @@ void NeighBondKokkos<DeviceType>::operator()(TagNeighBondDihedralPartial, const
         (i <= atom1 && i <= atom2 && i <= atom3 && i <= atom4)) {
       const int ndihedrallist = Kokkos::atomic_fetch_add(&d_nlist(),1);
       if (ndihedrallist >= maxdihedral && !d_fail_flag())
-        Kokkos::atomic_fetch_add(&d_fail_flag(),1);
+        d_fail_flag() = 1;
       if (d_fail_flag()) continue;
       v_dihedrallist(ndihedrallist,0) = atom1;
       v_dihedrallist(ndihedrallist,1) = atom2;
@@ -979,7 +979,7 @@ void NeighBondKokkos<DeviceType>::operator()(TagNeighBondImproperAll, const int
         (i <= atom1 && i <= atom2 && i <= atom3 && i <= atom4)) {
       const int nimproperlist = Kokkos::atomic_fetch_add(&d_nlist(),1);
       if (nimproperlist >= maximproper && !d_fail_flag())
-        Kokkos::atomic_fetch_add(&d_fail_flag(),1);
+        d_fail_flag() = 1;
       if (d_fail_flag()) continue;
       v_improperlist(nimproperlist,0) = atom1;
       v_improperlist(nimproperlist,1) = atom2;
@@ -1079,7 +1079,7 @@ void NeighBondKokkos<DeviceType>::operator()(TagNeighBondImproperPartial, const
         (i <= atom1 && i <= atom2 && i <= atom3 && i <= atom4)) {
       const int nimproperlist = Kokkos::atomic_fetch_add(&d_nlist(),1);
       if (nimproperlist >= maximproper && !d_fail_flag())
-        Kokkos::atomic_fetch_add(&d_fail_flag(),1);
+        d_fail_flag() = 1;
       if (d_fail_flag()) continue;
       v_improperlist(nimproperlist,0) = atom1;
       v_improperlist(nimproperlist,1) = atom2;
diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
index 0246d141fb..daa689d483 100644
--- a/src/KOKKOS/npair_ssa_kokkos.cpp
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -613,7 +613,7 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals_onePhase(const bool firstTr
           neigh_list.d_ilist(inum++) = i;
           if (n > neigh_list.maxneighs) {
             resize() = 1;
-            if (n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+            if (n > new_maxneighs()) Kokkos::atomic_max(&new_maxneighs(),n);
           }
         }
       }
@@ -741,7 +741,7 @@ void NPairSSAKokkosExecute<DeviceType>::build_ghosts_onePhase(int workPhase) con
           neigh_list.d_ilist(gNdx++) = i;
           if (n > neigh_list.maxneighs) {
             resize() = 1;
-            if (n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+            if (n > new_maxneighs()) Kokkos::atomic_max(&new_maxneighs(),n);
           }
         }
       }
diff --git a/src/MACHDYN/atom_vec_smd.cpp b/src/MACHDYN/atom_vec_smd.cpp
index b798425e19..81f32ca0a2 100644
--- a/src/MACHDYN/atom_vec_smd.cpp
+++ b/src/MACHDYN/atom_vec_smd.cpp
@@ -115,6 +115,7 @@ void AtomVecSMD::grow_pointers()
   vfrac = atom->vfrac;
   rmass = atom->rmass;
   x0 = atom->x0;
+  x = atom->x;
   radius = atom->radius;
   contact_radius = atom->contact_radius;
   molecule = atom->molecule;
@@ -129,13 +130,11 @@ void AtomVecSMD::grow_pointers()
 /* ----------------------------------------------------------------------
    clear extra forces starting at atom N
    nbytes = # of bytes to clear for a per-atom vector
-   NOTE: does f need to be re-cleared?
 ------------------------------------------------------------------------- */
 
 void AtomVecSMD::force_clear(int n, size_t nbytes)
 {
   memset(&desph[n],0,nbytes);
-  memset(&f[n][0],0,3*nbytes);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/MACHDYN/compute_smd_triangle_vertices.cpp b/src/MACHDYN/compute_smd_triangle_vertices.cpp
index 016ae3e85f..89a2a24b89 100644
--- a/src/MACHDYN/compute_smd_triangle_vertices.cpp
+++ b/src/MACHDYN/compute_smd_triangle_vertices.cpp
@@ -53,7 +53,7 @@ ComputeSMDTriangleVertices::ComputeSMDTriangleVertices(LAMMPS *lmp, int narg, ch
 /* ---------------------------------------------------------------------- */
 
 ComputeSMDTriangleVertices::~ComputeSMDTriangleVertices() {
-    memory->sfree(outputVector);
+    memory->destroy(outputVector);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/MACHDYN/pair_smd_ulsph.cpp b/src/MACHDYN/pair_smd_ulsph.cpp
index 0958cbc7c3..e9eac13126 100644
--- a/src/MACHDYN/pair_smd_ulsph.cpp
+++ b/src/MACHDYN/pair_smd_ulsph.cpp
@@ -85,7 +85,8 @@ PairULSPH::PairULSPH(LAMMPS *lmp) :
 
 PairULSPH::~PairULSPH() {
         if (allocated) {
-                //printf("... deallocating\n");
+                memory->destroy(setflag);
+                memory->destroy(cutsq);
                 memory->destroy(Q1);
                 memory->destroy(rho0);
                 memory->destroy(eos);
diff --git a/src/MANYBODY/pair_eam_cd.cpp b/src/MANYBODY/pair_eam_cd.cpp
index de40cb2568..46d439b879 100644
--- a/src/MANYBODY/pair_eam_cd.cpp
+++ b/src/MANYBODY/pair_eam_cd.cpp
@@ -20,9 +20,6 @@
 
 #include "pair_eam_cd.h"
 
-#include <cmath>
-
-#include <cstring>
 #include "atom.h"
 #include "force.h"
 #include "comm.h"
@@ -31,11 +28,11 @@
 #include "error.h"
 #include "tokenizer.h"
 
-
+#include <cmath>
+#include <cstring>
 
 using namespace LAMMPS_NS;
 
-#define ASSERT(cond)
 #define MAXLINE 1024        // This sets the maximum line length in EAM input files.
 
 PairEAMCD::PairEAMCD(LAMMPS *lmp, int _cdeamVersion)
@@ -298,7 +295,7 @@ void PairEAMCD::compute(int eflag, int vflag)
     // It will be replaced by the concentration at site i if atom i is either A or B.
 
     double x_i = -1.0;
-    double D_i, h_prime_i;
+    double D_i = 0.0, h_prime_i;
 
     // This if-clause is only required for ternary alloys.
 
@@ -307,7 +304,6 @@ void PairEAMCD::compute(int eflag, int vflag)
       // Compute local concentration at site i.
 
       x_i = rhoB[i]/rho[i];
-      ASSERT(x_i >= 0 && x_i<=1.0);
 
       if (cdeamVersion == 1) {
 
@@ -317,8 +313,6 @@ void PairEAMCD::compute(int eflag, int vflag)
         D_i = D_values[i] * h_prime_i / (2.0 * rho[i] * rho[i]);
       } else if (cdeamVersion == 2) {
         D_i = D_values[i];
-      } else {
-        ASSERT(false);
       }
     }
 
@@ -354,14 +348,11 @@ void PairEAMCD::compute(int eflag, int vflag)
 
         // This code line is required for ternary alloy.
 
-        if (jtype == speciesA || jtype == speciesB) {
-          ASSERT(rho[i] != 0.0);
-          ASSERT(rho[j] != 0.0);
+        if ((jtype == speciesA || jtype == speciesB) && rho[j] != 0.0) {
 
           // Compute local concentration at site j.
 
           x_j = rhoB[j]/rho[j];
-          ASSERT(x_j >= 0 && x_j<=1.0);
 
           double D_j=0.0;
           if (cdeamVersion == 1) {
@@ -372,8 +363,6 @@ void PairEAMCD::compute(int eflag, int vflag)
             D_j = D_values[j] * h_prime_j / (2.0 * rho[j] * rho[j]);
           } else if (cdeamVersion == 2) {
             D_j = D_values[j];
-          } else {
-            ASSERT(false);
           }
           double t2 = -rhoB[j];
           if (itype == speciesB) t2 += rho[j];
@@ -422,8 +411,6 @@ void PairEAMCD::compute(int eflag, int vflag)
             // Calculate h(x_ij) polynomial function.
 
             h = evalH(x_ij);
-          } else {
-            ASSERT(false);
           }
           fpair += h * phip;
           phi *= h;
@@ -460,7 +447,8 @@ void PairEAMCD::coeff(int narg, char **arg)
   // Make sure the EAM file is a CD-EAM binary alloy.
 
   if (setfl->nelements < 2)
-    error->all(FLERR,"The EAM file must contain at least 2 elements to be used with the eam/cd pair style.");
+    error->all(FLERR,"The EAM file must contain at least 2 elements to be "
+                    "used with the eam/cd pair style.");
 
   // Read in the coefficients of the h polynomial from the end of the EAM file.
 
@@ -502,22 +490,28 @@ void PairEAMCD::read_h_coeff(char *filename)
     // Open potential file
 
     FILE *fptr;
-    char line[MAXLINE];
-    char nextline[MAXLINE];
     int convert_flag = unit_convert_flag;
     fptr = utils::open_potential(filename, lmp, &convert_flag);
     if (fptr == nullptr)
-      error->one(FLERR,"Cannot open EAMCD potential file {}",
-                                   filename);
+      error->one(FLERR,"Cannot open EAMCD potential file {}", filename);
 
     // h coefficients are stored at the end of the file.
-    // Skip to last line of file.
+    // Seek to end of file, read last part into a buffer and
+    // then skip over lines in buffer until reaching the end.
 
-    while (fgets(nextline, MAXLINE, fptr) != nullptr) {
-      strcpy(line, nextline);
-    }
+    platform::fseek(fptr, platform::END_OF_FILE);
+    platform::fseek(fptr, platform::ftell(fptr) - MAXLINE);
+    char *buf = new char[MAXLINE+1];
+    fread(buf, 1, MAXLINE, fptr);
+    buf[MAXLINE] = '\0';        // must 0-terminate buffer for string processing
+    Tokenizer lines(buf, "\n");
+    delete[] buf;
 
-    ValueTokenizer values(line);
+    std::string lastline;
+    while (lines.has_next())
+      lastline = lines.next();
+
+    ValueTokenizer values(lastline);
     int degree = values.next_int();
     nhcoeff = degree+1;
 
@@ -527,10 +521,8 @@ void PairEAMCD::read_h_coeff(char *filename)
     delete[] hcoeff;
     hcoeff = new double[nhcoeff];
 
-    int i = 0;
-    while (values.has_next()) {
-      hcoeff[i++] = values.next_double();
-    }
+    for (int i = 0; i < nhcoeff; ++i)
+      hcoeff[i] = values.next_double();
 
     // Close the potential file.
 
@@ -545,7 +537,6 @@ void PairEAMCD::read_h_coeff(char *filename)
   MPI_Bcast(hcoeff, nhcoeff, MPI_DOUBLE, 0, world);
 }
 
-
 /* ---------------------------------------------------------------------- */
 
 int PairEAMCD::pack_forward_comm(int n, int *list, double *buf,
@@ -572,7 +563,7 @@ int PairEAMCD::pack_forward_comm(int n, int *list, double *buf,
         buf[m++] = rhoB[j];
       }
       return m;
-    } else { ASSERT(false); return 0; }
+    } else return 0;
   } else if (communicationStage == 4) {
     for (i = 0; i < n; i++) {
       j = list[i];
@@ -604,8 +595,6 @@ void PairEAMCD::unpack_forward_comm(int n, int first, double *buf)
         rho[i] = buf[m++];
         rhoB[i] = buf[m++];
       }
-    } else {
-      ASSERT(false);
     }
   } else if (communicationStage == 4) {
     for (i = first; i < last; i++) {
@@ -636,7 +625,7 @@ int PairEAMCD::pack_reverse_comm(int n, int first, double *buf)
         buf[m++] = rhoB[i];
       }
       return m;
-    } else { ASSERT(false); return 0; }
+    } else return 0;
   } else if (communicationStage == 3) {
     for (i = first; i < last; i++) {
       buf[m++] = D_values[i];
@@ -666,8 +655,6 @@ void PairEAMCD::unpack_reverse_comm(int n, int *list, double *buf)
         rho[j] += buf[m++];
         rhoB[j] += buf[m++];
       }
-    } else {
-      ASSERT(false);
     }
   } else if (communicationStage == 3) {
     for (i = 0; i < n; i++) {
diff --git a/src/MANYBODY/pair_eam_cd.h b/src/MANYBODY/pair_eam_cd.h
index 77e909f48b..6846a6cd76 100644
--- a/src/MANYBODY/pair_eam_cd.h
+++ b/src/MANYBODY/pair_eam_cd.h
@@ -120,6 +120,7 @@ class PairEAMCD : public PairEAMAlloy {
     index.p = r * rdr + 1.0;
     index.m = static_cast<int>(index.p);
     index.m = index.m <= (nr - 1) ? index.m : (nr - 1);
+    index.m = index.m > 1 ? index.m : 1;
     index.p -= index.m;
     index.p = index.p <= 1.0 ? index.p : 1.0;
     return index;
@@ -132,6 +133,7 @@ class PairEAMCD : public PairEAMAlloy {
     index.p = rho * rdrho + 1.0;
     index.m = static_cast<int>(index.p);
     index.m = index.m <= (nrho - 1) ? index.m : (nrho - 1);
+    index.m = index.m > 1 ? index.m : 1;
     index.p -= index.m;
     index.p = index.p <= 1.0 ? index.p : 1.0;
     return index;
diff --git a/src/MANYBODY/pair_local_density.cpp b/src/MANYBODY/pair_local_density.cpp
index 5fefd33dea..3fe12bb71d 100644
--- a/src/MANYBODY/pair_local_density.cpp
+++ b/src/MANYBODY/pair_local_density.cpp
@@ -28,6 +28,7 @@
 #include "memory.h"
 #include "neigh_list.h"
 #include "neighbor.h"
+#include "potential_file_reader.h"
 
 #include <cstring>
 
@@ -86,9 +87,8 @@ PairLocalDensity::PairLocalDensity(LAMMPS *lmp) : Pair(lmp)
   fp = nullptr;
   localrho = nullptr;
 
-  // set comm size needed by this pair
-  comm_forward = 1;
-  comm_reverse = 1;
+  // comm sizes needed by this pair style will be set when reading the potential file
+  comm_forward = comm_reverse = 0;
 
   // cite publication
   if (lmp->citeme) lmp->citeme->add(cite_pair_local_density);
@@ -657,35 +657,37 @@ void PairLocalDensity::interpolate_cbspl(int n, double delta,
 
 void PairLocalDensity::parse_file(char *filename) {
 
-  int k, n;
-  int me = comm->me;
-  FILE *fptr;
-  char line[MAXLINE];
-  double ratio, lc2, uc2, denom;
+  // parse potential file header
+  if (comm->me == 0) {
+    PotentialFileReader reader(lmp, filename, "local/density");
 
-  if (me == 0) {
-    fptr = fopen(filename, "r");
-    if (fptr == nullptr)
-      error->one(FLERR,"Cannot open Local Density potential file {}: {}",filename,utils::getsyserror());
+    try {
+
+      // ignore first 2 comment lines
+      reader.skip_line();
+      reader.skip_line();
+
+      // extract number of potentials and number of (frho, rho) points
+      ValueTokenizer values = reader.next_values(2);
+      nLD = values.next_int();
+      nrho = values.next_int();
+
+      const int numld = atom->ntypes*atom->ntypes;
+      if (nLD != numld)
+        error->warning(FLERR, "Expected {} local density potentials but got {}",numld, nLD);
+
+    } catch (TokenizerException &e) {
+      error->one(FLERR, e.what());
+    }
   }
 
-  double *ftmp; // tmp var to extract the complete 2D frho array from file
-
-  // broadcast number of LD potentials and number of (rho,frho) pairs
-  if (me == 0) {
-
-    // first 2 comment lines ignored
-    utils::sfgets(FLERR,line,MAXLINE,fptr,filename,error);
-    utils::sfgets(FLERR,line,MAXLINE,fptr,filename,error);
-
-    // extract number of potentials and number of (frho, rho) points
-    utils::sfgets(FLERR,line,MAXLINE,fptr,filename,error);
-    sscanf(line, "%d %d", &nLD, &nrho);
-    utils::sfgets(FLERR,line,MAXLINE,fptr,filename,error);
-  }
+  // broadcast number of LD potentials and number of (rho,frho) pairs and allocate storage
 
   MPI_Bcast(&nLD,1,MPI_INT,0,world);
   MPI_Bcast(&nrho,1,MPI_INT,0,world);
+  comm_forward = comm_reverse = nLD;
+
+  double *ftmp; // tmp var to extract the complete 2D frho array from file
 
   // setting up all arrays to be read from files and broadcasted
   memory->create(uppercut, nLD, "pairLD:uppercut");
@@ -704,54 +706,65 @@ void PairLocalDensity::parse_file(char *filename) {
   // setting up central and neighbor atom filters
   memory->create(a, nLD, atom->ntypes+1 , "pairLD:a");
   memory->create(b, nLD, atom->ntypes+1, "pairLD:b");
-  if (me == 0) {
-    for (n = 1; n <= atom->ntypes; n++) {
-        for (k = 0; k < nLD; k++) {
-            a[k][n] = 0;
-            b[k][n] = 0;
-        }
+  for (int k = 0; k < nLD; k++) {
+    for (int n = 1; n <= atom->ntypes; n++) {
+      a[k][n] = 0;
+      b[k][n] = 0;
     }
   }
 
- // read file block by block
+  // parse potential file body
+  if (comm->me == 0) {
+    PotentialFileReader reader(lmp, filename, "local/density");
 
-  if (me == 0) {
-    for (k = 0; k < nLD; k++) {
+    try {
+      double ratio, lc2, uc2, denom;
+      ValueTokenizer values("");
+
+      // ignore first 4 lines already processed
+
+      reader.skip_line();
+      reader.skip_line();
+      reader.skip_line();
+      reader.skip_line();
+
+      for (int k = 0; k < nLD; k++) {
 
         // parse upper and lower cut values
-        if (fgets(line,MAXLINE,fptr)==nullptr) break;
-        sscanf(line, "%lf %lf", &lowercut[k], &uppercut[k]);
+        values = reader.next_values(2);
+        lowercut[k] = values.next_double();
+        uppercut[k] = values.next_double();
 
-        // parse and broadcast central atom filter
-        utils::sfgets(FLERR,line, MAXLINE, fptr,filename,error);
-        char *tmp = strtok(line, " /t/n/r/f");
-        while (tmp != nullptr) {
-            a[k][atoi(tmp)] = 1;
-            tmp = strtok(nullptr, " /t/n/r/f");
+        // parse central atom filter
+        values = ValueTokenizer(reader.next_line());
+        while (values.has_next()) {
+          int atype = values.next_int();
+          if ((atype < 1) || (atype > atom->ntypes))
+            throw TokenizerException("Invalid atom type filter value",std::to_string(atype));
+          a[k][atype] = 1;
         }
 
         // parse neighbor atom filter
-        utils::sfgets(FLERR,line, MAXLINE, fptr,filename,error);
-        tmp = strtok(line, " /t/n/r/f");
-        while (tmp != nullptr) {
-            b[k][atoi(tmp)] = 1;
-            tmp = strtok(nullptr, " /t/n/r/f");
+        values = ValueTokenizer(reader.next_line());
+        while (values.has_next()) {
+          int btype = values.next_int();
+          if ((btype < 1) || (btype > atom->ntypes))
+            throw TokenizerException("Invalid atom type filter value",std::to_string(btype));
+          b[k][btype] = 1;
         }
 
         // parse min, max and delta rho values
-        utils::sfgets(FLERR,line, MAXLINE, fptr,filename,error);
-        sscanf(line, "%lf %lf %lf", &rho_min[k], &rho_max[k], &delta_rho[k]);
+        values = reader.next_values(3);
+        rho_min[k] = values.next_double();
+        rho_max[k] = values.next_double();
         // recompute delta_rho from scratch for precision
         delta_rho[k] = (rho_max[k] - rho_min[k]) / (nrho - 1);
 
         // parse tabulated frho values from each line into temporary array
-        for (n = 0; n < nrho; n++) {
-          utils::sfgets(FLERR,line,MAXLINE,fptr,filename,error);
-            sscanf(line, "%lf", &ftmp[k*nrho + n]);
-        }
+        reader.next_dvector(ftmp+k*nrho, nrho);
 
         // ignore blank line at the end of every block
-        utils::sfgets(FLERR,line,MAXLINE,fptr,filename,error);
+        reader.skip_line();
 
         // set coefficients for local density indicator function
         uc2 = uppercut[k] * uppercut[k];
@@ -766,6 +779,10 @@ void PairLocalDensity::parse_file(char *filename) {
         c4[k] = -(3.0 + 3.0*ratio) / (uc2*uc2 * denom);
         c6[k] = 2.0 / (uc2*uc2*uc2 * denom);
       }
+
+    } catch (TokenizerException &e) {
+      error->one(FLERR, e.what());
+    }
   }
 
   // Broadcast all parsed arrays
@@ -777,7 +794,7 @@ void PairLocalDensity::parse_file(char *filename) {
   MPI_Bcast(&c2[0], nLD, MPI_DOUBLE, 0, world);
   MPI_Bcast(&c4[0], nLD, MPI_DOUBLE, 0, world);
   MPI_Bcast(&c6[0], nLD, MPI_DOUBLE, 0, world);
-  for (k = 0; k < nLD; k++) {
+  for (int k = 0; k < nLD; k++) {
       MPI_Bcast(&a[k][1], atom->ntypes, MPI_INT, 0, world);
       MPI_Bcast(&b[k][1], atom->ntypes, MPI_INT, 0, world);
   }
@@ -786,14 +803,12 @@ void PairLocalDensity::parse_file(char *filename) {
   MPI_Bcast(&delta_rho[0], nLD, MPI_DOUBLE, 0, world);
   MPI_Bcast(&ftmp[0], nLD*nrho, MPI_DOUBLE, 0, world);
 
-  if (me == 0) fclose(fptr);
-
   // set up rho and frho arrays
   memory->create(rho, nLD, nrho, "pairLD:rho");
   memory->create(frho, nLD, nrho, "pairLD:frho");
 
-  for (k = 0; k < nLD; k++) {
-    for (n = 0; n < nrho; n++) {
+  for (int k = 0; k < nLD; k++) {
+    for (int n = 0; n < nrho; n++) {
         rho[k][n] = rho_min[k] + n*delta_rho[k];
         frho[k][n] = ftmp[k*nrho + n];
     }
@@ -807,8 +822,8 @@ void PairLocalDensity::parse_file(char *filename) {
    communication routines
 ------------------------------------------------------------------------- */
 
-int PairLocalDensity::pack_comm(int n, int *list, double *buf,
-                                int /* pbc_flag */, int * /* pbc */) {
+int PairLocalDensity::pack_forward_comm(int n, int *list, double *buf,
+                                        int /* pbc_flag */, int * /* pbc */) {
   int i,j,k;
   int m;
 
@@ -820,12 +835,12 @@ int PairLocalDensity::pack_comm(int n, int *list, double *buf,
     }
   }
 
-  return nLD;
+  return m;
 }
 
 /* ---------------------------------------------------------------------- */
 
-void PairLocalDensity::unpack_comm(int n, int first, double *buf) {
+void PairLocalDensity::unpack_forward_comm(int n, int first, double *buf) {
 
   int i,k,m,last;
 
@@ -851,7 +866,7 @@ int PairLocalDensity::pack_reverse_comm(int n, int first, double *buf) {
       buf[m++] = localrho[k][i];
     }
   }
-  return nLD;
+  return m;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -881,4 +896,3 @@ double PairLocalDensity::memory_usage()
   bytes += (double)2 * (nmax*nLD) * sizeof(double);
   return bytes;
 }
-
diff --git a/src/MANYBODY/pair_local_density.h b/src/MANYBODY/pair_local_density.h
index ab968bdc4a..ef67d2c084 100644
--- a/src/MANYBODY/pair_local_density.h
+++ b/src/MANYBODY/pair_local_density.h
@@ -39,8 +39,8 @@ class PairLocalDensity : public Pair {
   double init_one(int, int);
   double single(int, int, int, int, double, double, double, double &);
 
-  virtual int pack_comm(int, int *, double *, int, int *);
-  virtual void unpack_comm(int, int, double *);
+  virtual int pack_forward_comm(int, int *, double *, int, int *);
+  virtual void unpack_forward_comm(int, int, double *);
   int pack_reverse_comm(int, int, double *);
   void unpack_reverse_comm(int, int *, double *);
   double memory_usage();
diff --git a/src/MANYBODY/pair_sw.cpp b/src/MANYBODY/pair_sw.cpp
index fcdda9ef9e..75ece1dc71 100644
--- a/src/MANYBODY/pair_sw.cpp
+++ b/src/MANYBODY/pair_sw.cpp
@@ -293,7 +293,7 @@ void PairSW::read_file(char *file)
 
   if (comm->me == 0) {
     PotentialFileReader reader(lmp, file, "sw", unit_convert_flag);
-    char *line;
+    char * line;
 
     // transparently convert units for supported conversions
 
@@ -328,7 +328,8 @@ void PairSW::read_file(char *file)
 
         if (nparams == maxparam) {
           maxparam += DELTA;
-          params = (Param *) memory->srealloc(params,maxparam*sizeof(Param),"pair:params");
+          params = (Param *) memory->srealloc(params,maxparam*sizeof(Param),
+                                              "pair:params");
 
           // make certain all addional allocated storage is initialized
           // to avoid false positives when checking with valgrind
diff --git a/src/MANYBODY/pair_sw.h b/src/MANYBODY/pair_sw.h
index 5f6f51b57c..6509c460d6 100644
--- a/src/MANYBODY/pair_sw.h
+++ b/src/MANYBODY/pair_sw.h
@@ -29,7 +29,6 @@ class PairSW : public Pair {
   PairSW(class LAMMPS *);
   virtual ~PairSW();
   virtual void compute(int, int);
-  void settings(int, char **);
   virtual void coeff(int, char **);
   virtual double init_one(int, int);
   virtual void init_style();
@@ -54,11 +53,12 @@ class PairSW : public Pair {
   int maxshort;       // size of short neighbor list array
   int *neighshort;    // short neighbor list array
 
+  virtual void settings(int, char **);
   virtual void allocate();
   void read_file(char *);
   virtual void setup_params();
   void twobody(Param *, double, double &, int, double &);
-  void threebody(Param *, Param *, Param *, double, double, double *, double *, double *, double *,
+  virtual void threebody(Param *, Param *, Param *, double, double, double *, double *, double *, double *,
                  int, double &);
 };
 
diff --git a/src/MANYBODY/pair_sw_mod.cpp b/src/MANYBODY/pair_sw_mod.cpp
new file mode 100644
index 0000000000..ce24952fc7
--- /dev/null
+++ b/src/MANYBODY/pair_sw_mod.cpp
@@ -0,0 +1,123 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Jin-Wu Jiang (Shanghai U) and Wengen Ouyang (Wuhan U)
+------------------------------------------------------------------------- */
+
+#include "pair_sw_mod.h"
+
+#include "error.h"
+#include "math_const.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+/* ---------------------------------------------------------------------- */
+
+PairSWMOD::PairSWMOD(LAMMPS *lmp) : PairSW(lmp)
+{
+  delta1 = 0.25;
+  delta2 = 0.35;
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairSWMOD::settings(int narg, char **arg)
+{
+  // process optional keywords
+
+  int iarg = 0;
+
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"maxdelcs") == 0) {
+      if (iarg+3 > narg) error->all(FLERR,"Illegal pair_style command");
+      delta1 = utils::numeric(FLERR,arg[iarg+1],false,lmp);
+      delta2 = utils::numeric(FLERR,arg[iarg+2],false,lmp);
+      iarg += 3;
+      if ((delta1 < 0.0) || (delta1 > 1.0) || (delta2 < 0.0) || (delta2 > 1.0) || (delta1 > delta2))
+        error->all(FLERR,"Illegal values for maxdelcs keyword");
+    } else error->all(FLERR,"Illegal pair_style command");
+  }
+  PairSW::settings(narg-iarg,arg+iarg);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSWMOD::threebody(Param *paramij, Param *paramik, Param *paramijk,
+                       double rsq1, double rsq2,
+                       double *delr1, double *delr2,
+                       double *fj, double *fk, int eflag, double &eng)
+{
+  double r1,rinvsq1,rainv1,gsrainv1,gsrainvsq1,expgsrainv1;
+  double r2,rinvsq2,rainv2,gsrainv2,gsrainvsq2,expgsrainv2;
+  double rinv12,cs,delcs,delcssq,facexp,facrad,frad1,frad2;
+  double facang,facang12,csfacang,csfac1,csfac2,factor;
+
+  r1 = sqrt(rsq1);
+  rinvsq1 = 1.0/rsq1;
+  rainv1 = 1.0/(r1 - paramij->cut);
+  gsrainv1 = paramij->sigma_gamma * rainv1;
+  gsrainvsq1 = gsrainv1*rainv1/r1;
+  expgsrainv1 = exp(gsrainv1);
+
+  r2 = sqrt(rsq2);
+  rinvsq2 = 1.0/rsq2;
+  rainv2 = 1.0/(r2 - paramik->cut);
+  gsrainv2 = paramik->sigma_gamma * rainv2;
+  gsrainvsq2 = gsrainv2*rainv2/r2;
+  expgsrainv2 = exp(gsrainv2);
+
+  rinv12 = 1.0/(r1*r2);
+  cs = (delr1[0]*delr2[0] + delr1[1]*delr2[1] + delr1[2]*delr2[2]) * rinv12;
+  delcs = cs - paramijk->costheta;
+
+  // Modification to delcs
+  if(fabs(delcs) >= delta2) delcs = 0.0;
+  else if(fabs(delcs) < delta2 && fabs(delcs) > delta1) {
+    factor = 0.5 + 0.5*cos(MY_PI*(fabs(delcs) - delta1)/(delta2 - delta1));
+    delcs *= factor;
+  }
+  delcssq = delcs*delcs;
+
+  facexp = expgsrainv1*expgsrainv2;
+
+  // facrad = sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) *
+  //          facexp*delcssq;
+
+  facrad = paramijk->lambda_epsilon * facexp*delcssq;
+  frad1 = facrad*gsrainvsq1;
+  frad2 = facrad*gsrainvsq2;
+  facang = paramijk->lambda_epsilon2 * facexp*delcs;
+  facang12 = rinv12*facang;
+  csfacang = cs*facang;
+  csfac1 = rinvsq1*csfacang;
+
+  fj[0] = delr1[0]*(frad1+csfac1)-delr2[0]*facang12;
+  fj[1] = delr1[1]*(frad1+csfac1)-delr2[1]*facang12;
+  fj[2] = delr1[2]*(frad1+csfac1)-delr2[2]*facang12;
+
+  csfac2 = rinvsq2*csfacang;
+
+  fk[0] = delr2[0]*(frad2+csfac2)-delr1[0]*facang12;
+  fk[1] = delr2[1]*(frad2+csfac2)-delr1[1]*facang12;
+  fk[2] = delr2[2]*(frad2+csfac2)-delr1[2]*facang12;
+
+  if (eflag) eng = facrad;
+}
diff --git a/src/MANYBODY/pair_sw_mod.h b/src/MANYBODY/pair_sw_mod.h
new file mode 100644
index 0000000000..580f031e00
--- /dev/null
+++ b/src/MANYBODY/pair_sw_mod.h
@@ -0,0 +1,94 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(sw/mod,PairSWMOD);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_SW_MOD_H
+#define LMP_PAIR_SW_MOD_H
+
+#include "pair_sw.h"
+
+namespace LAMMPS_NS {
+
+class PairSWMOD : public PairSW {
+ public:
+  PairSWMOD(class LAMMPS *);
+  virtual ~PairSWMOD() {}
+
+ protected:
+  double delta1;
+  double delta2;
+
+  void settings(int, char **);
+  void threebody(Param *, Param *, Param *, double, double, double *, double *, double *, double *,
+                 int, double &);
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair style Stillinger-Weber requires atom IDs
+
+This is a requirement to use the SW potential.
+
+E: Pair style Stillinger-Weber requires newton pair on
+
+See the newton command.  This is a restriction to use the SW
+potential.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: Cannot open Stillinger-Weber potential file %s
+
+The specified SW potential file cannot be opened.  Check that the path
+and name are correct.
+
+E: Incorrect format in Stillinger-Weber potential file
+
+Incorrect number of words per line in the potential file.
+
+E: Illegal Stillinger-Weber parameter
+
+One or more of the coefficients defined in the potential file is
+invalid.
+
+E: Potential file has duplicate entry
+
+The potential file has more than one entry for the same element.
+
+E: Potential file is missing an entry
+
+The potential file does not have a needed entry.
+
+*/
diff --git a/src/MC/fix_charge_regulation.cpp b/src/MC/fix_charge_regulation.cpp
index 078e7eb9db..e469c8442d 100644
--- a/src/MC/fix_charge_regulation.cpp
+++ b/src/MC/fix_charge_regulation.cpp
@@ -985,9 +985,17 @@ int FixChargeRegulation::insert_particle(int ptype, double charge, double rd, do
     modify->create_attribute(m);
 
   }
-  atom->nghost = 0;
-  comm->borders();
   atom->natoms++;
+  atom->nghost = 0;
+  if (atom->tag_enable) {
+    if (atom->tag_enable) {
+      atom->tag_extend();
+      if (atom->map_style != Atom::MAP_NONE) atom->map_init();
+    }
+  }
+  if (triclinic) domain->x2lamda(atom->nlocal);
+  comm->borders();
+  if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
   return m;
 }
 
@@ -1176,6 +1184,61 @@ double FixChargeRegulation::compute_vector(int n) {
   return 0.0;
 }
 
+
+/* ----------------------------------------------------------------------
+   pack entire state of Fix into one write
+------------------------------------------------------------------------- */
+
+void FixChargeRegulation::write_restart(FILE *fp)
+{
+  int n = 0;
+  double list[10];
+  list[n++] = random_equal->state();
+  list[n++] = random_unequal->state();
+  list[n++] = nacid_attempts;
+  list[n++] = nacid_successes;
+  list[n++] = nbase_attempts;
+  list[n++] = nbase_successes;
+  list[n++] = nsalt_attempts;
+  list[n++] = nsalt_successes;
+  list[n++] = ubuf(next_reneighbor).d;
+  list[n++] = ubuf(update->ntimestep).d;
+
+  if (comm->me == 0) {
+    int size = (int) sizeof(list);
+    fwrite(&size,sizeof(int),1,fp);
+    fwrite(list,sizeof(list),1,fp);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   use state info from restart file to restart the Fix
+------------------------------------------------------------------------- */
+
+void FixChargeRegulation::restart(char *buf)
+{
+  int n = 0;
+  double *list = (double *) buf;
+
+  seed = static_cast<int> (list[n++]);
+  random_equal->reset(seed);
+
+  seed = static_cast<int> (list[n++]);
+  random_unequal->reset(seed);
+
+  nacid_attempts  = list[n++];
+  nacid_successes = list[n++];
+  nbase_attempts  = list[n++];
+  nbase_successes = list[n++];
+  nsalt_attempts  = list[n++];
+  nsalt_successes = list[n++];
+
+  next_reneighbor = (bigint) ubuf(list[n++]).i;
+  bigint ntimestep_restart = (bigint) ubuf(list[n++]).i;
+  if (ntimestep_restart != update->ntimestep)
+    error->all(FLERR,"Must not reset timestep when restarting fix gcmc");
+}
+
 void FixChargeRegulation::setThermoTemperaturePointer() {
   int ifix = -1;
   ifix = modify->find_fix(idftemp);
diff --git a/src/MC/fix_charge_regulation.h b/src/MC/fix_charge_regulation.h
index 9fde2d0563..cf518baa03 100644
--- a/src/MC/fix_charge_regulation.h
+++ b/src/MC/fix_charge_regulation.h
@@ -53,6 +53,8 @@ class FixChargeRegulation : public Fix {
   void options(int, char **);
   void setThermoTemperaturePointer();
   double memory_usage();
+  void write_restart(FILE *);
+  void restart(char *);
 
  private:
   int exclusion_group, exclusion_group_bit;
diff --git a/src/MC/fix_gcmc.cpp b/src/MC/fix_gcmc.cpp
index 38007b8e2e..2149b03b6c 100644
--- a/src/MC/fix_gcmc.cpp
+++ b/src/MC/fix_gcmc.cpp
@@ -504,8 +504,7 @@ void FixGCMC::init()
     int flagall;
     MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);
     if (flagall && comm->me == 0)
-      error->all(FLERR,
-       "Fix gcmc cannot exchange individual atoms belonging to a molecule");
+      error->all(FLERR, "Fix gcmc cannot exchange individual atoms belonging to a molecule");
   }
 
   // if molecules are exchanged or moved, check for unset mol IDs
@@ -520,16 +519,13 @@ void FixGCMC::init()
     int flagall;
     MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);
     if (flagall && comm->me == 0)
-      error->all(FLERR,
-       "All mol IDs should be set for fix gcmc group atoms");
+      error->all(FLERR, "All mol IDs should be set for fix gcmc group atoms");
   }
 
   if (exchmode == EXCHMOL || movemode == MOVEMOL)
     if (atom->molecule_flag == 0 || !atom->tag_enable
         || (atom->map_style == Atom::MAP_NONE))
-      error->all(FLERR,
-       "Fix gcmc molecule command requires that "
-       "atoms have molecule attributes");
+      error->all(FLERR, "Fix gcmc molecule command requires that atoms have molecule attributes");
 
   // if rigidflag defined, check for rigid/small fix
   // its molecule template must be same as this one
@@ -541,9 +537,7 @@ void FixGCMC::init()
     fixrigid = modify->fix[ifix];
     int tmp;
     if (&onemols[imol] != (Molecule **) fixrigid->extract("onemol",tmp))
-      error->all(FLERR,
-                 "Fix gcmc and fix rigid/small not using "
-                 "same molecule template ID");
+      error->all(FLERR, "Fix gcmc and fix rigid/small not using same molecule template ID");
   }
 
   // if shakeflag defined, check for SHAKE fix
diff --git a/src/MC/fix_widom.cpp b/src/MC/fix_widom.cpp
index cc0ea7981e..e2989f8981 100644
--- a/src/MC/fix_widom.cpp
+++ b/src/MC/fix_widom.cpp
@@ -310,16 +310,13 @@ void FixWidom::init()
     int flagall;
     MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);
     if (flagall && comm->me == 0)
-      error->all(FLERR,
-       "All mol IDs should be set for fix widom group atoms");
+      error->all(FLERR, "All mol IDs should be set for fix widom group atoms");
   }
 
   if (exchmode == EXCHMOL)
     if (atom->molecule_flag == 0 || !atom->tag_enable
         || (atom->map_style == Atom::MAP_NONE))
-      error->all(FLERR,
-       "Fix widom molecule command requires that "
-       "atoms have molecule attributes");
+      error->all(FLERR, "Fix widom molecule command requires that atoms have molecule attributes");
 
   if (domain->dimension == 2)
     error->all(FLERR,"Cannot use fix widom in a 2d simulation");
diff --git a/src/ML-IAP/mliap_descriptor_so3.cpp b/src/ML-IAP/mliap_descriptor_so3.cpp
index e8bedea377..48748d8565 100644
--- a/src/ML-IAP/mliap_descriptor_so3.cpp
+++ b/src/ML-IAP/mliap_descriptor_so3.cpp
@@ -125,28 +125,27 @@ void MLIAPDescriptorSO3::read_paramfile(char *paramfilename)
 
     // check for keywords with one value per element
 
-    if (strcmp(skeywd.c_str(), "elems") == 0 || strcmp(skeywd.c_str(), "radelems") == 0 ||
-        strcmp(skeywd.c_str(), "welems") == 0) {
+    if ((skeywd == "elems") || (skeywd == "radelems") || (skeywd == "welems"))  {
 
       if (nelementsflag == 0 || nwords != nelements + 1)
         error->all(FLERR, "Incorrect SO3 parameter file");
 
-      if (strcmp(skeywd.c_str(), "elems") == 0) {
+      if (skeywd == "elems") {
         for (int ielem = 0; ielem < nelements; ielem++) {
           elements[ielem] = utils::strdup(skeyval);
           if (ielem < nelements - 1) skeyval = p.next();
         }
 
         elementsflag = 1;
-      } else if (strcmp(skeywd.c_str(), "radelems") == 0) {
+      } else if (skeywd == "radelems")  {
         for (int ielem = 0; ielem < nelements; ielem++) {
-          radelem[ielem] = utils::numeric(FLERR, skeyval.c_str(), false, lmp);
+          radelem[ielem] = utils::numeric(FLERR, skeyval, false, lmp);
           if (ielem < nelements - 1) skeyval = p.next();
         }
         radelemflag = 1;
-      } else if (strcmp(skeywd.c_str(), "welems") == 0) {
+      } else if (skeywd == "welems") {
         for (int ielem = 0; ielem < nelements; ielem++) {
-          wjelem[ielem] = utils::numeric(FLERR, skeyval.c_str(), false, lmp);
+          wjelem[ielem] = utils::numeric(FLERR, skeyval, false, lmp);
           if (ielem < nelements - 1) skeyval = p.next();
         }
         wjelemflag = 1;
@@ -158,23 +157,23 @@ void MLIAPDescriptorSO3::read_paramfile(char *paramfilename)
 
       if (nwords != 2) error->all(FLERR, "Incorrect SO3 parameter file");
 
-      if (strcmp(skeywd.c_str(), "nelems") == 0) {
-        nelements = utils::inumeric(FLERR, skeyval.c_str(), false, lmp);
+      if (skeywd == "nelems") {
+        nelements = utils::inumeric(FLERR, skeyval, false, lmp);
         elements = new char *[nelements];
         memory->create(radelem, nelements, "mliap_so3_descriptor:radelem");
         memory->create(wjelem, nelements, "mliap_so3_descriptor:wjelem");
         nelementsflag = 1;
-      } else if (strcmp(skeywd.c_str(), "rcutfac") == 0) {
-        rcutfac = utils::numeric(FLERR, skeyval.c_str(), false, lmp);
+      } else if (skeywd == "rcutfac") {
+        rcutfac = utils::numeric(FLERR, skeyval, false, lmp);
         rcutfacflag = 1;
-      } else if (strcmp(skeywd.c_str(), "nmax") == 0) {
-        nmax = utils::inumeric(FLERR, skeyval.c_str(), false, lmp);
+      } else if (skeywd == "nmax") {
+        nmax = utils::inumeric(FLERR, skeyval, false, lmp);
         nmaxflag = 1;
-      } else if (strcmp(skeywd.c_str(), "lmax") == 0) {
-        lmax = utils::inumeric(FLERR, skeyval.c_str(), false, lmp);
+      } else if (skeywd == "lmax") {
+        lmax = utils::inumeric(FLERR, skeyval, false, lmp);
         lmaxflag = 1;
-      } else if (strcmp(skeywd.c_str(), "alpha") == 0) {
-        alpha = utils::numeric(FLERR, skeyval.c_str(), false, lmp);
+      } else if (skeywd == "alpha") {
+        alpha = utils::numeric(FLERR, skeyval, false, lmp);
         alphaflag = 1;
       } else
         error->all(FLERR, "Incorrect SO3 parameter file");
diff --git a/src/ML-RANN/pair_rann.cpp b/src/ML-RANN/pair_rann.cpp
index 3e97b638dc..97a3478332 100644
--- a/src/ML-RANN/pair_rann.cpp
+++ b/src/ML-RANN/pair_rann.cpp
@@ -440,7 +440,7 @@ void PairRANN::read_mass(const std::vector<std::string> &line1, const std::vecto
   if (nelements == -1)error->one(filename,linenum-1,"atom types must be defined before mass in potential file.");
   for (int i=0;i<nelements;i++) {
     if (line1[1].compare(elements[i])==0) {
-      mass[i]=utils::numeric(filename,linenum,line2[0].c_str(),true,lmp);
+      mass[i]=utils::numeric(filename,linenum,line2[0],true,lmp);
       return;
     }
   }
@@ -452,7 +452,7 @@ void PairRANN::read_fpe(std::vector<std::string> line,std::vector<std::string> l
   if (nelements == -1)error->one(filename,linenum-1,"atom types must be defined before fingerprints per element in potential file.");
   for (i=0;i<nelementsp;i++) {
     if (line[1].compare(elementsp[i])==0) {
-      fingerprintperelement[i] = utils::inumeric(filename,linenum,line1[0].c_str(),true,lmp);
+      fingerprintperelement[i] = utils::inumeric(filename,linenum,line1[0],true,lmp);
       fingerprints[i] = new RANN::Fingerprint *[fingerprintperelement[i]];
       for (int j=0;j<fingerprintperelement[i];j++) {
         fingerprints[i][j]=new RANN::Fingerprint(this);
@@ -491,7 +491,7 @@ void PairRANN::read_fingerprints(std::vector<std::string> line,std::vector<std::
     fingerprints[i][i1] = create_fingerprint(line1[k].c_str());
     if (fingerprints[i][i1]->n_body_type!=nwords-1) {error->one(filename,linenum,"invalid fingerprint for element combination");}
     k++;
-    fingerprints[i][i1]->init(atomtypes,utils::inumeric(filename,linenum,line1[k++].c_str(),true,lmp));
+    fingerprints[i][i1]->init(atomtypes,utils::inumeric(filename,linenum,line1[k++],true,lmp));
     fingerprintcount[i]++;
   }
   delete[] atomtypes;
@@ -523,7 +523,7 @@ void PairRANN::read_fingerprint_constants(std::vector<std::string> line,std::vec
     for (j=0;j<n_body_type;j++) {
       if (fingerprints[i][k]->atomtypes[j]!=atomtypes[j]) {break;}
       if (j==n_body_type-1) {
-        if (line[nwords-3].compare(fingerprints[i][k]->style)==0 && utils::inumeric(filename,linenum,line[nwords-2].c_str(),true,lmp)==fingerprints[i][k]->id) {
+        if (line[nwords-3].compare(fingerprints[i][k]->style)==0 && utils::inumeric(filename,linenum,line[nwords-2],true,lmp)==fingerprints[i][k]->id) {
           found=true;
           i1 = k;
           break;
@@ -542,7 +542,7 @@ void PairRANN::read_network_layers(std::vector<std::string> line,std::vector<std
   if (nelements == -1)error->one(filename,linenum-1,"atom types must be defined before network layers in potential file.");
   for (i=0;i<nelements;i++) {
     if (line[1].compare(elements[i])==0) {
-      net[i].layers = utils::inumeric(filename,linenum,line1[0].c_str(),true,lmp);
+      net[i].layers = utils::inumeric(filename,linenum,line1[0],true,lmp);
       if (net[i].layers < 1)error->one(filename,linenum,"invalid number of network layers");
       delete[] net[i].dimensions;
       weightdefined[i] = new bool [net[i].layers];
@@ -570,9 +570,9 @@ void PairRANN::read_layer_size(std::vector<std::string> line,std::vector<std::st
   for (i=0;i<nelements;i++) {
     if (line[1].compare(elements[i])==0) {
       if (net[i].layers==0)error->one(filename,linenum-1,"networklayers for each atom type must be defined before the corresponding layer sizes.");
-      int j = utils::inumeric(filename,linenum,line[2].c_str(),true,lmp);
+      int j = utils::inumeric(filename,linenum,line[2],true,lmp);
       if (j>=net[i].layers || j<0) {error->one(filename,linenum,"invalid layer in layer size definition");};
-      net[i].dimensions[j]= utils::inumeric(filename,linenum,line1[0].c_str(),true,lmp);
+      net[i].dimensions[j]= utils::inumeric(filename,linenum,line1[0],true,lmp);
       return;
     }
   }
@@ -587,7 +587,7 @@ void PairRANN::read_weight(std::vector<std::string> line,std::vector<std::string
   for (l=0;l<nelements;l++) {
     if (line[1].compare(elements[l])==0) {
       if (net[l].layers==0)error->one(filename,*linenum-1,"networklayers must be defined before weights.");
-      i=utils::inumeric(filename,*linenum,line[2].c_str(),true,lmp);
+      i=utils::inumeric(filename,*linenum,line[2],true,lmp);
       if (i>=net[l].layers || i<0)error->one(filename,*linenum-1,"invalid weight layer");
       if (net[l].dimensions[i]==0 || net[l].dimensions[i+1]==0) error->one(filename,*linenum-1,"network layer sizes must be defined before corresponding weight");
       net[l].Weights[i] = new double[net[l].dimensions[i]*net[l].dimensions[i+1]];
@@ -595,7 +595,7 @@ void PairRANN::read_weight(std::vector<std::string> line,std::vector<std::string
       nwords = line1.size();
       if (nwords != net[l].dimensions[i])error->one(filename,*linenum,"invalid weights per line");
       for (k=0;k<net[l].dimensions[i];k++) {
-        net[l].Weights[i][k] = utils::numeric(filename,*linenum,line1[k].c_str(),true,lmp);
+        net[l].Weights[i][k] = utils::numeric(filename,*linenum,line1[k],true,lmp);
       }
       for (j=1;j<net[l].dimensions[i+1];j++) {
         ptr = fgets(linetemp,longline,fp);
@@ -606,7 +606,7 @@ void PairRANN::read_weight(std::vector<std::string> line,std::vector<std::string
         nwords = line1.size();
         if (nwords != net[l].dimensions[i])error->one(filename,*linenum,"invalid weights per line");
         for (k=0;k<net[l].dimensions[i];k++) {
-          net[l].Weights[i][j*net[l].dimensions[i]+k] = utils::numeric(filename,*linenum,line1[k].c_str(),true,lmp);
+          net[l].Weights[i][j*net[l].dimensions[i]+k] = utils::numeric(filename,*linenum,line1[k],true,lmp);
         }
       }
       return;
@@ -621,19 +621,19 @@ void PairRANN::read_bias(std::vector<std::string> line,std::vector<std::string>
   for (l=0;l<nelements;l++) {
     if (line[1].compare(elements[l])==0) {
       if (net[l].layers==0)error->one(filename,*linenum-1,"networklayers must be defined before biases.");
-      i=utils::inumeric(filename,*linenum,line[2].c_str(),true,lmp);
+      i=utils::inumeric(filename,*linenum,line[2],true,lmp);
       if (i>=net[l].layers || i<0)error->one(filename,*linenum-1,"invalid bias layer");
       if (net[l].dimensions[i]==0) error->one(filename,*linenum-1,"network layer sizes must be defined before corresponding bias");
       biasdefined[l][i] = true;
       net[l].Biases[i] = new double[net[l].dimensions[i+1]];
-      net[l].Biases[i][0] = utils::numeric(filename,*linenum,line1[0].c_str(),true,lmp);
+      net[l].Biases[i][0] = utils::numeric(filename,*linenum,line1[0],true,lmp);
       for (j=1;j<net[l].dimensions[i+1];j++) {
         ptr=fgets(linetemp,MAXLINE,fp);
         if (ptr==nullptr)error->one(filename,*linenum,"unexpected end of potential file!");
         (*linenum)++;
         Tokenizer values1 = Tokenizer(linetemp,": ,\t_\n");
         line1 = values1.as_vector();
-        net[l].Biases[i][j] = utils::numeric(filename,*linenum,line1[0].c_str(),true,lmp);
+        net[l].Biases[i][j] = utils::numeric(filename,*linenum,line1[0],true,lmp);
       }
       return;
     }
@@ -680,10 +680,10 @@ void PairRANN::read_screening(std::vector<std::string> line,std::vector<std::str
   k = atomtypes[2];
   int index = i*nelements*nelements+j*nelements+k;
   if (line[4].compare("Cmin")==0)  {
-    screening_min[index] = utils::numeric(filename,linenum,line1[0].c_str(),true,lmp);
+    screening_min[index] = utils::numeric(filename,linenum,line1[0],true,lmp);
   }
   else if (line[4].compare("Cmax")==0) {
-    screening_max[index] = utils::numeric(filename,linenum,line1[0].c_str(),true,lmp);
+    screening_max[index] = utils::numeric(filename,linenum,line1[0],true,lmp);
   }
   else error->one(filename,linenum-1,"unrecognized screening keyword");
   delete[] atomtypes;
diff --git a/src/ML-SNAP/pair_snap.cpp b/src/ML-SNAP/pair_snap.cpp
index 1eb078bc61..eafb27f5ba 100644
--- a/src/ML-SNAP/pair_snap.cpp
+++ b/src/ML-SNAP/pair_snap.cpp
@@ -570,8 +570,8 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
     else
       elementflags[jelem] = 1;
 
-    radelem[jelem] = utils::numeric(FLERR,words[1].c_str(),false,lmp);
-    wjelem[jelem] = utils::numeric(FLERR,words[2].c_str(),false,lmp);
+    radelem[jelem] = utils::numeric(FLERR,words[1],false,lmp);
+    wjelem[jelem] = utils::numeric(FLERR,words[2],false,lmp);
 
     if (comm->me == 0)
       utils::logmesg(lmp,"SNAP Element = {}, Radius {}, Weight {}\n",
@@ -672,34 +672,33 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
       utils::logmesg(lmp,"SNAP keyword {} {}\n",keywd,keyval);
 
     if (keywd == "rcutfac") {
-      rcutfac = utils::numeric(FLERR,keyval.c_str(),false,lmp);
+      rcutfac = utils::numeric(FLERR,keyval,false,lmp);
       rcutfacflag = 1;
     } else if (keywd == "twojmax") {
-      twojmax = utils::inumeric(FLERR,keyval.c_str(),false,lmp);
+      twojmax = utils::inumeric(FLERR,keyval,false,lmp);
       twojmaxflag = 1;
     } else if (keywd == "rfac0")
-      rfac0 = utils::numeric(FLERR,keyval.c_str(),false,lmp);
+      rfac0 = utils::numeric(FLERR,keyval,false,lmp);
     else if (keywd == "rmin0")
-      rmin0 = utils::numeric(FLERR,keyval.c_str(),false,lmp);
+      rmin0 = utils::numeric(FLERR,keyval,false,lmp);
     else if (keywd == "switchflag")
-      switchflag = utils::inumeric(FLERR,keyval.c_str(),false,lmp);
+      switchflag = utils::inumeric(FLERR,keyval,false,lmp);
     else if (keywd == "bzeroflag")
-      bzeroflag = utils::inumeric(FLERR,keyval.c_str(),false,lmp);
+      bzeroflag = utils::inumeric(FLERR,keyval,false,lmp);
     else if (keywd == "quadraticflag")
-      quadraticflag = utils::inumeric(FLERR,keyval.c_str(),false,lmp);
+      quadraticflag = utils::inumeric(FLERR,keyval,false,lmp);
     else if (keywd == "chemflag")
-      chemflag = utils::inumeric(FLERR,keyval.c_str(),false,lmp);
+      chemflag = utils::inumeric(FLERR,keyval,false,lmp);
     else if (keywd == "bnormflag")
-      bnormflag = utils::inumeric(FLERR,keyval.c_str(),false,lmp);
+      bnormflag = utils::inumeric(FLERR,keyval,false,lmp);
     else if (keywd == "wselfallflag")
-      wselfallflag = utils::inumeric(FLERR,keyval.c_str(),false,lmp);
+      wselfallflag = utils::inumeric(FLERR,keyval,false,lmp);
     else if (keywd == "chunksize")
-      chunksize = utils::inumeric(FLERR,keyval.c_str(),false,lmp);
+      chunksize = utils::inumeric(FLERR,keyval,false,lmp);
     else if (keywd == "parallelthresh")
-      parallel_thresh = utils::inumeric(FLERR,keyval.c_str(),false,lmp);
+      parallel_thresh = utils::inumeric(FLERR,keyval,false,lmp);
     else
-      error->all(FLERR,"Unknown parameter '{}' in SNAP "
-                                   "parameter file", keywd);
+      error->all(FLERR,"Unknown parameter '{}' in SNAP parameter file", keywd);
   }
 
   if (rcutfacflag == 0 || twojmaxflag == 0)
diff --git a/src/MOFFF/angle_class2_p6.cpp b/src/MOFFF/angle_class2_p6.cpp
index 0fccf6c4cb..e72f9f34fc 100644
--- a/src/MOFFF/angle_class2_p6.cpp
+++ b/src/MOFFF/angle_class2_p6.cpp
@@ -174,6 +174,8 @@ void AngleClass2P6::compute(int eflag, int vflag)
 
     // force & energy for bond-angle term
 
+    dr1 = r1 - ba_r1[type];
+    dr2 = r2 - ba_r2[type];
     aa1 = s * dr1 * ba_k1[type];
     aa2 = s * dr2 * ba_k2[type];
 
@@ -479,6 +481,9 @@ double AngleClass2P6::single(int type, int i1, int i2, int i3)
   double dr2 = r2 - bb_r2[type];
   energy += bb_k[type]*dr1*dr2;
 
+  dr1 = r1 - ba_r1[type];
+  dr2 = r2 - ba_r2[type];
   energy += ba_k1[type]*dr1*dtheta + ba_k2[type]*dr2*dtheta;
+
   return energy;
 }
diff --git a/src/MOLECULE/angle_table.cpp b/src/MOLECULE/angle_table.cpp
index a5ba07b779..2d7356db01 100644
--- a/src/MOLECULE/angle_table.cpp
+++ b/src/MOLECULE/angle_table.cpp
@@ -470,9 +470,9 @@ void AngleTable::compute_table(Table *tb)
 
   memory->create(tb->ang,tablength,"angle:ang");
   memory->create(tb->e,tablength,"angle:e");
-  memory->create(tb->de,tlm1,"angle:de");
+  memory->create(tb->de,tablength,"angle:de");
   memory->create(tb->f,tablength,"angle:f");
-  memory->create(tb->df,tlm1,"angle:df");
+  memory->create(tb->df,tablength,"angle:df");
   memory->create(tb->e2,tablength,"angle:e2");
   memory->create(tb->f2,tablength,"angle:f2");
 
@@ -488,6 +488,9 @@ void AngleTable::compute_table(Table *tb)
     tb->de[i] = tb->e[i+1] - tb->e[i];
     tb->df[i] = tb->f[i+1] - tb->f[i];
   }
+  // get final elements from linear extrapolation
+  tb->de[tlm1] = 2.0*tb->de[tlm1-1] - tb->de[tlm1-2];
+  tb->df[tlm1] = 2.0*tb->df[tlm1-1] - tb->df[tlm1-2];
 
   double ep0 = - tb->f[0];
   double epn = - tb->f[tlm1];
@@ -575,7 +578,7 @@ void AngleTable::spline(double *x, double *y, int n,
   double p,qn,sig,un;
   double *u = new double[n];
 
-  if (yp1 > 0.99e30) y2[0] = u[0] = 0.0;
+  if (yp1 > 0.99e300) y2[0] = u[0] = 0.0;
   else {
     y2[0] = -0.5;
     u[0] = (3.0/(x[1]-x[0])) * ((y[1]-y[0]) / (x[1]-x[0]) - yp1);
@@ -587,7 +590,7 @@ void AngleTable::spline(double *x, double *y, int n,
     u[i] = (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]);
     u[i] = (6.0*u[i] / (x[i+1]-x[i-1]) - sig*u[i-1]) / p;
   }
-  if (ypn > 0.99e30) qn = un = 0.0;
+  if (ypn > 0.99e300) qn = un = 0.0;
   else {
     qn = 0.5;
     un = (3.0/(x[n-1]-x[n-2])) * (ypn - (y[n-1]-y[n-2]) / (x[n-1]-x[n-2]));
@@ -615,8 +618,7 @@ double AngleTable::splint(double *xa, double *ya, double *y2a, int n, double x)
   h = xa[khi]-xa[klo];
   a = (xa[khi]-x) / h;
   b = (x-xa[klo]) / h;
-  y = a*ya[klo] + b*ya[khi] +
-    ((a*a*a-a)*y2a[klo] + (b*b*b-b)*y2a[khi]) * (h*h)/6.0;
+  y = a*ya[klo] + b*ya[khi] + ((a*a*a-a)*y2a[klo] + (b*b*b-b)*y2a[khi]) * (h*h)/6.0;
   return y;
 }
 
@@ -632,8 +634,9 @@ void AngleTable::uf_lookup(int type, double x, double &u, double &f)
 
   double fraction,a,b;
   const Table *tb = &tables[tabindex[type]];
-  int itable = static_cast<int> (x * tb->invdelta);
 
+  // invdelta is based on tablength-1
+  int itable = static_cast<int> (x * tb->invdelta);
   if (itable < 0) itable = 0;
   if (itable >= tablength) itable = tablength-1;
 
@@ -647,11 +650,9 @@ void AngleTable::uf_lookup(int type, double x, double &u, double &f)
     b = (x - tb->ang[itable]) * tb->invdelta;
     a = 1.0 - b;
     u = a * tb->e[itable] + b * tb->e[itable+1] +
-      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) *
-      tb->deltasq6;
+      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6;
     f = a * tb->f[itable] + b * tb->f[itable+1] +
-      ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) *
-      tb->deltasq6;
+      ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) * tb->deltasq6;
   }
 }
 
@@ -681,7 +682,6 @@ void AngleTable::u_lookup(int type, double x, double &u)
     b = (x - tb->ang[itable]) * tb->invdelta;
     a = 1.0 - b;
     u = a * tb->e[itable] + b * tb->e[itable+1] +
-      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) *
-      tb->deltasq6;
+      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6;
   }
 }
diff --git a/src/MOLECULE/bond_table.cpp b/src/MOLECULE/bond_table.cpp
index db1314c76f..48d7377682 100644
--- a/src/MOLECULE/bond_table.cpp
+++ b/src/MOLECULE/bond_table.cpp
@@ -435,9 +435,9 @@ void BondTable::compute_table(Table *tb)
 
   memory->create(tb->r,tablength,"bond:r");
   memory->create(tb->e,tablength,"bond:e");
-  memory->create(tb->de,tlm1,"bond:de");
+  memory->create(tb->de,tablength,"bond:de");
   memory->create(tb->f,tablength,"bond:f");
-  memory->create(tb->df,tlm1,"bond:df");
+  memory->create(tb->df,tablength,"bond:df");
   memory->create(tb->e2,tablength,"bond:e2");
   memory->create(tb->f2,tablength,"bond:f2");
 
@@ -453,6 +453,9 @@ void BondTable::compute_table(Table *tb)
     tb->de[i] = tb->e[i+1] - tb->e[i];
     tb->df[i] = tb->f[i+1] - tb->f[i];
   }
+  // get final elements from linear extrapolation
+  tb->de[tlm1] = 2.0*tb->de[tlm1-1] - tb->de[tlm1-2];
+  tb->df[tlm1] = 2.0*tb->df[tlm1-1] - tb->df[tlm1-2];
 
   double ep0 = - tb->f[0];
   double epn = - tb->f[tlm1];
@@ -538,7 +541,7 @@ void BondTable::spline(double *x, double *y, int n,
   double p,qn,sig,un;
   double *u = new double[n];
 
-  if (yp1 > 0.99e30) y2[0] = u[0] = 0.0;
+  if (yp1 > 0.99e300) y2[0] = u[0] = 0.0;
   else {
     y2[0] = -0.5;
     u[0] = (3.0/(x[1]-x[0])) * ((y[1]-y[0]) / (x[1]-x[0]) - yp1);
@@ -550,7 +553,7 @@ void BondTable::spline(double *x, double *y, int n,
     u[i] = (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]);
     u[i] = (6.0*u[i] / (x[i+1]-x[i-1]) - sig*u[i-1]) / p;
   }
-  if (ypn > 0.99e30) qn = un = 0.0;
+  if (ypn > 0.99e300) qn = un = 0.0;
   else {
     qn = 0.5;
     un = (3.0/(x[n-1]-x[n-2])) * (ypn - (y[n-1]-y[n-2]) / (x[n-1]-x[n-2]));
@@ -578,8 +581,7 @@ double BondTable::splint(double *xa, double *ya, double *y2a, int n, double x)
   h = xa[khi]-xa[klo];
   a = (xa[khi]-x) / h;
   b = (x-xa[klo]) / h;
-  y = a*ya[klo] + b*ya[khi] +
-    ((a*a*a-a)*y2a[klo] + (b*b*b-b)*y2a[khi]) * (h*h)/6.0;
+  y = a*ya[klo] + b*ya[khi] + ((a*a*a-a)*y2a[klo] + (b*b*b-b)*y2a[khi]) * (h*h)/6.0;
   return y;
 }
 
@@ -598,11 +600,9 @@ void BondTable::uf_lookup(int type, double x, double &u, double &f)
   const Table *tb = &tables[tabindex[type]];
   const int itable = static_cast<int> ((x - tb->lo) * tb->invdelta);
   if (itable < 0)
-    error->one(FLERR,"Bond length < table inner cutoff: "
-               "type {} length {:.8}",type,x);
+    error->one(FLERR,"Bond length < table inner cutoff: type {} length {:.8}",type,x);
   else if (itable >= tablength)
-    error->one(FLERR,"Bond length > table outer cutoff: "
-               "type {} length {:.8}",type,x);
+    error->one(FLERR,"Bond length > table outer cutoff: type {} length {:.8}",type,x);
 
   if (tabstyle == LINEAR) {
     fraction = (x - tb->r[itable]) * tb->invdelta;
@@ -614,10 +614,8 @@ void BondTable::uf_lookup(int type, double x, double &u, double &f)
     b = (x - tb->r[itable]) * tb->invdelta;
     a = 1.0 - b;
     u = a * tb->e[itable] + b * tb->e[itable+1] +
-      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) *
-      tb->deltasq6;
+      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6;
     f = a * tb->f[itable] + b * tb->f[itable+1] +
-      ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) *
-      tb->deltasq6;
+      ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) * tb->deltasq6;
   }
 }
diff --git a/src/MOLECULE/dihedral_table.cpp b/src/MOLECULE/dihedral_table.cpp
index a91324dd98..dbca4a85c1 100644
--- a/src/MOLECULE/dihedral_table.cpp
+++ b/src/MOLECULE/dihedral_table.cpp
@@ -189,11 +189,8 @@ static int solve_cyc_tridiag( const double diag[], size_t d_stride,
    spline and splint routines modified from Numerical Recipes
 ------------------------------------------------------------------------- */
 
-static int cyc_spline(double const *xa,
-                      double const *ya,
-                      int n,
-                      double period,
-                      double *y2a, bool warn)
+static int cyc_spline(double const *xa, double const *ya, int n,
+                      double period, double *y2a, bool warn)
 {
 
   double *diag    = new double[n];
@@ -264,12 +261,8 @@ static int cyc_spline(double const *xa,
 //           range should not exceed period (ie xa[n-1] < xa[0] + period).
 //           x must lie in the range:  [(xa[n-1]-period), (xa[0]+period)]
 //           "period" is typically 2*PI.
-static double cyc_splint(double const *xa,
-                         double const *ya,
-                         double const *y2a,
-                         int n,
-                         double period,
-                         double x)
+static double cyc_splint(double const *xa, double const *ya, double const *y2a,
+                         int n, double period, double x)
 {
   int klo = -1;
   int khi = n;
@@ -302,11 +295,8 @@ static double cyc_splint(double const *xa,
 } // cyc_splint()
 
 
-static double cyc_lin(double const *xa,
-                      double const *ya,
-                      int n,
-                      double period,
-                      double x)
+static double cyc_lin(double const *xa, double const *ya,
+                      int n, double period, double x)
 {
   int klo = -1;
   int khi = n;
@@ -337,21 +327,14 @@ static double cyc_lin(double const *xa,
 
 } // cyc_lin()
 
-
-
-
 // cyc_splintD(): Evaluate the deriviative of a cyclic spline at position x,
 //           with n control points at xa[], ya[], with parameters y2a[].
 //           The xa[] must be monotonically increasing and their
 //           range should not exceed period (ie xa[n-1] < xa[0] + period).
 //           x must lie in the range:  [(xa[n-1]-period), (xa[0]+period)]
 //           "period" is typically 2*PI.
-static double cyc_splintD(double const *xa,
-                          double const *ya,
-                          double const *y2a,
-                          int n,
-                          double period,
-                          double x)
+static double cyc_splintD(double const *xa, double const *ya, double const *y2a,
+                          int n, double period, double x)
 {
   int klo = -1;
   int khi = n; // (not n-1)
@@ -829,9 +812,9 @@ void DihedralTable::coeff(int narg, char **arg)
   // We also want the angles to be sorted in increasing order.
   // This messy code fixes these problems with the user's data:
   {
-    double *phifile_tmp = new double [tb->ninput];  //temporary arrays
-    double *ffile_tmp = new double [tb->ninput];  //used for sorting
-    double *efile_tmp = new double [tb->ninput];
+    double *phifile_tmp = new double[tb->ninput];  //temporary arrays
+    double *ffile_tmp = new double[tb->ninput];  //used for sorting
+    double *efile_tmp = new double[tb->ninput];
 
     // After re-imaging, does the range of angles cross the 0 or 2*PI boundary?
     // If so, find the discontinuity:
@@ -1184,8 +1167,7 @@ void DihedralTable::compute_table(Table *tb)
       if (! tb->f_unspecified)
         tb->f[i] = cyc_splint(tb->phifile,tb->ffile,tb->f2file,tb->ninput,MY_2PI,phi);
     }
-  } // if (tabstyle == SPLINE)
-  else if (tabstyle == LINEAR) {
+  } else if (tabstyle == LINEAR) {
     if (! tb->f_unspecified) {
       for (int i = 0; i < tablength; i++) {
         double phi = i*tb->delta;
@@ -1193,8 +1175,7 @@ void DihedralTable::compute_table(Table *tb)
         tb->e[i]= cyc_lin(tb->phifile,tb->efile,tb->ninput,MY_2PI,phi);
         tb->f[i]= cyc_lin(tb->phifile,tb->ffile,tb->ninput,MY_2PI,phi);
       }
-    }
-    else {
+    } else {
       for (int i = 0; i < tablength; i++) {
         double phi = i*tb->delta;
         tb->phi[i] = phi;
@@ -1269,8 +1250,7 @@ void DihedralTable::param_extract(Table *tb, char *line)
       //else if (word == "EQ") {
       //  tb->theta0 = values.next_double();
       //}
-      else error->one(FLERR,"Invalid keyword in dihedral angle "
-                                        "table parameters ({})", word);
+      else error->one(FLERR,"Invalid keyword in dihedral angle table parameters ({})", word);
     }
   } catch (TokenizerException &e) {
     error->one(FLERR, e.what());
diff --git a/src/MOLECULE/fix_cmap.cpp b/src/MOLECULE/fix_cmap.cpp
index 26a0ab2542..a763c5d14c 100644
--- a/src/MOLECULE/fix_cmap.cpp
+++ b/src/MOLECULE/fix_cmap.cpp
@@ -1072,10 +1072,10 @@ void FixCMAP::read_data_header(char *line)
    store CMAP interactions as if newton_bond = OFF, even if actually ON
 ------------------------------------------------------------------------- */
 
-void FixCMAP::read_data_section(char *keyword, int n, char *buf,
+void FixCMAP::read_data_section(char * /*keyword*/, int /*n*/, char *buf,
                                  tagint id_offset)
 {
-  int m,tmp,itype;
+  int m,itype;
   tagint atom1,atom2,atom3,atom4,atom5;
 
   auto lines = utils::split_lines(buf);
diff --git a/src/MOLFILE/molfile_interface.cpp b/src/MOLFILE/molfile_interface.cpp
index 9ce3822082..5fd398570e 100644
--- a/src/MOLFILE/molfile_interface.cpp
+++ b/src/MOLFILE/molfile_interface.cpp
@@ -75,25 +75,25 @@ extern "C" {
 
   /* corresponding table of masses. */
   static const float pte_mass[] = {
-    /* X  */ 0.00000, 1.00794, 4.00260, 6.941, 9.012182, 10.811,
-    /* C  */ 12.0107, 14.0067, 15.9994, 18.9984032, 20.1797,
-    /* Na */ 22.989770, 24.3050, 26.981538, 28.0855, 30.973761,
-    /* S  */ 32.065, 35.453, 39.948, 39.0983, 40.078, 44.955910,
-    /* Ti */ 47.867, 50.9415, 51.9961, 54.938049, 55.845, 58.9332,
-    /* Ni */ 58.6934, 63.546, 65.409, 69.723, 72.64, 74.92160,
-    /* Se */ 78.96, 79.904, 83.798, 85.4678, 87.62, 88.90585,
-    /* Zr */ 91.224, 92.90638, 95.94, 98.0, 101.07, 102.90550,
-    /* Pd */ 106.42, 107.8682, 112.411, 114.818, 118.710, 121.760,
-    /* Te */ 127.60, 126.90447, 131.293, 132.90545, 137.327,
-    /* La */ 138.9055, 140.116, 140.90765, 144.24, 145.0, 150.36,
-    /* Eu */ 151.964, 157.25, 158.92534, 162.500, 164.93032,
-    /* Er */ 167.259, 168.93421, 173.04, 174.967, 178.49, 180.9479,
-    /* W  */ 183.84, 186.207, 190.23, 192.217, 195.078, 196.96655,
-    /* Hg */ 200.59, 204.3833, 207.2, 208.98038, 209.0, 210.0, 222.0,
-    /* Fr */ 223.0, 226.0, 227.0, 232.0381, 231.03588, 238.02891,
-    /* Np */ 237.0, 244.0, 243.0, 247.0, 247.0, 251.0, 252.0, 257.0,
-    /* Md */ 258.0, 259.0, 262.0, 261.0, 262.0, 266.0, 264.0, 269.0,
-    /* Mt */ 268.0, 271.0, 272.0
+    /* X  */ 0.00000f, 1.00794f, 4.00260f, 6.941f, 9.012182f, 10.811f,
+    /* C  */ 12.0107f, 14.0067f, 15.9994f, 18.9984032f, 20.1797f,
+    /* Na */ 22.989770f, 24.3050f, 26.981538f, 28.0855f, 30.973761f,
+    /* S  */ 32.065f, 35.453f, 39.948f, 39.0983f, 40.078f, 44.955910f,
+    /* Ti */ 47.867f, 50.9415f, 51.9961f, 54.938049f, 55.845f, 58.9332f,
+    /* Ni */ 58.6934f, 63.546f, 65.409f, 69.723f, 72.64f, 74.92160f,
+    /* Se */ 78.96f, 79.904f, 83.798f, 85.4678f, 87.62f, 88.90585f,
+    /* Zr */ 91.224f, 92.90638f, 95.94f, 98.0f, 101.07f, 102.90550f,
+    /* Pd */ 106.42f, 107.8682f, 112.411f, 114.818f, 118.710f, 121.760f,
+    /* Te */ 127.60f, 126.90447f, 131.293f, 132.90545f, 137.327f,
+    /* La */ 138.9055f, 140.116f, 140.90765f, 144.24f, 145.0f, 150.36f,
+    /* Eu */ 151.964f, 157.25f, 158.92534f, 162.500f, 164.93032f,
+    /* Er */ 167.259f, 168.93421f, 173.04f, 174.967f, 178.49f, 180.9479f,
+    /* W  */ 183.84f, 186.207f, 190.23f, 192.217f, 195.078f, 196.96655f,
+    /* Hg */ 200.59f, 204.3833f, 207.2f, 208.98038f, 209.0f, 210.0f, 222.0f,
+    /* Fr */ 223.0f, 226.0f, 227.0f, 232.0381f, 231.03588f, 238.02891f,
+    /* Np */ 237.0f, 244.0f, 243.0f, 247.0f, 247.0f, 251.0f, 252.0f, 257.0f,
+    /* Md */ 258.0f, 259.0f, 262.0f, 261.0f, 262.0f, 266.0f, 264.0f, 269.0f,
+    /* Mt */ 268.0f, 271.0f, 272.0f
   };
 
   /*
@@ -107,25 +107,25 @@ extern "C" {
    * Rmin/2 parameters for (SOD, POT, CLA, CAL, MG, CES) by default.
    */
   static const float pte_vdw_radius[] = {
-    /* X  */ 1.5, 1.2, 1.4, 1.82, 2.0, 2.0,
-    /* C  */ 1.7, 1.55, 1.52, 1.47, 1.54,
-    /* Na */ 1.36, 1.18, 2.0, 2.1, 1.8,
-    /* S  */ 1.8, 2.27, 1.88, 1.76, 1.37, 2.0,
-    /* Ti */ 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
-    /* Ni */ 1.63, 1.4, 1.39, 1.07, 2.0, 1.85,
-    /* Se */ 1.9, 1.85, 2.02, 2.0, 2.0, 2.0,
-    /* Zr */ 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
-    /* Pd */ 1.63, 1.72, 1.58, 1.93, 2.17, 2.0,
-    /* Te */ 2.06, 1.98, 2.16, 2.1, 2.0,
-    /* La */ 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
-    /* Eu */ 2.0, 2.0, 2.0, 2.0, 2.0,
-    /* Er */ 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
-    /* W  */ 2.0, 2.0, 2.0, 2.0, 1.72, 1.66,
-    /* Hg */ 1.55, 1.96, 2.02, 2.0, 2.0, 2.0, 2.0,
-    /* Fr */ 2.0, 2.0, 2.0, 2.0, 2.0, 1.86,
-    /* Np */ 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
-    /* Md */ 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
-    /* Mt */ 2.0, 2.0, 2.0
+    /* X  */ 1.5f, 1.2f, 1.4f, 1.82f, 2.0f, 2.0f,
+    /* C  */ 1.7f, 1.55f, 1.52f, 1.47f, 1.54f,
+    /* Na */ 1.36f, 1.18f, 2.0f, 2.1f, 1.8f,
+    /* S  */ 1.8f, 2.27f, 1.88f, 1.76f, 1.37f, 2.0f,
+    /* Ti */ 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f,
+    /* Ni */ 1.63f, 1.4f, 1.39f, 1.07f, 2.0f, 1.85f,
+    /* Se */ 1.9f, 1.85f, 2.02f, 2.0f, 2.0f, 2.0f,
+    /* Zr */ 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f,
+    /* Pd */ 1.63f, 1.72f, 1.58f, 1.93f, 2.17f, 2.0f,
+    /* Te */ 2.06f, 1.98f, 2.16f, 2.1f, 2.0f,
+    /* La */ 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f,
+    /* Eu */ 2.0f, 2.0f, 2.0f, 2.0f, 2.0f,
+    /* Er */ 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f,
+    /* W  */ 2.0f, 2.0f, 2.0f, 2.0f, 1.72f, 1.66f,
+    /* Hg */ 1.55f, 1.96f, 2.02f, 2.0f, 2.0f, 2.0f, 2.0f,
+    /* Fr */ 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.86f,
+    /* Np */ 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f,
+    /* Md */ 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f,
+    /* Mt */ 2.0f, 2.0f, 2.0f
   };
 
   /* lookup functions */
diff --git a/src/MPIIO/dump_atom_mpiio.cpp b/src/MPIIO/dump_atom_mpiio.cpp
index dc3dffbf80..522950ab8b 100644
--- a/src/MPIIO/dump_atom_mpiio.cpp
+++ b/src/MPIIO/dump_atom_mpiio.cpp
@@ -38,7 +38,12 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-DumpAtomMPIIO::DumpAtomMPIIO(LAMMPS *lmp, int narg, char **arg) : DumpAtom(lmp, narg, arg) {}
+DumpAtomMPIIO::DumpAtomMPIIO(LAMMPS *lmp, int narg, char **arg)
+  : DumpAtom(lmp, narg, arg)
+{
+  if (me == 0)
+    error->warning(FLERR,"MPI-IO output is unmaintained and unreliable. Use with caution.");
+}
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/MPIIO/dump_cfg_mpiio.cpp b/src/MPIIO/dump_cfg_mpiio.cpp
index cd0f1bfaaf..978709b787 100644
--- a/src/MPIIO/dump_cfg_mpiio.cpp
+++ b/src/MPIIO/dump_cfg_mpiio.cpp
@@ -51,7 +51,11 @@ using namespace LAMMPS_NS;
 /* ---------------------------------------------------------------------- */
 
 DumpCFGMPIIO::DumpCFGMPIIO(LAMMPS *lmp, int narg, char **arg) :
-  DumpCFG(lmp, narg, arg) {}
+  DumpCFG(lmp, narg, arg)
+{
+  if (me == 0)
+    error->warning(FLERR,"MPI-IO output is unmaintained and unreliable. Use with caution.");
+}
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/MPIIO/dump_custom_mpiio.cpp b/src/MPIIO/dump_custom_mpiio.cpp
index 5e7ce7dbb7..68c5896361 100644
--- a/src/MPIIO/dump_custom_mpiio.cpp
+++ b/src/MPIIO/dump_custom_mpiio.cpp
@@ -53,7 +53,12 @@ enum{ LT, LE, GT, GE, EQ, NEQ };
 // clang-format on
 /* ---------------------------------------------------------------------- */
 
-DumpCustomMPIIO::DumpCustomMPIIO(LAMMPS *lmp, int narg, char **arg) : DumpCustom(lmp, narg, arg) {}
+DumpCustomMPIIO::DumpCustomMPIIO(LAMMPS *lmp, int narg, char **arg)
+  : DumpCustom(lmp, narg, arg)
+{
+  if (me == 0)
+    error->warning(FLERR,"MPI-IO output is unmaintained and unreliable. Use with caution.");
+}
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/MPIIO/dump_xyz_mpiio.cpp b/src/MPIIO/dump_xyz_mpiio.cpp
index f322a0da58..e4bfe4ef13 100644
--- a/src/MPIIO/dump_xyz_mpiio.cpp
+++ b/src/MPIIO/dump_xyz_mpiio.cpp
@@ -52,7 +52,10 @@ enum{LT,LE,GT,GE,EQ,NEQ};
 /* ---------------------------------------------------------------------- */
 
 DumpXYZMPIIO::DumpXYZMPIIO(LAMMPS *lmp, int narg, char **arg) :
-  DumpXYZ(lmp, narg, arg) {}
+  DumpXYZ(lmp, narg, arg) {
+  if (me == 0)
+    error->warning(FLERR,"MPI-IO output is unmaintained and unreliable. Use with caution.");
+}
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/NETCDF/dump_netcdf.cpp b/src/NETCDF/dump_netcdf.cpp
index 5f30c941ca..137d6368c2 100644
--- a/src/NETCDF/dump_netcdf.cpp
+++ b/src/NETCDF/dump_netcdf.cpp
@@ -19,6 +19,7 @@
 #if defined(LMP_HAS_NETCDF)
 
 #include "dump_netcdf.h"
+#include "netcdf_units.h"
 
 #include "atom.h"
 #include "comm.h"
@@ -43,6 +44,9 @@
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
+using NetCDFUnits::Quantity;
+using NetCDFUnits::get_unit_for;
+using NetCDFUnits::LMP_MAX_VAR_DIMS;
 
 static const char NC_FRAME_STR[]         = "frame";
 static const char NC_SPATIAL_STR[]       = "spatial";
@@ -63,7 +67,6 @@ static const char NC_SCALE_FACTOR_STR[]  = "scale_factor";
 static constexpr int THIS_IS_A_FIX      = -1;
 static constexpr int THIS_IS_A_COMPUTE  = -2;
 static constexpr int THIS_IS_A_VARIABLE = -3;
-static constexpr int THIS_IS_A_BIGINT   = -4;
 
 /* ---------------------------------------------------------------------- */
 
@@ -102,6 +105,7 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) :
     int ndims = 1;
     std::string mangled = earg[i];
     bool constant = false;
+    int quantity = Quantity::UNKNOWN;
 
     // name mangling
     // in the AMBER specification
@@ -109,26 +113,32 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) :
       idim = mangled[0] - 'x';
       ndims = 3;
       mangled = "coordinates";
+      quantity = Quantity::DISTANCE;
     } else if ((mangled == "vx") || (mangled == "vy") || (mangled == "vz")) {
       idim = mangled[1] - 'x';
       ndims = 3;
       mangled = "velocities";
+      quantity = Quantity::VELOCITY;
     } else if ((mangled == "xs") || (mangled == "ys") || (mangled == "zs")) {
       idim = mangled[0] - 'x';
       ndims = 3;
       mangled = "scaled_coordinates";
+      // no unit for scaled coordinates
     } else if ((mangled == "xu") || (mangled == "yu") || (mangled == "zu")) {
       idim = mangled[0] - 'x';
       ndims = 3;
       mangled = "unwrapped_coordinates";
+      quantity = Quantity::DISTANCE;
     } else if ((mangled == "fx") || (mangled == "fy") || (mangled == "fz")) {
       idim = mangled[1] - 'x';
       ndims = 3;
       mangled = "forces";
+      quantity = Quantity::FORCE;
     } else if ((mangled == "mux") || (mangled == "muy") || (mangled == "muz")) {
       idim = mangled[2] - 'x';
       ndims = 3;
       mangled = "mu";
+      quantity = Quantity::DIPOLE_MOMENT;
     } else if (utils::strmatch(mangled, "^c_")) {
       std::size_t found = mangled.find('[');
       if (found != std::string::npos) {
@@ -175,13 +185,14 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) :
     perat[inc].constant = constant;
     perat[inc].ndumped = 0;
     perat[inc].field[idim] = i;
+    perat[inc].quantity = quantity;
   }
 
   n_buffer = 0;
   int_buffer = nullptr;
   double_buffer = nullptr;
 
-  double_precision = false;
+  type_nc_real = NC_FLOAT;
 
   thermo = false;
   thermovar = nullptr;
@@ -196,7 +207,7 @@ DumpNetCDF::~DumpNetCDF()
   closefile();
 
   delete[] perat;
-  if (thermovar) delete[] thermovar;
+  delete[] thermovar;
 
   if (int_buffer) memory->sfree(int_buffer);
   if (double_buffer) memory->sfree(double_buffer);
@@ -224,7 +235,7 @@ void DumpNetCDF::openfile()
   }
 
   if (thermo && !singlefile_opened) {
-    if (thermovar)  delete[] thermovar;
+    delete[] thermovar;
     thermovar = new int[output->thermo->nfield];
   }
 
@@ -290,18 +301,18 @@ void DumpNetCDF::openfile()
       NCERRX( nc_inq_dimid(ncid, NC_LABEL_STR, &label_dim), NC_LABEL_STR );
 
       for (int i = 0; i < n_perat; i++) {
-        int dims = perat[i].dims;
-        if (vector_dim[dims] < 0) {
+        int dim = perat[i].dims;
+        if (vector_dim[dim] < 0) {
           char dimstr[1024];
-          if (dims == 3) {
+          if (dim == 3) {
             strcpy(dimstr, NC_SPATIAL_STR);
-          } else if (dims == 6) {
+          } else if (dim == 6) {
             strcpy(dimstr, NC_VOIGT_STR);
           } else {
-            sprintf(dimstr, "vec%i", dims);
+            sprintf(dimstr, "vec%i", dim);
           }
-          if (dims != 1) {
-            NCERRX( nc_inq_dimid(ncid, dimstr, &vector_dim[dims]), dimstr );
+          if (dim != 1) {
+            NCERRX( nc_inq_dimid(ncid, dimstr, &vector_dim[dim]), dimstr );
           }
         }
       }
@@ -339,9 +350,8 @@ void DumpNetCDF::openfile()
       if (framei != 0 && !multifile)
         error->all(FLERR,"at keyword requires use of 'append yes'");
 
-      int dims[NC_MAX_VAR_DIMS];
-      size_t index[NC_MAX_VAR_DIMS], count[NC_MAX_VAR_DIMS];
-      double d[1];
+      int dims[LMP_MAX_VAR_DIMS];
+      size_t index[LMP_MAX_VAR_DIMS], count[LMP_MAX_VAR_DIMS];
 
       if (singlefile_opened) return;
       singlefile_opened = 1;
@@ -373,22 +383,22 @@ void DumpNetCDF::openfile()
       }
 
       // default variables
-      dims[0] = 0;
+      dims[0] = vector_dim[3];
       NCERRX( nc_def_var(ncid, NC_SPATIAL_STR, NC_CHAR, 1, dims, &spatial_var), NC_SPATIAL_STR );
       NCERRX( nc_def_var(ncid, NC_CELL_SPATIAL_STR, NC_CHAR, 1, dims, &cell_spatial_var), NC_CELL_SPATIAL_STR );
-      dims[0] = 0;
+      dims[0] = vector_dim[3];
       dims[1] = label_dim;
       NCERRX( nc_def_var(ncid, NC_CELL_ANGULAR_STR, NC_CHAR, 2, dims, &cell_angular_var), NC_CELL_ANGULAR_STR );
 
       dims[0] = frame_dim;
-      NCERRX( nc_def_var(ncid, NC_TIME_STR, NC_DOUBLE, 1, dims, &time_var), NC_TIME_STR);
+      NCERRX( nc_def_var(ncid, NC_TIME_STR, type_nc_real, 1, dims, &time_var), NC_TIME_STR);
       dims[0] = frame_dim;
       dims[1] = cell_spatial_dim;
-      NCERRX( nc_def_var(ncid, NC_CELL_ORIGIN_STR, NC_DOUBLE, 2, dims, &cell_origin_var), NC_CELL_ORIGIN_STR );
-      NCERRX( nc_def_var(ncid, NC_CELL_LENGTHS_STR, NC_DOUBLE, 2, dims, &cell_lengths_var), NC_CELL_LENGTHS_STR );
+      NCERRX( nc_def_var(ncid, NC_CELL_ORIGIN_STR, type_nc_real, 2, dims, &cell_origin_var), NC_CELL_ORIGIN_STR );
+      NCERRX( nc_def_var(ncid, NC_CELL_LENGTHS_STR, type_nc_real, 2, dims, &cell_lengths_var), NC_CELL_LENGTHS_STR );
       dims[0] = frame_dim;
       dims[1] = cell_angular_dim;
-      NCERRX( nc_def_var(ncid, NC_CELL_ANGLES_STR, NC_DOUBLE, 2, dims, &cell_angles_var), NC_CELL_ANGLES_STR );
+      NCERRX( nc_def_var(ncid, NC_CELL_ANGLES_STR, type_nc_real, 2, dims, &cell_angles_var), NC_CELL_ANGLES_STR );
 
       // variables specified in the input file
       dims[0] = frame_dim;
@@ -397,7 +407,6 @@ void DumpNetCDF::openfile()
 
       for (int i = 0; i < n_perat; i++) {
         nc_type xtype;
-
         // Type mangling
         if (vtype[perat[i].field[0]] == Dump::INT) {
           xtype = NC_INT;
@@ -406,10 +415,7 @@ void DumpNetCDF::openfile()
         } else if (vtype[perat[i].field[0]] == Dump::STRING) {
           error->all(FLERR,"Dump netcdf currently does not support dumping string properties");
         } else {
-          if (double_precision)
-            xtype = NC_DOUBLE;
-          else
-            xtype = NC_FLOAT;
+          xtype = type_nc_real;
         }
 
         if (perat[i].constant) {
@@ -430,6 +436,11 @@ void DumpNetCDF::openfile()
             NCERRX( nc_def_var(ncid, perat[i].name, xtype, 3, dims, &perat[i].var), perat[i].name );
           }
         }
+
+        std::string unit = get_unit_for(update->unit_style, perat[i].quantity, error);
+        if (!unit.empty()) {
+          NCERR( nc_put_att_text(ncid, perat[i].var, NC_UNITS_STR, unit.size(), unit.c_str()) );
+        }
       }
 
       // perframe variables
@@ -437,7 +448,7 @@ void DumpNetCDF::openfile()
         Thermo *th = output->thermo;
         for (int i = 0; i < th->nfield; i++) {
           if (th->vtype[i] == Thermo::FLOAT) {
-            NCERRX( nc_def_var(ncid, th->keyword[i], NC_DOUBLE, 1, dims,
+            NCERRX( nc_def_var(ncid, th->keyword[i], type_nc_real, 1, dims,
                                &thermovar[i]), th->keyword[i] );
           } else if (th->vtype[i] == Thermo::INT) {
             NCERRX( nc_def_var(ncid, th->keyword[i], NC_INT, 1, dims,
@@ -461,43 +472,18 @@ void DumpNetCDF::openfile()
       NCERR( nc_put_att_text(ncid, NC_GLOBAL, "program", 6, "LAMMPS") );
       NCERR( nc_put_att_text(ncid, NC_GLOBAL, "programVersion",strlen(lmp->version), lmp->version) );
 
-      // units
-      if (!strcmp(update->unit_style, "lj")) {
-        NCERR( nc_put_att_text(ncid, time_var, NC_UNITS_STR, 2, "lj") );
-        NCERR( nc_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 2, "lj") );
-        NCERR( nc_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 2, "lj") );
-      } else if (!strcmp(update->unit_style, "real")) {
-        NCERR( nc_put_att_text(ncid, time_var, NC_UNITS_STR, 11, "femtosecond") );
-        NCERR( nc_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 8, "Angstrom") );
-        NCERR( nc_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 8, "Angstrom") );
-      } else if (!strcmp(update->unit_style, "metal")) {
-        NCERR( nc_put_att_text(ncid, time_var, NC_UNITS_STR, 10, "picosecond") );
-        NCERR( nc_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 8, "Angstrom") );
-        NCERR( nc_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 8, "Angstrom") );
-      } else if (!strcmp(update->unit_style, "si")) {
-        NCERR( nc_put_att_text(ncid, time_var, NC_UNITS_STR, 6, "second") );
-        NCERR( nc_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 5, "meter") );
-        NCERR( nc_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 5, "meter") );
-      } else if (!strcmp(update->unit_style, "cgs")) {
-        NCERR( nc_put_att_text(ncid, time_var, NC_UNITS_STR, 6, "second") );
-        NCERR( nc_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 10, "centimeter") );
-        NCERR( nc_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 10, "centimeter") );
-      } else if (!strcmp(update->unit_style, "electron")) {
-        NCERR( nc_put_att_text(ncid, time_var, NC_UNITS_STR, 11, "femtosecond") );
-        NCERR( nc_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 4, "Bohr") );
-        NCERR( nc_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 4, "Bohr") );
-      } else {
-        error->all(FLERR,"Unsupported unit style: {}", update->unit_style);
-      }
+      // units & scale
+      std::string unit = get_unit_for(update->unit_style, Quantity::TIME, error);
+      NCERR( nc_put_att_text(ncid, time_var, NC_UNITS_STR, unit.size(), unit.c_str()) );
 
-      NCERR( nc_put_att_text(ncid, cell_angles_var, NC_UNITS_STR,6, "degree") );
+      unit = get_unit_for(update->unit_style, Quantity::DISTANCE, error);
+      NCERR( nc_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, unit.size(), unit.c_str()) );
+      NCERR( nc_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, unit.size(), unit.c_str()) );
 
-      d[0] = update->dt;
-      NCERR( nc_put_att_double(ncid, time_var, NC_SCALE_FACTOR_STR,NC_DOUBLE, 1, d) );
-      d[0] = 1.0;
-      NCERR( nc_put_att_double(ncid, cell_origin_var, NC_SCALE_FACTOR_STR,NC_DOUBLE, 1, d) );
-      d[0] = 1.0;
-      NCERR( nc_put_att_double(ncid, cell_lengths_var, NC_SCALE_FACTOR_STR,NC_DOUBLE, 1, d) );
+      NCERR( nc_put_att_text(ncid, cell_angles_var, NC_UNITS_STR, 6, "degree") );
+
+      float scale[1] = {static_cast<float>(update->dt)};
+      NCERR( nc_put_att_float(ncid, time_var, NC_SCALE_FACTOR_STR, NC_FLOAT, 1, scale) );
 
       /*
        * Finished with definition
@@ -735,8 +721,8 @@ void DumpNetCDF::write_header(bigint n)
 
 void DumpNetCDF::write_data(int n, double *mybuf)
 {
-  size_t start[NC_MAX_VAR_DIMS], count[NC_MAX_VAR_DIMS];
-  ptrdiff_t stride[NC_MAX_VAR_DIMS];
+  size_t start[LMP_MAX_VAR_DIMS], count[LMP_MAX_VAR_DIMS];
+  ptrdiff_t stride[LMP_MAX_VAR_DIMS];
 
   if (!int_buffer) {
     n_buffer = n;
@@ -872,7 +858,12 @@ int DumpNetCDF::modify_param(int narg, char **arg)
   if (strcmp(arg[iarg],"double") == 0) {
     iarg++;
     if (iarg >= narg) error->all(FLERR,"expected 'yes' or 'no' after 'double' keyword.");
-    double_precision = utils::logical(FLERR,arg[iarg],false,lmp) == 1;
+
+    if (utils::logical(FLERR,arg[iarg],false,lmp) == 1)
+      type_nc_real = NC_DOUBLE;
+    else
+      type_nc_real = NC_FLOAT;
+
     iarg++;
     return 2;
   } else if (strcmp(arg[iarg],"at") == 0) {
@@ -897,10 +888,10 @@ int DumpNetCDF::modify_param(int narg, char **arg)
 void DumpNetCDF::ncerr(int err, const char *descr, int line)
 {
   if (err != NC_NOERR) {
-    if (descr) error->one(FLERR,"NetCDF failed with error '{}' (while accessing '{}') "
-                          " in line {} of {}.", nc_strerror(err), descr, line, __FILE__);
-    else error->one(FLERR,"NetCDF failed with error '{}' in line {} of {}.",
-                    nc_strerror(err), line, __FILE__);
+    if (descr) error->one(__FILE__, line, "NetCDF failed with error '{}' (while accessing '{}') ",
+                          nc_strerror(err), descr);
+    else error->one(__FILE__, line,"NetCDF failed with error '{}' in line {} of {}.",
+                    nc_strerror(err));
   }
 }
 
diff --git a/src/NETCDF/dump_netcdf.h b/src/NETCDF/dump_netcdf.h
index dd9c50873e..f3a4e81d9c 100644
--- a/src/NETCDF/dump_netcdf.h
+++ b/src/NETCDF/dump_netcdf.h
@@ -24,15 +24,12 @@ DumpStyle(netcdf,DumpNetCDF);
 #else
 
 #ifndef LMP_DUMP_NETCDF_H
-#define LMP_DUMP_NETCDFC_H
+#define LMP_DUMP_NETCDF_H
 
 #include "dump_custom.h"
 
 namespace LAMMPS_NS {
 
-const int NC_FIELD_NAME_MAX = 100;
-const int DUMP_NC_MAX_DIMS = 100;
-
 class DumpNetCDF : public DumpCustom {
  public:
   DumpNetCDF(class LAMMPS *, int, char **);
@@ -40,12 +37,16 @@ class DumpNetCDF : public DumpCustom {
   virtual void write();
 
  private:
+  static constexpr int NC_FIELD_NAME_MAX = 100;
+  static constexpr int DUMP_NC_MAX_DIMS = 100;
+
   // per-atoms quantities (positions, velocities, etc.)
   struct nc_perat_t {
     int dims;                        // number of dimensions
     int field[DUMP_NC_MAX_DIMS];     // field indices corresponding to the dim.
     char name[NC_FIELD_NAME_MAX];    // field name
     int var;                         // NetCDF variable
+    int quantity;                    // type of the quantity
 
     bool constant;    // is this property per file (not per frame)
     int ndumped;      // number of enties written for this prop.
@@ -62,8 +63,8 @@ class DumpNetCDF : public DumpCustom {
 
   int *thermovar;    // NetCDF variables for thermo output
 
-  bool double_precision;    // write everything as double precision
-  bool thermo;              // write thermo output to netcdf file
+  int type_nc_real;    // netcdf type to use for real variables: float or double
+  bool thermo;         // write thermo output to netcdf file
 
   bigint n_buffer;          // size of buffer
   bigint *int_buffer;       // buffer for passing data to netcdf
diff --git a/src/NETCDF/dump_netcdf_mpiio.cpp b/src/NETCDF/dump_netcdf_mpiio.cpp
index 0a76203f96..a1c9d20e61 100644
--- a/src/NETCDF/dump_netcdf_mpiio.cpp
+++ b/src/NETCDF/dump_netcdf_mpiio.cpp
@@ -19,6 +19,7 @@
 #if defined(LMP_HAS_PNETCDF)
 
 #include "dump_netcdf_mpiio.h"
+#include "netcdf_units.h"
 
 #include "atom.h"
 #include "comm.h"
@@ -43,6 +44,9 @@
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
+using NetCDFUnits::Quantity;
+using NetCDFUnits::get_unit_for;
+using NetCDFUnits::LMP_MAX_VAR_DIMS;
 
 static const char NC_FRAME_STR[]         = "frame";
 static const char NC_SPATIAL_STR[]       = "spatial";
@@ -63,7 +67,6 @@ static const char NC_SCALE_FACTOR_STR[]  = "scale_factor";
 static constexpr int THIS_IS_A_FIX      = -1;
 static constexpr int THIS_IS_A_COMPUTE  = -2;
 static constexpr int THIS_IS_A_VARIABLE = -3;
-static constexpr int THIS_IS_A_BIGINT   = -4;
 
 /* ---------------------------------------------------------------------- */
 
@@ -101,7 +104,7 @@ DumpNetCDFMPIIO::DumpNetCDFMPIIO(LAMMPS *lmp, int narg, char **arg) :
     int idim = 0;
     int ndims = 1;
     std::string mangled = earg[i];
-    bool constant = false;
+    int quantity = Quantity::UNKNOWN;
 
     // name mangling
     // in the AMBER specification
@@ -109,26 +112,32 @@ DumpNetCDFMPIIO::DumpNetCDFMPIIO(LAMMPS *lmp, int narg, char **arg) :
       idim = mangled[0] - 'x';
       ndims = 3;
       mangled = "coordinates";
+      quantity = Quantity::DISTANCE;
     } else if ((mangled == "vx") || (mangled == "vy") || (mangled == "vz")) {
       idim = mangled[1] - 'x';
       ndims = 3;
       mangled = "velocities";
+      quantity = Quantity::VELOCITY;
     } else if ((mangled == "xs") || (mangled == "ys") || (mangled == "zs")) {
       idim = mangled[0] - 'x';
       ndims = 3;
       mangled = "scaled_coordinates";
+      // no unit for scaled coordinates
     } else if ((mangled == "xu") || (mangled == "yu") || (mangled == "zu")) {
       idim = mangled[0] - 'x';
       ndims = 3;
       mangled = "unwrapped_coordinates";
+      quantity = Quantity::DISTANCE;
     } else if ((mangled == "fx") || (mangled == "fy") || (mangled == "fz")) {
       idim = mangled[1] - 'x';
       ndims = 3;
       mangled = "forces";
+      quantity = Quantity::FORCE;
     } else if ((mangled == "mux") || (mangled == "muy") || (mangled == "muz")) {
       idim = mangled[2] - 'x';
       ndims = 3;
       mangled = "mu";
+      quantity = Quantity::DIPOLE_MOMENT;
     } else if (utils::strmatch(mangled, "^c_")) {
       std::size_t found = mangled.find('[');
       if (found != std::string::npos) {
@@ -173,13 +182,14 @@ DumpNetCDFMPIIO::DumpNetCDFMPIIO(LAMMPS *lmp, int narg, char **arg) :
     }
 
     perat[inc].field[idim] = i;
+    perat[inc].quantity = quantity;
   }
 
   n_buffer = 0;
   int_buffer = nullptr;
   double_buffer = nullptr;
 
-  double_precision = false;
+  type_nc_real = NC_FLOAT;
 
   thermo = false;
   thermovar = nullptr;
@@ -194,7 +204,7 @@ DumpNetCDFMPIIO::~DumpNetCDFMPIIO()
   closefile();
 
   delete[] perat;
-  if (thermovar) delete[] thermovar;
+  delete[] thermovar;
 
   if (int_buffer) memory->sfree(int_buffer);
   if (double_buffer) memory->sfree(double_buffer);
@@ -211,8 +221,7 @@ void DumpNetCDFMPIIO::openfile()
     char *ptr = strchr(filestar,'*');
     *ptr = '\0';
     if (padflag == 0)
-      sprintf(filecurrent,"%s" BIGINT_FORMAT "%s",
-              filestar,update->ntimestep,ptr+1);
+      sprintf(filecurrent,"%s" BIGINT_FORMAT "%s", filestar,update->ntimestep,ptr+1);
     else {
       char bif[8],pad[16];
       strcpy(bif,BIGINT_FORMAT);
@@ -223,7 +232,7 @@ void DumpNetCDFMPIIO::openfile()
   }
 
   if (thermo && !singlefile_opened) {
-    if (thermovar)  delete[] thermovar;
+    delete[] thermovar;
     thermovar = new int[output->thermo->nfield];
   }
 
@@ -275,9 +284,6 @@ void DumpNetCDFMPIIO::openfile()
     if (!platform::file_is_readable(filecurrent))
       error->all(FLERR, "cannot append to non-existent file {}", filecurrent);
 
-    MPI_Offset index[NC_MAX_VAR_DIMS], count[NC_MAX_VAR_DIMS];
-    double d[1];
-
     if (singlefile_opened) return;
     singlefile_opened = 1;
 
@@ -291,18 +297,18 @@ void DumpNetCDFMPIIO::openfile()
     NCERRX( ncmpi_inq_dimid(ncid, NC_LABEL_STR, &label_dim), NC_LABEL_STR );
 
     for (int i = 0; i < n_perat; i++) {
-      int dims = perat[i].dims;
-      if (vector_dim[dims] < 0) {
+      int dim = perat[i].dims;
+      if (vector_dim[dim] < 0) {
         char dimstr[1024];
-        if (dims == 3) {
+        if (dim == 3) {
           strcpy(dimstr, NC_SPATIAL_STR);
-        } else if (dims == 6) {
+        } else if (dim == 6) {
           strcpy(dimstr, NC_VOIGT_STR);
         } else {
-          sprintf(dimstr, "vec%i", dims);
+          sprintf(dimstr, "vec%i", dim);
         }
-        if (dims != 1) {
-          NCERRX( ncmpi_inq_dimid(ncid, dimstr, &vector_dim[dims]), dimstr );
+        if (dim != 1) {
+          NCERRX( ncmpi_inq_dimid(ncid, dimstr, &vector_dim[dim]), dimstr );
         }
       }
     }
@@ -340,9 +346,8 @@ void DumpNetCDFMPIIO::openfile()
     if (framei != 0 && !multifile)
       error->all(FLERR,"at keyword requires use of 'append yes'");
 
-    int dims[NC_MAX_VAR_DIMS];
-    MPI_Offset index[NC_MAX_VAR_DIMS], count[NC_MAX_VAR_DIMS];
-    double d[1];
+    int dims[LMP_MAX_VAR_DIMS];
+    MPI_Offset index[LMP_MAX_VAR_DIMS], count[LMP_MAX_VAR_DIMS];
 
     if (singlefile_opened) return;
     singlefile_opened = 1;
@@ -356,19 +361,24 @@ void DumpNetCDFMPIIO::openfile()
     NCERRX( ncmpi_def_dim(ncid, NC_CELL_ANGULAR_STR, 3, &cell_angular_dim), NC_CELL_ANGULAR_STR );
     NCERRX( ncmpi_def_dim(ncid, NC_LABEL_STR, 10, &label_dim), NC_LABEL_STR );
 
+    if (vector_dim[3] < 0)
+      NCERRX( ncmpi_def_dim(ncid, NC_SPATIAL_STR, 3, &vector_dim[3]), NC_SPATIAL_STR );
+    if (vector_dim[6] < 0)
+      NCERRX( ncmpi_def_dim(ncid, NC_VOIGT_STR, 6, &vector_dim[6]), NC_VOIGT_STR );
+
     for (int i = 0; i < n_perat; i++) {
-      int dims = perat[i].dims;
-      if (vector_dim[dims] < 0) {
+      int dim = perat[i].dims;
+      if (vector_dim[dim] < 0) {
         char dimstr[1024];
-        if (dims == 3) {
+        if (dim == 3) {
           strcpy(dimstr, NC_SPATIAL_STR);
-        } else if (dims == 6) {
+        } else if (dim == 6) {
           strcpy(dimstr, NC_VOIGT_STR);
         } else {
-          sprintf(dimstr, "vec%i", dims);
+          sprintf(dimstr, "vec%i", dim);
         }
-        if (dims != 1) {
-          NCERRX( ncmpi_def_dim(ncid, dimstr, dims, &vector_dim[dims]), dimstr );
+        if (dim != 1) {
+          NCERRX( ncmpi_def_dim(ncid, dimstr, dim, &vector_dim[dim]), dimstr );
         }
       }
     }
@@ -380,16 +390,15 @@ void DumpNetCDFMPIIO::openfile()
     dims[0] = vector_dim[3];
     dims[1] = label_dim;
     NCERRX( ncmpi_def_var(ncid, NC_CELL_ANGULAR_STR, NC_CHAR, 2, dims, &cell_angular_var), NC_CELL_ANGULAR_STR );
-
     dims[0] = frame_dim;
-    NCERRX( ncmpi_def_var(ncid, NC_TIME_STR, NC_DOUBLE, 1, dims, &time_var), NC_TIME_STR);
+    NCERRX( ncmpi_def_var(ncid, NC_TIME_STR, type_nc_real, 1, dims, &time_var), NC_TIME_STR);
     dims[0] = frame_dim;
     dims[1] = cell_spatial_dim;
-    NCERRX( ncmpi_def_var(ncid, NC_CELL_ORIGIN_STR, NC_DOUBLE, 2, dims, &cell_origin_var), NC_CELL_ORIGIN_STR );
-    NCERRX( ncmpi_def_var(ncid, NC_CELL_LENGTHS_STR, NC_DOUBLE, 2, dims, &cell_lengths_var), NC_CELL_LENGTHS_STR );
+    NCERRX( ncmpi_def_var(ncid, NC_CELL_ORIGIN_STR, type_nc_real, 2, dims, &cell_origin_var), NC_CELL_ORIGIN_STR );
+    NCERRX( ncmpi_def_var(ncid, NC_CELL_LENGTHS_STR, type_nc_real, 2, dims, &cell_lengths_var), NC_CELL_LENGTHS_STR );
     dims[0] = frame_dim;
     dims[1] = cell_angular_dim;
-    NCERRX( ncmpi_def_var(ncid, NC_CELL_ANGLES_STR, NC_DOUBLE, 2, dims, &cell_angles_var), NC_CELL_ANGLES_STR );
+    NCERRX( ncmpi_def_var(ncid, NC_CELL_ANGLES_STR, type_nc_real, 2, dims, &cell_angles_var), NC_CELL_ANGLES_STR );
 
     // variables specified in the input file
     dims[0] = frame_dim;
@@ -405,10 +414,7 @@ void DumpNetCDFMPIIO::openfile()
       } else if (vtype[perat[i].field[0]] == Dump::BIGINT) {
         xtype = NC_INT64;
       } else {
-        if (double_precision)
-          xtype = NC_DOUBLE;
-        else
-          xtype = NC_FLOAT;
+        xtype = type_nc_real;
       }
 
       if (perat[i].dims == 1) {
@@ -418,6 +424,11 @@ void DumpNetCDFMPIIO::openfile()
         dims[2] = vector_dim[perat[i].dims];
         NCERRX( ncmpi_def_var(ncid, perat[i].name, xtype, 3, dims, &perat[i].var), perat[i].name );
       }
+
+      std::string unit = get_unit_for(update->unit_style, perat[i].quantity, error);
+      if (!unit.empty()) {
+        NCERR( ncmpi_put_att_text(ncid, perat[i].var, NC_UNITS_STR, unit.size(), unit.c_str()) );
+      }
     }
 
     // perframe variables
@@ -425,7 +436,7 @@ void DumpNetCDFMPIIO::openfile()
       Thermo *th = output->thermo;
       for (int i = 0; i < th->nfield; i++) {
         if (th->vtype[i] == Thermo::FLOAT) {
-          NCERRX( ncmpi_def_var(ncid, th->keyword[i], NC_DOUBLE, 1, dims, &thermovar[i]), th->keyword[i] );
+          NCERRX( ncmpi_def_var(ncid, th->keyword[i], type_nc_real, 1, dims, &thermovar[i]), th->keyword[i] );
         } else if (th->vtype[i] == Thermo::INT) {
           NCERRX( ncmpi_def_var(ncid, th->keyword[i], NC_INT, 1, dims, &thermovar[i]), th->keyword[i] );
         } else if (th->vtype[i] == Thermo::BIGINT) {
@@ -445,43 +456,18 @@ void DumpNetCDFMPIIO::openfile()
     NCERR( ncmpi_put_att_text(ncid, NC_GLOBAL, "program", 6, "LAMMPS") );
     NCERR( ncmpi_put_att_text(ncid, NC_GLOBAL, "programVersion", strlen(lmp->version), lmp->version) );
 
-    // units
-    if (!strcmp(update->unit_style, "lj")) {
-      NCERR( ncmpi_put_att_text(ncid, time_var, NC_UNITS_STR, 2, "lj") );
-      NCERR( ncmpi_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 2, "lj") );
-      NCERR( ncmpi_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 2, "lj") );
-    } else if (!strcmp(update->unit_style, "real")) {
-      NCERR( ncmpi_put_att_text(ncid, time_var, NC_UNITS_STR, 11, "femtosecond") );
-      NCERR( ncmpi_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 8, "Angstrom") );
-      NCERR( ncmpi_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 8, "Angstrom") );
-    } else if (!strcmp(update->unit_style, "metal")) {
-      NCERR( ncmpi_put_att_text(ncid, time_var, NC_UNITS_STR, 10, "picosecond") );
-      NCERR( ncmpi_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 8, "Angstrom") );
-      NCERR( ncmpi_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 8, "Angstrom") );
-    } else if (!strcmp(update->unit_style, "si")) {
-      NCERR( ncmpi_put_att_text(ncid, time_var, NC_UNITS_STR, 6, "second") );
-      NCERR( ncmpi_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 5, "meter") );
-      NCERR( ncmpi_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 5, "meter") );
-    } else if (!strcmp(update->unit_style, "cgs")) {
-      NCERR( ncmpi_put_att_text(ncid, time_var, NC_UNITS_STR, 6, "second") );
-      NCERR( ncmpi_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 10, "centimeter") );
-      NCERR( ncmpi_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 10, "centimeter") );
-    } else if (!strcmp(update->unit_style, "electron")) {
-      NCERR( ncmpi_put_att_text(ncid, time_var, NC_UNITS_STR, 11, "femtosecond") );
-      NCERR( ncmpi_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, 4, "Bohr") );
-      NCERR( ncmpi_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, 4, "Bohr") );
-    } else {
-      error->all(FLERR,"Unsupported unit style: {}", update->unit_style);
-    }
+    // units & scale
+    std::string unit = get_unit_for(update->unit_style, Quantity::TIME, error);
+    NCERR( ncmpi_put_att_text(ncid, time_var, NC_UNITS_STR, unit.size(), unit.c_str()) );
+
+    unit = get_unit_for(update->unit_style, Quantity::DISTANCE, error);
+    NCERR( ncmpi_put_att_text(ncid, cell_origin_var, NC_UNITS_STR, unit.size(), unit.c_str()) );
+    NCERR( ncmpi_put_att_text(ncid, cell_lengths_var, NC_UNITS_STR, unit.size(), unit.c_str()) );
 
     NCERR( ncmpi_put_att_text(ncid, cell_angles_var, NC_UNITS_STR, 6, "degree") );
 
-    d[0] = update->dt;
-    NCERR( ncmpi_put_att_double(ncid, time_var, NC_SCALE_FACTOR_STR, NC_DOUBLE, 1, d) );
-    d[0] = 1.0;
-    NCERR( ncmpi_put_att_double(ncid, cell_origin_var, NC_SCALE_FACTOR_STR, NC_DOUBLE, 1, d) );
-    d[0] = 1.0;
-    NCERR( ncmpi_put_att_double(ncid, cell_lengths_var, NC_SCALE_FACTOR_STR, NC_DOUBLE, 1, d) );
+    float scale[1] = {static_cast<float>(update->dt)};
+    NCERR( ncmpi_put_att_float(ncid, time_var, NC_SCALE_FACTOR_STR, NC_FLOAT, 1, scale) );
 
     /*
      * Finished with definition
@@ -502,16 +488,13 @@ void DumpNetCDFMPIIO::openfile()
       index[1] = 0;
       count[0] = 1;
       count[1] = 5;
-      NCERR( ncmpi_put_vara_text(ncid, cell_angular_var, index, count,
-                                 "alpha") );
+      NCERR( ncmpi_put_vara_text(ncid, cell_angular_var, index, count, "alpha") );
       index[0] = 1;
       count[1] = 4;
-      NCERR( ncmpi_put_vara_text(ncid, cell_angular_var, index, count,
-                                 "beta") );
+      NCERR( ncmpi_put_vara_text(ncid, cell_angular_var, index, count, "beta") );
       index[0] = 2;
       count[1] = 5;
-      NCERR( ncmpi_put_vara_text(ncid, cell_angular_var, index, count,
-                                 "gamma") );
+      NCERR( ncmpi_put_vara_text(ncid, cell_angular_var, index, count, "gamma") );
     }
 
     NCERR( ncmpi_end_indep_data(ncid) );
@@ -753,8 +736,7 @@ void DumpNetCDFMPIIO::write_time_and_cell()
 
 void DumpNetCDFMPIIO::write_data(int n, double *mybuf)
 {
-  MPI_Offset start[NC_MAX_VAR_DIMS], count[NC_MAX_VAR_DIMS];
-  MPI_Offset stride[NC_MAX_VAR_DIMS];
+  MPI_Offset start[LMP_MAX_VAR_DIMS], count[LMP_MAX_VAR_DIMS], stride[LMP_MAX_VAR_DIMS];
 
   if (!int_buffer) {
     n_buffer = std::max(1, n);
@@ -867,7 +849,12 @@ int DumpNetCDFMPIIO::modify_param(int narg, char **arg)
   if (strcmp(arg[iarg],"double") == 0) {
     iarg++;
     if (iarg >= narg) error->all(FLERR,"expected 'yes' or 'no' after 'double' keyword.");
-    double_precision = utils::logical(FLERR,arg[iarg],false,lmp) == 1;
+
+    if (utils::logical(FLERR,arg[iarg],false,lmp) == 1)
+      type_nc_real = NC_DOUBLE;
+    else
+      type_nc_real = NC_FLOAT;
+
     iarg++;
     return 2;
   } else if (strcmp(arg[iarg],"at") == 0) {
@@ -892,10 +879,9 @@ int DumpNetCDFMPIIO::modify_param(int narg, char **arg)
 void DumpNetCDFMPIIO::ncerr(int err, const char *descr, int line)
 {
   if (err != NC_NOERR) {
-    if (descr) error->one(FLERR,"NetCDF failed with error '{}' (while accessing '{}') "
-                          " in line {} of {}.", ncmpi_strerror(err), descr, line, __FILE__);
-    else error->one(FLERR,"NetCDF failed with error '{}' in line {} of {}.",
-                    ncmpi_strerror(err), line, __FILE__);
+    if (descr) error->one(__FILE__, line, "NetCDF failed with error '{}' (while accessing '{}') ",
+                          ncmpi_strerror(err), descr);
+    else error->one(__FILE__, line,"NetCDF failed with error '{}'.", ncmpi_strerror(err));
   }
 }
 
diff --git a/src/NETCDF/dump_netcdf_mpiio.h b/src/NETCDF/dump_netcdf_mpiio.h
index 56c07fc3d3..ec6cbaec04 100644
--- a/src/NETCDF/dump_netcdf_mpiio.h
+++ b/src/NETCDF/dump_netcdf_mpiio.h
@@ -30,9 +30,6 @@ DumpStyle(netcdf/mpiio,DumpNetCDFMPIIO);
 
 namespace LAMMPS_NS {
 
-const int NC_MPIIO_FIELD_NAME_MAX = 100;
-const int DUMP_NC_MPIIO_MAX_DIMS = 100;
-
 class DumpNetCDFMPIIO : public DumpCustom {
  public:
   DumpNetCDFMPIIO(class LAMMPS *, int, char **);
@@ -40,16 +37,18 @@ class DumpNetCDFMPIIO : public DumpCustom {
   virtual void write();
 
  private:
+  static constexpr int NC_MPIIO_FIELD_NAME_MAX = 100;
+  static constexpr int DUMP_NC_MPIIO_MAX_DIMS = 100;
+
   // per-atoms quantities (positions, velocities, etc.)
   struct nc_perat_t {
     int dims;                              // number of dimensions
     int field[DUMP_NC_MPIIO_MAX_DIMS];     // field indices corresponding to the dim.
     char name[NC_MPIIO_FIELD_NAME_MAX];    // field name
     int var;                               // NetCDF variable
+    int quantity;                          // type of the quantity
   };
 
-  typedef void (DumpNetCDFMPIIO::*funcptr_t)(void *);
-
   int framei;    // current frame index
   int blocki;    // current block index
   int ndata;     // number of data blocks to expect
@@ -61,8 +60,8 @@ class DumpNetCDFMPIIO : public DumpCustom {
 
   int *thermovar;    // NetCDF variables for thermo output
 
-  bool double_precision;    // write everything as double precision
-  bool thermo;              // write thermo output to netcdf file
+  int type_nc_real;    // netcdf type to use for real variables: float or double
+  bool thermo;         // write thermo output to netcdf file
 
   bigint n_buffer;          // size of buffer
   bigint *int_buffer;       // buffer for passing data to netcdf
diff --git a/src/NETCDF/netcdf_units.cpp b/src/NETCDF/netcdf_units.cpp
new file mode 100644
index 0000000000..0ee0ebbde0
--- /dev/null
+++ b/src/NETCDF/netcdf_units.cpp
@@ -0,0 +1,145 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Lars Pastewka (University of Freiburg)
+------------------------------------------------------------------------- */
+
+#if defined(LMP_HAS_NETCDF) || defined(LMP_HAS_PNETCDF)
+
+#include "netcdf_units.h"
+
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+std::string NetCDFUnits::get_unit_for(const char *unit_style, int quantity, Error *error)
+{
+  if (!strcmp(unit_style, "lj")) {
+    if (quantity == Quantity::UNKNOWN) {
+      return "";
+    } else {
+      return "lj";
+    }
+  } else if (!strcmp(unit_style, "real")) {
+    switch (quantity) {
+      case Quantity::UNKNOWN:
+        return "";
+      case Quantity::TIME:
+        return "femtosecond";
+      case Quantity::DISTANCE:
+        return "angstrom";
+      case Quantity::VELOCITY:
+        return "angstrom/femtosecond";
+      case Quantity::FORCE:
+        return "(Kcal/mol)/angstrom)";
+      case Quantity::DIPOLE_MOMENT:
+        return "e * angstrom";
+    }
+  } else if (!strcmp(unit_style, "metal")) {
+    switch (quantity) {
+      case Quantity::UNKNOWN:
+        return "";
+      case Quantity::TIME:
+        return "picosecond";
+      case Quantity::DISTANCE:
+        return "angstrom";
+      case Quantity::VELOCITY:
+        return "angstrom/picosecond";
+      case Quantity::FORCE:
+        return "eV/angstrom";
+      case Quantity::DIPOLE_MOMENT:
+        return "e * angstrom";
+    }
+  } else if (!strcmp(unit_style, "si")) {
+    switch (quantity) {
+      case Quantity::UNKNOWN:
+        return "";
+      case Quantity::TIME:
+        return "second";
+      case Quantity::DISTANCE:
+        return "meter";
+      case Quantity::VELOCITY:
+        return "meter/second";
+      case Quantity::FORCE:
+        return "Newton";
+      case Quantity::DIPOLE_MOMENT:
+        return "Coulomb * meter";
+    }
+  } else if (!strcmp(unit_style, "cgs")) {
+    switch (quantity) {
+      case Quantity::UNKNOWN:
+        return "";
+      case Quantity::TIME:
+        return "second";
+      case Quantity::DISTANCE:
+        return "centimeter";
+      case Quantity::VELOCITY:
+        return "centimeter/second";
+      case Quantity::FORCE:
+        return "dynes";
+      case Quantity::DIPOLE_MOMENT:
+        return "statcoul * cm";
+    }
+  } else if (!strcmp(unit_style, "electron")) {
+    switch (quantity) {
+      case Quantity::UNKNOWN:
+        return "";
+      case Quantity::TIME:
+        return "femtoseconds";
+      case Quantity::DISTANCE:
+        return "Bohr";
+      case Quantity::VELOCITY:
+        return "Bohr/atomic time units";
+      case Quantity::FORCE:
+        return "Hartree/Bohr";
+      case Quantity::DIPOLE_MOMENT:
+        return "Debye";
+    }
+  } else if (!strcmp(unit_style, "micro")) {
+    switch (quantity) {
+      case Quantity::UNKNOWN:
+        return "";
+      case Quantity::TIME:
+        return "microseconds";
+      case Quantity::DISTANCE:
+        return "micrometers";
+      case Quantity::VELOCITY:
+        return "micrometers/microsecond";
+      case Quantity::FORCE:
+        return "picogram * micrometer/microsecond^2";
+      case Quantity::DIPOLE_MOMENT:
+        return "picocoulomb * micrometer";
+    }
+  } else if (!strcmp(unit_style, "nano")) {
+    switch (quantity) {
+      case Quantity::UNKNOWN:
+        return "";
+      case Quantity::TIME:
+        return "nanoseconds";
+      case Quantity::DISTANCE:
+        return "nanometers";
+      case Quantity::VELOCITY:
+        return "nanometers/nanosecond";
+      case Quantity::FORCE:
+        return "attogram * nanometer/nanosecond^2";
+      case Quantity::DIPOLE_MOMENT:
+        return "e * nanometer";
+    }
+  }
+
+  error->all(FLERR, "Unsupported unit style: {}", unit_style);
+  return "";
+}
+
+#endif
diff --git a/src/NETCDF/netcdf_units.h b/src/NETCDF/netcdf_units.h
new file mode 100644
index 0000000000..85f9b05888
--- /dev/null
+++ b/src/NETCDF/netcdf_units.h
@@ -0,0 +1,49 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Lars Pastewka (University of Freiburg), Guillaume Fraux (EPFL)
+------------------------------------------------------------------------- */
+
+#ifndef LMP_NETCDF_UNITS_H
+#define LMP_NETCDF_UNITS_H
+
+#if defined(LMP_HAS_NETCDF) || defined(LMP_HAS_PNETCDF)
+
+#include <string>
+
+namespace LAMMPS_NS {
+class Error;
+
+namespace NetCDFUnits {
+  // type of quantity for per-atom values (used to get the unit)
+  enum Quantity {
+    UNKNOWN = 0,
+    TIME,
+    DISTANCE,
+    VELOCITY,
+    FORCE,
+    DIPOLE_MOMENT,
+  };
+
+  // for compatibility with older NetCDF versions
+  static constexpr int LMP_MAX_VAR_DIMS = 1024;
+
+  // get the name of the unit for the given `quantity` in the given LAMMPS
+  // `unit_style` any error will be reported through `error`
+  std::string get_unit_for(const char *unit_style, int quantity, Error *error);
+}    // namespace NetCDFUnits
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
diff --git a/src/OPENMP/angle_class2_omp.cpp b/src/OPENMP/angle_class2_omp.cpp
index 02ad63cd56..54aa062d44 100644
--- a/src/OPENMP/angle_class2_omp.cpp
+++ b/src/OPENMP/angle_class2_omp.cpp
@@ -175,6 +175,8 @@ void AngleClass2OMP::eval(int nfrom, int nto, ThrData * const thr)
 
     // force & energy for bond-angle term
 
+    dr1 = r1 - ba_r1[type];
+    dr2 = r2 - ba_r2[type];
     aa1 = s * dr1 * ba_k1[type];
     aa2 = s * dr2 * ba_k2[type];
 
diff --git a/src/OPENMP/angle_table_omp.cpp b/src/OPENMP/angle_table_omp.cpp
index 892f9295a5..cca34a67f7 100644
--- a/src/OPENMP/angle_table_omp.cpp
+++ b/src/OPENMP/angle_table_omp.cpp
@@ -16,15 +16,16 @@
    Contributing author: Axel Kohlmeyer (Temple U)
 ------------------------------------------------------------------------- */
 
-#include "omp_compat.h"
 #include "angle_table_omp.h"
-#include <cmath>
+
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "neighbor.h"
 
+#include <cmath>
 
+#include "omp_compat.h"
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
diff --git a/src/OPENMP/bond_table_omp.cpp b/src/OPENMP/bond_table_omp.cpp
index faadca456a..dcc13c85c9 100644
--- a/src/OPENMP/bond_table_omp.cpp
+++ b/src/OPENMP/bond_table_omp.cpp
@@ -16,16 +16,16 @@
    Contributing author: Axel Kohlmeyer (Temple U)
 ------------------------------------------------------------------------- */
 
-#include "omp_compat.h"
 #include "bond_table_omp.h"
+
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "neighbor.h"
 
-
 #include <cmath>
 
+#include "omp_compat.h"
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
diff --git a/src/OPENMP/fix_rigid_small_omp.h b/src/OPENMP/fix_rigid_small_omp.h
index fcbb78534d..82b0d1b8cd 100644
--- a/src/OPENMP/fix_rigid_small_omp.h
+++ b/src/OPENMP/fix_rigid_small_omp.h
@@ -21,12 +21,16 @@ FixStyle(rigid/small/omp,FixRigidSmallOMP);
 #define LMP_FIX_RIGID_SMALL_OMP_H
 
 #include "fix_rigid_small.h"
+#include "force.h"
 
 namespace LAMMPS_NS {
 
 class FixRigidSmallOMP : public FixRigidSmall {
  public:
-  FixRigidSmallOMP(class LAMMPS *lmp, int narg, char **args) : FixRigidSmall(lmp, narg, args){};
+  FixRigidSmallOMP(class LAMMPS *lmp, int narg, char **args) : FixRigidSmall(lmp, narg, args)
+  {
+    centroidstressflag = CENTROID_NOTAVAIL;
+  }
   virtual ~FixRigidSmallOMP(){};
 
   virtual void initial_integrate(int);
diff --git a/src/OPENMP/pair_sw_mod_omp.cpp b/src/OPENMP/pair_sw_mod_omp.cpp
new file mode 100644
index 0000000000..d87abc1d2c
--- /dev/null
+++ b/src/OPENMP/pair_sw_mod_omp.cpp
@@ -0,0 +1,119 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+                    and Wengen Ouyang (Wuhan U)
+------------------------------------------------------------------------- */
+
+#include "pair_sw_mod_omp.h"
+
+#include "error.h"
+#include "math_const.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+PairSWMODOMP::PairSWMODOMP(LAMMPS *lmp) : PairSWOMP(lmp)
+{
+  delta1 = 0.25;
+  delta2 = 0.35;
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairSWMODOMP::settings(int narg, char **arg)
+{
+  // process optional keywords
+
+  int iarg = 0;
+
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"maxdelcs") == 0) {
+      if (iarg+3 > narg) error->all(FLERR,"Illegal pair_style command");
+      delta1 = utils::numeric(FLERR,arg[iarg+1],false,Pointers::lmp);
+      delta2 = utils::numeric(FLERR,arg[iarg+2],false,Pointers::lmp);
+      iarg += 3;
+      if ((delta1 < 0.0) || (delta1 > 1.0) || (delta2 < 0.0) || (delta2 > 1.0) || (delta1 > delta2))
+        error->all(FLERR,"Illegal values for maxdelcs keyword");
+    } else error->all(FLERR,"Illegal pair_style command");
+  }
+  PairSWOMP::settings(narg-iarg,arg+iarg);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSWMODOMP::threebody(Param *paramij, Param *paramik, Param *paramijk,
+                            double rsq1, double rsq2,
+                            double *delr1, double *delr2,
+                            double *fj, double *fk, int eflag, double &eng)
+{
+  double r1,rinvsq1,rainv1,gsrainv1,gsrainvsq1,expgsrainv1;
+  double r2,rinvsq2,rainv2,gsrainv2,gsrainvsq2,expgsrainv2;
+  double rinv12,cs,delcs,delcssq,facexp,facrad,frad1,frad2;
+  double facang,facang12,csfacang,csfac1,csfac2,factor;
+
+  r1 = sqrt(rsq1);
+  rinvsq1 = 1.0/rsq1;
+  rainv1 = 1.0/(r1 - paramij->cut);
+  gsrainv1 = paramij->sigma_gamma * rainv1;
+  gsrainvsq1 = gsrainv1*rainv1/r1;
+  expgsrainv1 = exp(gsrainv1);
+
+  r2 = sqrt(rsq2);
+  rinvsq2 = 1.0/rsq2;
+  rainv2 = 1.0/(r2 - paramik->cut);
+  gsrainv2 = paramik->sigma_gamma * rainv2;
+  gsrainvsq2 = gsrainv2*rainv2/r2;
+  expgsrainv2 = exp(gsrainv2);
+
+  rinv12 = 1.0/(r1*r2);
+  cs = (delr1[0]*delr2[0] + delr1[1]*delr2[1] + delr1[2]*delr2[2]) * rinv12;
+  delcs = cs - paramijk->costheta;
+
+  // Modification to delcs
+  if(fabs(delcs) >= delta2) delcs = 0.0;
+  else if(fabs(delcs) < delta2 && fabs(delcs) > delta1) {
+    factor = 0.5 + 0.5*cos(MY_PI*(fabs(delcs) - delta1)/(delta2 - delta1));
+    delcs *= factor;
+  }
+  delcssq = delcs*delcs;
+
+  facexp = expgsrainv1*expgsrainv2;
+
+  // facrad = sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) *
+  //          facexp*delcssq;
+
+  facrad = paramijk->lambda_epsilon * facexp*delcssq;
+  frad1 = facrad*gsrainvsq1;
+  frad2 = facrad*gsrainvsq2;
+  facang = paramijk->lambda_epsilon2 * facexp*delcs;
+  facang12 = rinv12*facang;
+  csfacang = cs*facang;
+  csfac1 = rinvsq1*csfacang;
+
+  fj[0] = delr1[0]*(frad1+csfac1)-delr2[0]*facang12;
+  fj[1] = delr1[1]*(frad1+csfac1)-delr2[1]*facang12;
+  fj[2] = delr1[2]*(frad1+csfac1)-delr2[2]*facang12;
+
+  csfac2 = rinvsq2*csfacang;
+
+  fk[0] = delr2[0]*(frad2+csfac2)-delr1[0]*facang12;
+  fk[1] = delr2[1]*(frad2+csfac2)-delr1[1]*facang12;
+  fk[2] = delr2[2]*(frad2+csfac2)-delr1[2]*facang12;
+
+  if (eflag) eng = facrad;
+}
diff --git a/src/OPENMP/pair_sw_mod_omp.h b/src/OPENMP/pair_sw_mod_omp.h
new file mode 100644
index 0000000000..7e69ca283b
--- /dev/null
+++ b/src/OPENMP/pair_sw_mod_omp.h
@@ -0,0 +1,49 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(sw/mod/omp,PairSWMODOMP);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_SW_MOD_OMP_H
+#define LMP_PAIR_SW_MOD_OMP_H
+
+#include "pair_sw_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairSWMODOMP : public PairSWOMP {
+
+ public:
+  PairSWMODOMP(class LAMMPS *);
+  virtual ~PairSWMODOMP() {}
+
+ protected:
+  double delta1;
+  double delta2;
+
+  void settings(int, char **);
+  void threebody(Param *, Param *, Param *, double, double, double *, double *, double *, double *,
+                 int, double &);
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
diff --git a/src/PHONON/Install.sh b/src/PHONON/Install.sh
index 4b64fed114..b465f3b1f7 100755
--- a/src/PHONON/Install.sh
+++ b/src/PHONON/Install.sh
@@ -26,16 +26,6 @@ action () {
   fi
 }
 
-# PHONON uses the parallel FFT wrapper used in PPPM,
-# so we must require the KSPACE package to be installed.
-
-if (test $1 = 1) then
-  if (test ! -e ../fft3d_wrap.h) then
-    echo "Must install KSPACE package with PHONON"
-    exit 1
-  fi
-fi
-
 # list of files with optional dependcies
 
 action fix_phonon.cpp fft3d_wrap.h
diff --git a/src/PLUMED/fix_plumed.cpp b/src/PLUMED/fix_plumed.cpp
index 137cd580a8..dcf82a4374 100644
--- a/src/PLUMED/fix_plumed.cpp
+++ b/src/PLUMED/fix_plumed.cpp
@@ -77,9 +77,9 @@ FixPlumed::FixPlumed(LAMMPS *lmp, int narg, char **arg) :
 
   int api_version=0;
   p->cmd("getApiVersion",&api_version);
-  if ((api_version < 5) || (api_version > 8))
+  if ((api_version < 5) || (api_version > 9))
     error->all(FLERR,"Incompatible API version for PLUMED in fix plumed. "
-               "Only Plumed 2.4.x, 2.5.x, 2.6.x, 2.7.x are tested and supported.");
+               "Only Plumed 2.4.x, 2.5.x, 2.6.x, 2.7.x, 2.8.x are tested and supported.");
 
 #if !defined(MPI_STUBS)
   // If the -partition option is activated then enable
diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp
index a839469d35..4f44cc7c64 100644
--- a/src/REAXFF/fix_reaxff_species.cpp
+++ b/src/REAXFF/fix_reaxff_species.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -63,9 +62,9 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
   MPI_Comm_size(world,&nprocs);
   ntypes = atom->ntypes;
 
-  nevery = atoi(arg[3]);
-  nrepeat = atoi(arg[4]);
-  global_freq = nfreq = atoi(arg[5]);
+  nevery = utils::inumeric(FLERR,arg[3],false,lmp);
+  nrepeat = utils::inumeric(FLERR,arg[4],false,lmp);
+  global_freq = nfreq = utils::inumeric(FLERR,arg[5],false,lmp);
 
   comm_forward = 4;
 
@@ -155,9 +154,9 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
     // set BO cutoff
     if (strcmp(arg[iarg],"cutoff") == 0) {
       if (iarg+4 > narg) error->all(FLERR,"Illegal fix reaxff/species command");
-      itype = atoi(arg[iarg+1]);
-      jtype = atoi(arg[iarg+2]);
-      bo_cut = atof(arg[iarg+3]);
+      itype = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
+      jtype = utils::inumeric(FLERR,arg[iarg+2],false,lmp);
+      bo_cut = utils::numeric(FLERR,arg[iarg+3],false,lmp);
       if (itype > ntypes || jtype > ntypes)
         error->all(FLERR,"Illegal fix reaxff/species command");
       if (itype <= 0 || jtype <= 0)
@@ -187,7 +186,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
     } else if (strcmp(arg[iarg],"position") == 0) {
       if (iarg+3 > narg) error->all(FLERR,"Illegal fix reaxff/species command");
       posflag = 1;
-      posfreq = atoi(arg[iarg+1]);
+      posfreq = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
       if (posfreq < nfreq || (posfreq%nfreq != 0))
         error->all(FLERR,"Illegal fix reaxff/species command");
 
@@ -247,8 +246,8 @@ FixReaxFFSpecies::~FixReaxFFSpecies()
     if (posflag && multipos_opened) fclose(pos);
   }
 
-  modify->delete_compute("SPECATOM");
-  modify->delete_fix("SPECBOND");
+  modify->delete_compute(fmt::format("SPECATOM_{}",id));
+  modify->delete_fix(fmt::format("SPECBOND_{}",id));
 }
 
 /* ---------------------------------------------------------------------- */
@@ -288,22 +287,16 @@ void FixReaxFFSpecies::init()
   if (nvalid != update->ntimestep)
     nvalid = update->ntimestep+nfreq;
 
-  // check if this fix has been called twice
-  int count = 0;
-  for (int i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"reaxff/species") == 0) count++;
-  if (count > 1 && comm->me == 0)
-    error->warning(FLERR,"More than one fix reaxff/species");
-
   if (!setupflag) {
     // create a compute to store properties
-    modify->add_compute("SPECATOM all SPEC/ATOM q x y z vx vy vz abo01 abo02 abo03 abo04 "
-                        "abo05 abo06 abo07 abo08 abo09 abo10 abo11 abo12 abo13 abo14 "
-                        "abo15 abo16 abo17 abo18 abo19 abo20 abo21 abo22 abo23 abo24");
+    modify->add_compute(fmt::format("SPECATOM_{} all SPEC/ATOM q x y z vx vy vz abo01 abo02 "
+                                    "abo03 abo04 abo05 abo06 abo07 abo08 abo09 abo10 abo11 "
+                                    "abo12 abo13 abo14 abo15 abo16 abo17 abo18 abo19 abo20 "
+                                    "abo21 abo22 abo23 abo24",id));
 
     // create a fix to point to fix_ave_atom for averaging stored properties
-    auto fixcmd = fmt::format("SPECBOND all ave/atom {} {} {}",nevery,nrepeat,nfreq);
-    for (int i = 1; i < 32; ++i) fixcmd += " c_SPECATOM[" + std::to_string(i) + "]";
+    auto fixcmd = fmt::format("SPECBOND_{} all ave/atom {} {} {}",id,nevery,nrepeat,nfreq);
+    for (int i = 1; i < 32; ++i) fixcmd += fmt::format(" c_SPECATOM_{}[{}]",id,i);
     f_SPECBOND = (FixAveAtom *) modify->add_fix(fixcmd);
     setupflag = 1;
   }
diff --git a/src/REAXFF/reaxff_ffield.cpp b/src/REAXFF/reaxff_ffield.cpp
index 34db1c232d..ac22609317 100644
--- a/src/REAXFF/reaxff_ffield.cpp
+++ b/src/REAXFF/reaxff_ffield.cpp
@@ -583,6 +583,7 @@ namespace ReaxFF {
       } catch (std::exception &e) {
         error->one(FLERR,e.what());
       }
+      fclose(fp);
     }
 
     // broadcast global parameters and allocate list on ranks != 0
diff --git a/src/RIGID/fix_rigid.cpp b/src/RIGID/fix_rigid.cpp
index 9c39518508..41843fecf0 100644
--- a/src/RIGID/fix_rigid.cpp
+++ b/src/RIGID/fix_rigid.cpp
@@ -68,6 +68,7 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) :
   create_attribute = 1;
   dof_flag = 1;
   enforce2d_flag = 1;
+  centroidstressflag = CENTROID_NOTAVAIL;
 
   MPI_Comm_rank(world,&me);
   MPI_Comm_size(world,&nprocs);
diff --git a/src/RIGID/fix_rigid_small.cpp b/src/RIGID/fix_rigid_small.cpp
index 5db24a96d5..14742155db 100644
--- a/src/RIGID/fix_rigid_small.cpp
+++ b/src/RIGID/fix_rigid_small.cpp
@@ -73,6 +73,7 @@ FixRigidSmall::FixRigidSmall(LAMMPS *lmp, int narg, char **arg) :
   dof_flag = 1;
   enforce2d_flag = 1;
   stores_ids = 1;
+  centroidstressflag = CENTROID_AVAIL;
 
   MPI_Comm_rank(world,&me);
   MPI_Comm_size(world,&nprocs);
@@ -785,7 +786,7 @@ void FixRigidSmall::initial_integrate(int vflag)
   // forward communicate updated info of all bodies
 
   commflag = INITIAL;
-  comm->forward_comm_fix(this,26);
+  comm->forward_comm_fix(this,29);
 
   // set coords/orient and velocity/rotation of atoms in rigid bodies
 
@@ -879,6 +880,7 @@ void FixRigidSmall::enforce2d()
     b->xcm[2] = 0.0;
     b->vcm[2] = 0.0;
     b->fcm[2] = 0.0;
+    b->xgc[2] = 0.0;
     b->torque[0] = 0.0;
     b->torque[1] = 0.0;
     b->angmom[0] = 0.0;
@@ -1349,10 +1351,22 @@ void FixRigidSmall::set_xv()
       vr[4] = 0.5*x0*fc2;
       vr[5] = 0.5*x1*fc2;
 
-      v_tally(1,&i,1.0,vr);
+      double rlist[][3] = {x0, x1, x2};
+      double flist[][3] = {0.5*fc0, 0.5*fc1, 0.5*fc2};
+      v_tally(1,&i,1.0,vr,rlist,flist,b->xgc);
     }
   }
 
+  // update the position of geometric center
+  for (int ibody = 0; ibody < nlocal_body + nghost_body; ibody++) {
+    Body *b = &body[ibody];
+    MathExtra::matvec(b->ex_space,b->ey_space,b->ez_space,
+                      b->xgc_body,b->xgc);
+    b->xgc[0] += b->xcm[0];
+    b->xgc[1] += b->xcm[1];
+    b->xgc[2] += b->xcm[2];
+  }
+
   // set orientation, omega, angmom of each extended particle
 
   if (extended) {
@@ -1499,7 +1513,9 @@ void FixRigidSmall::set_v()
       vr[4] = 0.5*x0*fc2;
       vr[5] = 0.5*x1*fc2;
 
-      v_tally(1,&i,1.0,vr);
+      double rlist[][3] = {x0, x1, x2};
+      double flist[][3] = {0.5*fc0, 0.5*fc1, 0.5*fc2};
+      v_tally(1,&i,1.0,vr,rlist,flist,b->xgc);
     }
   }
 
@@ -1905,11 +1921,15 @@ void FixRigidSmall::setup_bodies_static()
   double **x = atom->x;
 
   double *xcm;
+  double *xgc;
 
   for (ibody = 0; ibody < nlocal_body+nghost_body; ibody++) {
     xcm = body[ibody].xcm;
+    xgc = body[ibody].xgc;
     xcm[0] = xcm[1] = xcm[2] = 0.0;
+    xgc[0] = xgc[1] = xgc[2] = 0.0;
     body[ibody].mass = 0.0;
+    body[ibody].natoms = 0;
   }
 
   double unwrap[3];
@@ -1924,22 +1944,31 @@ void FixRigidSmall::setup_bodies_static()
 
     domain->unmap(x[i],xcmimage[i],unwrap);
     xcm = b->xcm;
+    xgc = b->xgc;
     xcm[0] += unwrap[0] * massone;
     xcm[1] += unwrap[1] * massone;
     xcm[2] += unwrap[2] * massone;
+    xgc[0] += unwrap[0];
+    xgc[1] += unwrap[1];
+    xgc[2] += unwrap[2];
     b->mass += massone;
+    b->natoms++;
   }
 
   // reverse communicate xcm, mass of all bodies
 
   commflag = XCM_MASS;
-  comm->reverse_comm_fix(this,4);
+  comm->reverse_comm_fix(this,8);
 
   for (ibody = 0; ibody < nlocal_body; ibody++) {
     xcm = body[ibody].xcm;
+    xgc = body[ibody].xgc;
     xcm[0] /= body[ibody].mass;
     xcm[1] /= body[ibody].mass;
     xcm[2] /= body[ibody].mass;
+    xgc[0] /= body[ibody].natoms;
+    xgc[1] /= body[ibody].natoms;
+    xgc[2] /= body[ibody].natoms;
   }
 
   // set vcm, angmom = 0.0 in case inpfile is used
@@ -2124,12 +2153,22 @@ void FixRigidSmall::setup_bodies_static()
     // create initial quaternion
 
     MathExtra::exyz_to_q(ex,ey,ez,body[ibody].quat);
+
+    // convert geometric center position to principal axis coordinates
+    // xcm is wrapped, but xgc is not initially
+    xcm = body[ibody].xcm;
+    xgc = body[ibody].xgc;
+    double delta[3];
+    MathExtra::sub3(xgc,xcm,delta);
+    domain->minimum_image(delta);
+    MathExtra::transpose_matvec(ex,ey,ez,delta,body[ibody].xgc_body);
+    MathExtra::add3(xcm,delta,xgc);
   }
 
   // forward communicate updated info of all bodies
 
   commflag = INITIAL;
-  comm->forward_comm_fix(this,26);
+  comm->forward_comm_fix(this,29);
 
   // displace = initial atom coords in basis of principal axes
   // set displace = 0.0 for atoms not in any rigid body
@@ -2807,6 +2846,10 @@ void FixRigidSmall::set_molecule(int nlocalprev, tagint tagprev, int imol,
       if (nlocal_body == nmax_body) grow_body();
       Body *b = &body[nlocal_body];
       b->mass = onemols[imol]->masstotal;
+      b->natoms = onemols[imol]->natoms;
+      b->xgc[0] = xgeom[0];
+      b->xgc[1] = xgeom[1];
+      b->xgc[2] = xgeom[2];
 
       // new COM = Q (onemols[imol]->xcm - onemols[imol]->center) + xgeom
       // Q = rotation matrix associated with quat
@@ -2829,6 +2872,12 @@ void FixRigidSmall::set_molecule(int nlocalprev, tagint tagprev, int imol,
       MathExtra::quatquat(quat,onemols[imol]->quat,b->quat);
       MathExtra::q_to_exyz(b->quat,b->ex_space,b->ey_space,b->ez_space);
 
+      MathExtra::transpose_matvec(b->ex_space,b->ey_space,b->ez_space,
+                                  ctr2com_rotate,b->xgc_body);
+      b->xgc_body[0] *= -1;
+      b->xgc_body[1] *= -1;
+      b->xgc_body[2] *= -1;
+
       b->angmom[0] = b->angmom[1] = b->angmom[2] = 0.0;
       b->omega[0] = b->omega[1] = b->omega[2] = 0.0;
       b->conjqm[0] = b->conjqm[1] = b->conjqm[2] = b->conjqm[3] = 0.0;
@@ -2961,7 +3010,7 @@ int FixRigidSmall::pack_forward_comm(int n, int *list, double *buf,
                                      int /*pbc_flag*/, int * /*pbc*/)
 {
   int i,j;
-  double *xcm,*vcm,*quat,*omega,*ex_space,*ey_space,*ez_space,*conjqm;
+  double *xcm,*xgc,*vcm,*quat,*omega,*ex_space,*ey_space,*ez_space,*conjqm;
 
   int m = 0;
 
@@ -2973,6 +3022,10 @@ int FixRigidSmall::pack_forward_comm(int n, int *list, double *buf,
       buf[m++] = xcm[0];
       buf[m++] = xcm[1];
       buf[m++] = xcm[2];
+      xgc = body[bodyown[j]].xgc;
+      buf[m++] = xgc[0];
+      buf[m++] = xgc[1];
+      buf[m++] = xgc[2];
       vcm = body[bodyown[j]].vcm;
       buf[m++] = vcm[0];
       buf[m++] = vcm[1];
@@ -3048,7 +3101,7 @@ int FixRigidSmall::pack_forward_comm(int n, int *list, double *buf,
 void FixRigidSmall::unpack_forward_comm(int n, int first, double *buf)
 {
   int i,j,last;
-  double *xcm,*vcm,*quat,*omega,*ex_space,*ey_space,*ez_space,*conjqm;
+  double *xcm,*xgc,*vcm,*quat,*omega,*ex_space,*ey_space,*ez_space,*conjqm;
 
   int m = 0;
   last = first + n;
@@ -3060,6 +3113,10 @@ void FixRigidSmall::unpack_forward_comm(int n, int first, double *buf)
       xcm[0] = buf[m++];
       xcm[1] = buf[m++];
       xcm[2] = buf[m++];
+      xgc = body[bodyown[i]].xgc;
+      xgc[0] = buf[m++];
+      xgc[1] = buf[m++];
+      xgc[2] = buf[m++];
       vcm = body[bodyown[i]].vcm;
       vcm[0] = buf[m++];
       vcm[1] = buf[m++];
@@ -3135,7 +3192,7 @@ void FixRigidSmall::unpack_forward_comm(int n, int first, double *buf)
 int FixRigidSmall::pack_reverse_comm(int n, int first, double *buf)
 {
   int i,j,m,last;
-  double *fcm,*torque,*vcm,*angmom,*xcm;
+  double *fcm,*torque,*vcm,*angmom,*xcm, *xgc;
 
   m = 0;
   last = first + n;
@@ -3170,10 +3227,15 @@ int FixRigidSmall::pack_reverse_comm(int n, int first, double *buf)
     for (i = first; i < last; i++) {
       if (bodyown[i] < 0) continue;
       xcm = body[bodyown[i]].xcm;
+      xgc = body[bodyown[i]].xgc;
       buf[m++] = xcm[0];
       buf[m++] = xcm[1];
       buf[m++] = xcm[2];
+      buf[m++] = xgc[0];
+      buf[m++] = xgc[1];
+      buf[m++] = xgc[2];
       buf[m++] = body[bodyown[i]].mass;
+      buf[m++] = static_cast<double>(body[bodyown[i]].natoms);
     }
 
   } else if (commflag == ITENSOR) {
@@ -3208,7 +3270,7 @@ int FixRigidSmall::pack_reverse_comm(int n, int first, double *buf)
 void FixRigidSmall::unpack_reverse_comm(int n, int *list, double *buf)
 {
   int i,j,k;
-  double *fcm,*torque,*vcm,*angmom,*xcm;
+  double *fcm,*torque,*vcm,*angmom,*xcm, *xgc;
 
   int m = 0;
 
@@ -3245,10 +3307,15 @@ void FixRigidSmall::unpack_reverse_comm(int n, int *list, double *buf)
       j = list[i];
       if (bodyown[j] < 0) continue;
       xcm = body[bodyown[j]].xcm;
+      xgc = body[bodyown[j]].xgc;
       xcm[0] += buf[m++];
       xcm[1] += buf[m++];
       xcm[2] += buf[m++];
+      xgc[0] += buf[m++];
+      xgc[1] += buf[m++];
+      xgc[2] += buf[m++];
       body[bodyown[j]].mass += buf[m++];
+      body[bodyown[j]].natoms += static_cast<int>(buf[m++]);
     }
 
   } else if (commflag == ITENSOR) {
diff --git a/src/RIGID/fix_rigid_small.h b/src/RIGID/fix_rigid_small.h
index 60a4dd1161..e289c179d9 100644
--- a/src/RIGID/fix_rigid_small.h
+++ b/src/RIGID/fix_rigid_small.h
@@ -85,7 +85,9 @@ class FixRigidSmall : public Fix {
 
   struct Body {
     double mass;           // total mass of body
+    int natoms;            // total number of atoms in body
     double xcm[3];         // COM position
+    double xgc[3];         // geometric center position
     double vcm[3];         // COM velocity
     double fcm[3];         // force on COM
     double torque[3];      // torque around COM
@@ -94,6 +96,7 @@ class FixRigidSmall : public Fix {
     double ex_space[3];    // principal axes in space coords
     double ey_space[3];
     double ez_space[3];
+    double xgc_body[3];    // geometric center relative to xcm in body coords
     double angmom[3];    // space-frame angular momentum of body
     double omega[3];     // space-frame omega of body
     double conjqm[4];    // conjugate quaternion momentum
diff --git a/src/RIGID/fix_shake.cpp b/src/RIGID/fix_shake.cpp
index 29739b294c..0904312a75 100644
--- a/src/RIGID/fix_shake.cpp
+++ b/src/RIGID/fix_shake.cpp
@@ -66,6 +66,7 @@ FixShake::FixShake(LAMMPS *lmp, int narg, char **arg) :
   create_attribute = 1;
   dof_flag = 1;
   stores_ids = 1;
+  centroidstressflag = CENTROID_AVAIL;
 
   // error check
 
@@ -1764,7 +1765,10 @@ void FixShake::shake(int m)
     v[4] = lamda*r01[0]*r01[2];
     v[5] = lamda*r01[1]*r01[2];
 
-    v_tally(nlist,list,2.0,v);
+    double fpairlist[] = {lamda};
+    double dellist[][3]  = {{r01[0], r01[1], r01[2]}};
+    int pairlist[][2] = {{i0,i1}};
+    v_tally(nlist,list,2.0,v,nlocal,1,pairlist,fpairlist,dellist);
   }
 }
 
@@ -1937,7 +1941,11 @@ void FixShake::shake3(int m)
     v[4] = lamda01*r01[0]*r01[2] + lamda02*r02[0]*r02[2];
     v[5] = lamda01*r01[1]*r01[2] + lamda02*r02[1]*r02[2];
 
-    v_tally(nlist,list,3.0,v);
+    double fpairlist[] = {lamda01, lamda02};
+    double dellist[][3]  = {{r01[0], r01[1], r01[2]},
+                            {r02[0], r02[1], r02[2]}};
+    int pairlist[][2] = {{i0,i1}, {i0,i2}};
+    v_tally(nlist,list,3.0,v,nlocal,2,pairlist,fpairlist,dellist);
   }
 }
 
@@ -2189,7 +2197,12 @@ void FixShake::shake4(int m)
     v[4] = lamda01*r01[0]*r01[2]+lamda02*r02[0]*r02[2]+lamda03*r03[0]*r03[2];
     v[5] = lamda01*r01[1]*r01[2]+lamda02*r02[1]*r02[2]+lamda03*r03[1]*r03[2];
 
-    v_tally(nlist,list,4.0,v);
+    double fpairlist[] = {lamda01, lamda02, lamda03};
+    double dellist[][3]  = {{r01[0], r01[1], r01[2]},
+                            {r02[0], r02[1], r02[2]},
+                            {r03[0], r03[1], r03[2]}};
+    int pairlist[][2] = {{i0,i1}, {i0,i2}, {i0,i3}};
+    v_tally(nlist,list,4.0,v,nlocal,3,pairlist,fpairlist,dellist);
   }
 }
 
@@ -2432,7 +2445,12 @@ void FixShake::shake3angle(int m)
     v[4] = lamda01*r01[0]*r01[2]+lamda02*r02[0]*r02[2]+lamda12*r12[0]*r12[2];
     v[5] = lamda01*r01[1]*r01[2]+lamda02*r02[1]*r02[2]+lamda12*r12[1]*r12[2];
 
-    v_tally(nlist,list,3.0,v);
+    double fpairlist[] = {lamda01, lamda02, lamda12};
+    double dellist[][3]  = {{r01[0], r01[1], r01[2]},
+                            {r02[0], r02[1], r02[2]},
+                            {r12[0], r12[1], r12[2]}};
+    int pairlist[][2] = {{i0,i1}, {i0,i2}, {i1,i2}};
+    v_tally(nlist,list,3.0,v,nlocal,3,pairlist,fpairlist,dellist);
   }
 }
 
diff --git a/src/atom.cpp b/src/atom.cpp
index 1c18b5a109..79bf5de23a 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -29,6 +29,7 @@
 #include "modify.h"
 #include "molecule.h"
 #include "neighbor.h"
+#include "tokenizer.h"
 #include "update.h"
 #include "variable.h"
 
@@ -889,10 +890,10 @@ void Atom::tag_check()
   MPI_Allreduce(&min,&minall,1,MPI_LMP_TAGINT,MPI_MIN,world);
   MPI_Allreduce(&max,&maxall,1,MPI_LMP_TAGINT,MPI_MAX,world);
 
-  if (minall < 0) error->all(FLERR,"One or more Atom IDs is negative");
-  if (maxall >= MAXTAGINT) error->all(FLERR,"One or more atom IDs is too big");
+  if (minall < 0) error->all(FLERR,"One or more Atom IDs are negative");
+  if (maxall >= MAXTAGINT) error->all(FLERR,"One or more atom IDs are too big");
   if (maxall > 0 && minall == 0)
-    error->all(FLERR,"One or more atom IDs is zero");
+    error->all(FLERR,"One or more atom IDs are zero");
   if (maxall > 0 && tag_enable == 0)
     error->all(FLERR,"Non-zero atom IDs with atom_modify id = no");
   if (maxall == 0 && natoms && tag_enable)
@@ -1061,6 +1062,7 @@ void Atom::data_atoms(int n, char *buf, tagint id_offset, tagint mol_offset,
   double *coord;
   char *next;
 
+  // use the first line to detect and validate the number of words/tokens per line
   next = strchr(buf,'\n');
   *next = '\0';
   int nwords = utils::trim_and_count_words(buf);
@@ -1069,8 +1071,6 @@ void Atom::data_atoms(int n, char *buf, tagint id_offset, tagint mol_offset,
   if (nwords != avec->size_data_atom && nwords != avec->size_data_atom + 3)
     error->all(FLERR,"Incorrect atom format in data file");
 
-  char **values = new char*[nwords];
-
   // set bounds for my proc
   // if periodic and I am lo/hi proc, adjust bounds by EPSILON
   // insures all data atoms will be owned even with round-off
@@ -1141,15 +1141,10 @@ void Atom::data_atoms(int n, char *buf, tagint id_offset, tagint mol_offset,
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
-
-    for (m = 0; m < nwords; m++) {
-      buf += strspn(buf," \t\n\r\f");
-      buf[strcspn(buf," \t\n\r\f")] = '\0';
-      if (strlen(buf) == 0)
-        error->all(FLERR,"Incorrect atom format in data file");
-      values[m] = buf;
-      buf += strlen(buf)+1;
-    }
+    *next = '\0';
+    auto values = Tokenizer(utils::trim_comment(buf)).as_vector();
+    if (values.size() != nwords)
+      error->all(FLERR, "Incorrect atom format in data file: {}", utils::trim(buf));
 
     int imx = 0, imy = 0, imz = 0;
     if (imageflag) {
@@ -1196,7 +1191,6 @@ void Atom::data_atoms(int n, char *buf, tagint id_offset, tagint mol_offset,
 
     buf = next + 1;
   }
-  delete [] values;
 }
 
 /* ----------------------------------------------------------------------
@@ -1208,7 +1202,6 @@ void Atom::data_atoms(int n, char *buf, tagint id_offset, tagint mol_offset,
 void Atom::data_vels(int n, char *buf, tagint id_offset)
 {
   int j,m;
-  tagint tagdata;
   char *next;
 
   next = strchr(buf,'\n');
@@ -1219,31 +1212,24 @@ void Atom::data_vels(int n, char *buf, tagint id_offset)
   if (nwords != avec->size_data_vel)
     error->all(FLERR,"Incorrect velocity format in data file");
 
-  char **values = new char*[nwords];
-
   // loop over lines of atom velocities
   // tokenize the line into values
   // if I own atom tag, unpack its values
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
+    *next = '\0';
+    auto values = Tokenizer(utils::trim_comment(buf)).as_vector();
+    if (values.size() != nwords)
+      error->all(FLERR, "Incorrect atom format in data file: {}", utils::trim(buf));
 
-    for (j = 0; j < nwords; j++) {
-      buf += strspn(buf," \t\n\r\f");
-      buf[strcspn(buf," \t\n\r\f")] = '\0';
-      values[j] = buf;
-      buf += strlen(buf)+1;
-    }
-
-    tagdata = ATOTAGINT(values[0]) + id_offset;
+    tagint tagdata = utils::tnumeric(FLERR,values[0],false,lmp) + id_offset;
     if (tagdata <= 0 || tagdata > map_tag_max)
       error->one(FLERR,"Invalid atom ID in Velocities section of data file");
-    if ((m = map(tagdata)) >= 0) avec->data_vel(m,&values[1]);
+    if ((m = map(tagdata)) >= 0) avec->data_vel(m,values);
 
     buf = next + 1;
   }
-
-  delete [] values;
 }
 
 /* ----------------------------------------------------------------------
@@ -1256,18 +1242,25 @@ void Atom::data_vels(int n, char *buf, tagint id_offset)
 void Atom::data_bonds(int n, char *buf, int *count, tagint id_offset,
                       int type_offset)
 {
-  int m,tmp,itype,rv;
+  int m,itype;
   tagint atom1,atom2;
   char *next;
   int newton_bond = force->newton_bond;
+  auto location = "Bonds section of data file";
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
     *next = '\0';
-    rv = sscanf(buf,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT,
-                &tmp,&itype,&atom1,&atom2);
-    if (rv != 4)
-      error->one(FLERR,"Incorrect format of Bonds section in data file");
+    try {
+      ValueTokenizer values(utils::trim_comment(buf));
+      values.next_int();
+      itype = values.next_int();
+      atom1 = values.next_tagint();
+      atom2 = values.next_tagint();
+      if (values.has_next()) throw TokenizerException("Too many tokens","");
+    } catch (TokenizerException &e) {
+      error->one(FLERR,"{} in {}: {}", e.what(), location, utils::trim(buf));
+    }
     if (id_offset) {
       atom1 += id_offset;
       atom2 += id_offset;
@@ -1276,9 +1269,9 @@ void Atom::data_bonds(int n, char *buf, int *count, tagint id_offset,
 
     if ((atom1 <= 0) || (atom1 > map_tag_max) ||
         (atom2 <= 0) || (atom2 > map_tag_max) || (atom1 == atom2))
-      error->one(FLERR,"Invalid atom ID in Bonds section of data file");
+      error->one(FLERR,"Invalid atom ID in {}: {}", location, utils::trim(buf));
     if (itype <= 0 || itype > nbondtypes)
-      error->one(FLERR,"Invalid bond type in Bonds section of data file");
+      error->one(FLERR,"Invalid bond type in {}: {}", location, utils::trim(buf));
     if ((m = map(atom1)) >= 0) {
       if (count) count[m]++;
       else {
@@ -1313,18 +1306,26 @@ void Atom::data_bonds(int n, char *buf, int *count, tagint id_offset,
 void Atom::data_angles(int n, char *buf, int *count, tagint id_offset,
                        int type_offset)
 {
-  int m,tmp,itype,rv;
+  int m,itype;
   tagint atom1,atom2,atom3;
   char *next;
   int newton_bond = force->newton_bond;
+  auto location = "Angles section of data file";
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
     *next = '\0';
-    rv = sscanf(buf,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT,
-                &tmp,&itype,&atom1,&atom2,&atom3);
-    if (rv != 5)
-      error->one(FLERR,"Incorrect format of Angles section in data file");
+    try {
+      ValueTokenizer values(utils::trim_comment(buf));
+      values.next_int();
+      itype = values.next_int();
+      atom1 = values.next_tagint();
+      atom2 = values.next_tagint();
+      atom3 = values.next_tagint();
+      if (values.has_next()) throw TokenizerException("Too many tokens","");
+    } catch (TokenizerException &e) {
+      error->one(FLERR,"{} in {}: {}", e.what(), location, utils::trim(buf));
+    }
     if (id_offset) {
       atom1 += id_offset;
       atom2 += id_offset;
@@ -1336,9 +1337,9 @@ void Atom::data_angles(int n, char *buf, int *count, tagint id_offset,
         (atom2 <= 0) || (atom2 > map_tag_max) ||
         (atom3 <= 0) || (atom3 > map_tag_max) ||
         (atom1 == atom2) || (atom1 == atom3) || (atom2 == atom3))
-      error->one(FLERR,"Invalid atom ID in Angles section of data file");
+      error->one(FLERR,"Invalid atom ID in {}: {}", location, utils::trim(buf));
     if (itype <= 0 || itype > nangletypes)
-      error->one(FLERR,"Invalid angle type in Angles section of data file");
+      error->one(FLERR,"Invalid angle type in {}: {}", location, utils::trim(buf));
     if ((m = map(atom2)) >= 0) {
       if (count) count[m]++;
       else {
@@ -1385,19 +1386,27 @@ void Atom::data_angles(int n, char *buf, int *count, tagint id_offset,
 void Atom::data_dihedrals(int n, char *buf, int *count, tagint id_offset,
                           int type_offset)
 {
-  int m,tmp,itype,rv;
+  int m,itype;
   tagint atom1,atom2,atom3,atom4;
   char *next;
   int newton_bond = force->newton_bond;
+  auto location = "Dihedrals section of data file";
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
     *next = '\0';
-    rv = sscanf(buf,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT
-                " " TAGINT_FORMAT " " TAGINT_FORMAT,
-                &tmp,&itype,&atom1,&atom2,&atom3,&atom4);
-    if (rv != 6)
-      error->one(FLERR,"Incorrect format of Dihedrals section in data file");
+    try {
+      ValueTokenizer values(utils::trim_comment(buf));
+      values.next_int();
+      itype = values.next_int();
+      atom1 = values.next_tagint();
+      atom2 = values.next_tagint();
+      atom3 = values.next_tagint();
+      atom4 = values.next_tagint();
+      if (values.has_next()) throw TokenizerException("Too many tokens","");
+    } catch (TokenizerException &e) {
+      error->one(FLERR,"{} in {}: {}", e.what(), location, utils::trim(buf));
+    }
     if (id_offset) {
       atom1 += id_offset;
       atom2 += id_offset;
@@ -1412,10 +1421,9 @@ void Atom::data_dihedrals(int n, char *buf, int *count, tagint id_offset,
         (atom4 <= 0) || (atom4 > map_tag_max) ||
         (atom1 == atom2) || (atom1 == atom3) || (atom1 == atom4) ||
         (atom2 == atom3) || (atom2 == atom4) || (atom3 == atom4))
-      error->one(FLERR,"Invalid atom ID in Dihedrals section of data file");
+      error->one(FLERR, "Invalid atom ID in {}: {}", location, utils::trim(buf));
     if (itype <= 0 || itype > ndihedraltypes)
-      error->one(FLERR,
-                 "Invalid dihedral type in Dihedrals section of data file");
+      error->one(FLERR, "Invalid dihedral type in {}: {}", location, utils::trim(buf));
     if ((m = map(atom2)) >= 0) {
       if (count) count[m]++;
       else {
@@ -1476,19 +1484,27 @@ void Atom::data_dihedrals(int n, char *buf, int *count, tagint id_offset,
 void Atom::data_impropers(int n, char *buf, int *count, tagint id_offset,
                           int type_offset)
 {
-  int m,tmp,itype,rv;
+  int m,itype;
   tagint atom1,atom2,atom3,atom4;
   char *next;
   int newton_bond = force->newton_bond;
+  auto location = "Impropers section of data file";
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
     *next = '\0';
-    rv = sscanf(buf,"%d %d "
-                TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT,
-                &tmp,&itype,&atom1,&atom2,&atom3,&atom4);
-    if (rv != 6)
-      error->one(FLERR,"Incorrect format of Impropers section in data file");
+    try {
+      ValueTokenizer values(utils::trim_comment(buf));
+      values.next_int();
+      itype = values.next_int();
+      atom1 = values.next_tagint();
+      atom2 = values.next_tagint();
+      atom3 = values.next_tagint();
+      atom4 = values.next_tagint();
+      if (values.has_next()) throw TokenizerException("Too many tokens","");
+    } catch (TokenizerException &e) {
+      error->one(FLERR,"{} in {}: {}", e.what(), location, utils::trim(buf));
+    }
     if (id_offset) {
       atom1 += id_offset;
       atom2 += id_offset;
@@ -1503,10 +1519,9 @@ void Atom::data_impropers(int n, char *buf, int *count, tagint id_offset,
         (atom4 <= 0) || (atom4 > map_tag_max) ||
         (atom1 == atom2) || (atom1 == atom3) || (atom1 == atom4) ||
         (atom2 == atom3) || (atom2 == atom4) || (atom3 == atom4))
-      error->one(FLERR,"Invalid atom ID in Impropers section of data file");
+      error->one(FLERR, "Invalid atom ID in {}: {}", location, utils::trim(buf));
     if (itype <= 0 || itype > nimpropertypes)
-      error->one(FLERR,
-                 "Invalid improper type in Impropers section of data file");
+      error->one(FLERR, "Invalid improper type in {}: {}", location, utils::trim(buf));
     if ((m = map(atom2)) >= 0) {
       if (count) count[m]++;
       else {
@@ -1565,7 +1580,7 @@ void Atom::data_impropers(int n, char *buf, int *count, tagint id_offset,
 
 void Atom::data_bonus(int n, char *buf, AtomVec *avec_bonus, tagint id_offset)
 {
-  int j,m,tagdata;
+  int j,m;
   char *next;
 
   next = strchr(buf,'\n');
@@ -1576,35 +1591,28 @@ void Atom::data_bonus(int n, char *buf, AtomVec *avec_bonus, tagint id_offset)
   if (nwords != avec_bonus->size_data_bonus)
     error->all(FLERR,"Incorrect bonus data format in data file");
 
-  char **values = new char*[nwords];
-
   // loop over lines of bonus atom data
   // tokenize the line into values
   // if I own atom tag, unpack its values
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
+    *next = '\0';
+    auto values = Tokenizer(utils::trim_comment(buf)).as_vector();
+    if (values.size() != nwords)
+      error->all(FLERR, "Incorrect atom format in data file: {}", utils::trim(buf));
 
-    for (j = 0; j < nwords; j++) {
-      buf += strspn(buf," \t\n\r\f");
-      buf[strcspn(buf," \t\n\r\f")] = '\0';
-      values[j] = buf;
-      buf += strlen(buf)+1;
-    }
-
-    tagdata = ATOTAGINT(values[0]) + id_offset;
+    tagint tagdata = utils::tnumeric(FLERR,values[0],false,lmp) + id_offset;
     if (tagdata <= 0 || tagdata > map_tag_max)
       error->one(FLERR,"Invalid atom ID in Bonus section of data file");
 
     // ok to call child's data_atom_bonus() method thru parent avec_bonus,
     // since data_bonus() was called with child ptr, and method is virtual
 
-    if ((m = map(tagdata)) >= 0) avec_bonus->data_atom_bonus(m,&values[1]);
+    if ((m = map(tagdata)) >= 0) avec_bonus->data_atom_bonus(m,values);
 
     buf = next + 1;
   }
-
-  delete [] values;
 }
 
 /* ----------------------------------------------------------------------
@@ -1616,12 +1624,8 @@ void Atom::data_bonus(int n, char *buf, AtomVec *avec_bonus, tagint id_offset)
 
 void Atom::data_bodies(int n, char *buf, AtomVec *avec_body, tagint id_offset)
 {
-  int j,m,nvalues,tagdata,ninteger,ndouble;
-
-  int maxint = 0;
-  int maxdouble = 0;
-  int *ivalues = nullptr;
-  double *dvalues = nullptr;
+  std::vector<int> ivalues;
+  std::vector<double> dvalues;
 
   if (!unique_tags) unique_tags = new std::set<tagint>;
 
@@ -1630,69 +1634,51 @@ void Atom::data_bodies(int n, char *buf, AtomVec *avec_body, tagint id_offset)
   // else skip values
 
   for (int i = 0; i < n; i++) {
-    buf += strspn(buf," \t\n\r\f");
-    buf[strcspn(buf," \t\n\r\f")] = '\0';
-    tagdata = utils::tnumeric(FLERR,buf,false,lmp) + id_offset;
-    buf += strlen(buf)+1;
+    char *next = strchr(buf,'\n');
+    *next = '\0';
 
-    if (tagdata <= 0 || tagdata > map_tag_max)
-      error->one(FLERR,"Invalid atom ID in Bodies section of data file");
+    auto values = Tokenizer(utils::trim_comment(buf)).as_vector();
+    tagint tagdata = utils::tnumeric(FLERR,values[0],false,lmp) + id_offset;
+    int ninteger = utils::inumeric(FLERR,values[1],false,lmp);
+    int ndouble = utils::inumeric(FLERR,values[2],false,lmp);
 
     if (unique_tags->find(tagdata) == unique_tags->end())
       unique_tags->insert(tagdata);
     else
       error->one(FLERR,"Duplicate atom ID in Bodies section of data file");
 
-    buf += strspn(buf," \t\n\r\f");
-    buf[strcspn(buf," \t\n\r\f")] = '\0';
-    ninteger = utils::inumeric(FLERR,buf,false,lmp);
-    buf += strlen(buf)+1;
+    buf = next + 1;
+    int m = map(tagdata);
+    if (m >= 0) {
+      ivalues.resize(ninteger);
+      dvalues.resize(ndouble);
 
-    buf += strspn(buf," \t\n\r\f");
-    buf[strcspn(buf," \t\n\r\f")] = '\0';
-    ndouble = utils::inumeric(FLERR,buf,false,lmp);
-    buf += strlen(buf)+1;
-
-    if ((m = map(tagdata)) >= 0) {
-      if (ninteger > maxint) {
-        delete [] ivalues;
-        maxint = ninteger;
-        ivalues = new int[maxint];
-      }
-      if (ndouble > maxdouble) {
-        delete [] dvalues;
-        maxdouble = ndouble;
-        dvalues = new double[maxdouble];
-      }
-
-      for (j = 0; j < ninteger; j++) {
+      for (int j = 0; j < ninteger; j++) {
         buf += strspn(buf," \t\n\r\f");
         buf[strcspn(buf," \t\n\r\f")] = '\0';
         ivalues[j] = utils::inumeric(FLERR,buf,false,lmp);
         buf += strlen(buf)+1;
       }
 
-      for (j = 0; j < ndouble; j++) {
+      for (int j = 0; j < ndouble; j++) {
         buf += strspn(buf," \t\n\r\f");
         buf[strcspn(buf," \t\n\r\f")] = '\0';
         dvalues[j] = utils::numeric(FLERR,buf,false,lmp);
         buf += strlen(buf)+1;
       }
 
-      avec_body->data_body(m,ninteger,ndouble,ivalues,dvalues);
+      avec_body->data_body(m,ninteger,ndouble,ivalues.data(),dvalues.data());
 
     } else {
-      nvalues = ninteger + ndouble;    // number of values to skip
-      for (j = 0; j < nvalues; j++) {
+      int nvalues = ninteger + ndouble;    // number of values to skip
+      for (int j = 0; j < nvalues; j++) {
         buf += strspn(buf," \t\n\r\f");
         buf[strcspn(buf," \t\n\r\f")] = '\0';
         buf += strlen(buf)+1;
       }
     }
+    buf += strspn(buf," \t\n\r\f");
   }
-
-  delete [] ivalues;
-  delete [] dvalues;
 }
 
 /* ----------------------------------------------------------------------
@@ -1747,17 +1733,20 @@ void Atom::set_mass(const char *file, int line, const char *str, int type_offset
 
   int itype;
   double mass_one;
-  int n = sscanf(str,"%d %lg",&itype,&mass_one);
-  if (n != 2) error->all(file,line,"Invalid mass line in data file");
-  itype += type_offset;
+  try {
+    ValueTokenizer values(utils::trim_comment(str));
+    itype = values.next_int() + type_offset;
+    mass_one = values.next_double();
+    if (values.has_next()) throw TokenizerException("Too many tokens", "");
 
-  if (itype < 1 || itype > ntypes)
-    error->all(file,line,"Invalid type for mass set");
+    if (itype < 1 || itype > ntypes) throw TokenizerException("Invalid atom type", "");
+    if (mass_one <= 0.0) throw TokenizerException("Invalid mass value", "");
+  } catch (TokenizerException &e) {
+    error->all(file,line,"{} in Masses section of data file: {}", e.what(), utils::trim(str));
+  }
 
   mass[itype] = mass_one;
   mass_setflag[itype] = 1;
-
-  if (mass[itype] <= 0.0) error->all(file,line,"Invalid mass value");
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp
index 5ebaf41ebb..aa646ecabb 100644
--- a/src/atom_vec.cpp
+++ b/src/atom_vec.cpp
@@ -1707,7 +1707,7 @@ void AtomVec::create_atom(int itype, double *coord)
    initialize other peratom quantities
 ------------------------------------------------------------------------- */
 
-void AtomVec::data_atom(double *coord, imageint imagetmp, char **values)
+void AtomVec::data_atom(double *coord, imageint imagetmp, const std::vector<std::string> &values)
 {
   int m,n,datatype,cols;
   void *pdata;
@@ -1890,18 +1890,18 @@ void AtomVec::write_data(FILE *fp, int n, double **buf)
    unpack one line from Velocities section of data file
 ------------------------------------------------------------------------- */
 
-void AtomVec::data_vel(int ilocal, char **values)
+void AtomVec::data_vel(int ilocal, const std::vector<std::string> &values)
 {
   int m,n,datatype,cols;
   void *pdata;
 
   double **v = atom->v;
-  v[ilocal][0] = utils::numeric(FLERR,values[0],true,lmp);
-  v[ilocal][1] = utils::numeric(FLERR,values[1],true,lmp);
-  v[ilocal][2] = utils::numeric(FLERR,values[2],true,lmp);
+  int ivalue = 1;
+  v[ilocal][0] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  v[ilocal][1] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  v[ilocal][2] = utils::numeric(FLERR,values[ivalue++],true,lmp);
 
   if (ndata_vel > 2) {
-    int ivalue = 3;
     for (n = 2; n < ndata_vel; n++) {
       pdata = mdata_vel.pdata[n];
       datatype = mdata_vel.datatype[n];
diff --git a/src/atom_vec.h b/src/atom_vec.h
index 0a86d1122c..6bc7f23532 100644
--- a/src/atom_vec.h
+++ b/src/atom_vec.h
@@ -124,9 +124,9 @@ class AtomVec : protected Pointers {
   virtual void create_atom(int, double *);
   virtual void create_atom_post(int) {}
 
-  virtual void data_atom(double *, imageint, char **);
+  virtual void data_atom(double *, imageint, const std::vector<std::string> &);
   virtual void data_atom_post(int) {}
-  virtual void data_atom_bonus(int, char **) {}
+  virtual void data_atom_bonus(int, const std::vector<std::string> &) {}
   virtual void data_body(int, int, int, int *, double *) {}
 
   virtual void data_bonds_post(int, int, tagint, tagint, tagint) {}
@@ -136,7 +136,7 @@ class AtomVec : protected Pointers {
   virtual void pack_data_pre(int) {}
   virtual void pack_data_post(int) {}
 
-  virtual void data_vel(int, char **);
+  virtual void data_vel(int, const std::vector<std::string> &);
   virtual void pack_vel(double **);
   virtual void write_vel(FILE *, int, double **);
 
@@ -166,8 +166,8 @@ class AtomVec : protected Pointers {
   virtual int unpack_reverse_hybrid(int, int *, double *) { return 0; }
   virtual int pack_border_hybrid(int, int *, double *) { return 0; }
   virtual int unpack_border_hybrid(int, int, double *) { return 0; }
-  virtual int data_atom_hybrid(int, char **) { return 0; }
-  virtual int data_vel_hybrid(int, char **) { return 0; }
+  virtual int data_atom_hybrid(int, const std::vector<std::string> &, int) { return 0; }
+  virtual int data_vel_hybrid(int, const std::vector<std::string> &, int) { return 0; }
   virtual int pack_data_hybrid(int, double *) { return 0; }
   virtual int write_data_hybrid(FILE *, double *) { return 0; }
   virtual int pack_vel_hybrid(int, double *) { return 0; }
diff --git a/src/atom_vec_ellipsoid.cpp b/src/atom_vec_ellipsoid.cpp
index 38c4893f61..6c39e4cfaf 100644
--- a/src/atom_vec_ellipsoid.cpp
+++ b/src/atom_vec_ellipsoid.cpp
@@ -381,7 +381,7 @@ int AtomVecEllipsoid::unpack_restart_bonus(int ilocal, double *buf)
    unpack one line from Ellipsoids section of data file
 ------------------------------------------------------------------------- */
 
-void AtomVecEllipsoid::data_atom_bonus(int m, char **values)
+void AtomVecEllipsoid::data_atom_bonus(int m, const std::vector<std::string> & values)
 {
   if (ellipsoid[m])
     error->one(FLERR,"Assigning ellipsoid parameters to non-ellipsoid atom");
@@ -389,17 +389,18 @@ void AtomVecEllipsoid::data_atom_bonus(int m, char **values)
   if (nlocal_bonus == nmax_bonus) grow_bonus();
 
   double *shape = bonus[nlocal_bonus].shape;
-  shape[0] = 0.5 * utils::numeric(FLERR,values[0],true,lmp);
-  shape[1] = 0.5 * utils::numeric(FLERR,values[1],true,lmp);
-  shape[2] = 0.5 * utils::numeric(FLERR,values[2],true,lmp);
+  int ivalue = 1;
+  shape[0] = 0.5 * utils::numeric(FLERR,values[ivalue++],true,lmp);
+  shape[1] = 0.5 * utils::numeric(FLERR,values[ivalue++],true,lmp);
+  shape[2] = 0.5 * utils::numeric(FLERR,values[ivalue++],true,lmp);
   if (shape[0] <= 0.0 || shape[1] <= 0.0 || shape[2] <= 0.0)
     error->one(FLERR,"Invalid shape in Ellipsoids section of data file");
 
   double *quat = bonus[nlocal_bonus].quat;
-  quat[0] = utils::numeric(FLERR,values[3],true,lmp);
-  quat[1] = utils::numeric(FLERR,values[4],true,lmp);
-  quat[2] = utils::numeric(FLERR,values[5],true,lmp);
-  quat[3] = utils::numeric(FLERR,values[6],true,lmp);
+  quat[0] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  quat[1] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  quat[2] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  quat[3] = utils::numeric(FLERR,values[ivalue++],true,lmp);
   MathExtra::qnormalize(quat);
 
   // reset ellipsoid mass
diff --git a/src/atom_vec_ellipsoid.h b/src/atom_vec_ellipsoid.h
index 6a77d9886c..bb50944f30 100644
--- a/src/atom_vec_ellipsoid.h
+++ b/src/atom_vec_ellipsoid.h
@@ -48,7 +48,7 @@ class AtomVecEllipsoid : public AtomVec {
   int size_restart_bonus();
   int pack_restart_bonus(int, double *);
   int unpack_restart_bonus(int, double *);
-  void data_atom_bonus(int, char **);
+  void data_atom_bonus(int, const std::vector<std::string> &);
   double memory_usage_bonus();
 
   void create_atom_post(int);
diff --git a/src/atom_vec_line.cpp b/src/atom_vec_line.cpp
index bfc6c7e6d7..e15bc61f2f 100644
--- a/src/atom_vec_line.cpp
+++ b/src/atom_vec_line.cpp
@@ -337,16 +337,17 @@ int AtomVecLine::unpack_restart_bonus(int ilocal, double *buf)
    unpack one line from Lines section of data file
 ------------------------------------------------------------------------- */
 
-void AtomVecLine::data_atom_bonus(int m, char **values)
+void AtomVecLine::data_atom_bonus(int m, const std::vector<std::string> &values)
 {
   if (line[m]) error->one(FLERR,"Assigning line parameters to non-line atom");
 
   if (nlocal_bonus == nmax_bonus) grow_bonus();
 
-  double x1 = utils::numeric(FLERR,values[0],true,lmp);
-  double y1 = utils::numeric(FLERR,values[1],true,lmp);
-  double x2 = utils::numeric(FLERR,values[2],true,lmp);
-  double y2 = utils::numeric(FLERR,values[3],true,lmp);
+  int ivalue = 1;
+  double x1 = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  double y1 = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  double x2 = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  double y2 = utils::numeric(FLERR,values[ivalue++],true,lmp);
   double dx = x2 - x1;
   double dy = y2 - y1;
   double length = sqrt(dx*dx + dy*dy);
diff --git a/src/atom_vec_line.h b/src/atom_vec_line.h
index 176cfc0dd9..5903349700 100644
--- a/src/atom_vec_line.h
+++ b/src/atom_vec_line.h
@@ -48,7 +48,7 @@ class AtomVecLine : public AtomVec {
   int size_restart_bonus();
   int pack_restart_bonus(int, double *);
   int unpack_restart_bonus(int, double *);
-  void data_atom_bonus(int, char **);
+  void data_atom_bonus(int, const std::vector<std::string> &);
   double memory_usage_bonus();
 
   void create_atom_post(int);
diff --git a/src/atom_vec_tri.cpp b/src/atom_vec_tri.cpp
index 6a1cad5f5c..4b1ee0d921 100644
--- a/src/atom_vec_tri.cpp
+++ b/src/atom_vec_tri.cpp
@@ -470,22 +470,23 @@ int AtomVecTri::unpack_restart_bonus(int ilocal, double *buf)
    unpack one line from Tris section of data file
 ------------------------------------------------------------------------- */
 
-void AtomVecTri::data_atom_bonus(int m, char **values)
+void AtomVecTri::data_atom_bonus(int m, const std::vector<std::string> &values)
 {
   if (tri[m]) error->one(FLERR,"Assigning tri parameters to non-tri atom");
 
   if (nlocal_bonus == nmax_bonus) grow_bonus();
 
   double c1[3],c2[3],c3[3];
-  c1[0] = utils::numeric(FLERR,values[0],true,lmp);
-  c1[1] = utils::numeric(FLERR,values[1],true,lmp);
-  c1[2] = utils::numeric(FLERR,values[2],true,lmp);
-  c2[0] = utils::numeric(FLERR,values[3],true,lmp);
-  c2[1] = utils::numeric(FLERR,values[4],true,lmp);
-  c2[2] = utils::numeric(FLERR,values[5],true,lmp);
-  c3[0] = utils::numeric(FLERR,values[6],true,lmp);
-  c3[1] = utils::numeric(FLERR,values[7],true,lmp);
-  c3[2] = utils::numeric(FLERR,values[8],true,lmp);
+  int ivalue = 1;
+  c1[0] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  c1[1] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  c1[2] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  c2[0] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  c2[1] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  c2[2] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  c3[0] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  c3[1] = utils::numeric(FLERR,values[ivalue++],true,lmp);
+  c3[2] = utils::numeric(FLERR,values[ivalue++],true,lmp);
 
   // check for duplicate points
 
diff --git a/src/atom_vec_tri.h b/src/atom_vec_tri.h
index 36ec8d6948..f9f0bae386 100644
--- a/src/atom_vec_tri.h
+++ b/src/atom_vec_tri.h
@@ -50,7 +50,7 @@ class AtomVecTri : public AtomVec {
   int size_restart_bonus();
   int pack_restart_bonus(int, double *);
   int unpack_restart_bonus(int, double *);
-  void data_atom_bonus(int, char **);
+  void data_atom_bonus(int, const std::vector<std::string> &);
   double memory_usage_bonus();
 
   void create_atom_post(int);
diff --git a/src/compute_bond_local.cpp b/src/compute_bond_local.cpp
index ab0fa3fb0a..7c60df057f 100644
--- a/src/compute_bond_local.cpp
+++ b/src/compute_bond_local.cpp
@@ -34,7 +34,7 @@ using namespace LAMMPS_NS;
 #define DELTA 10000
 #define EPSILON 1.0e-12
 
-enum{DIST,VELVIB,OMEGA,ENGTRANS,ENGVIB,ENGROT,ENGPOT,FORCE,FX,FY,FZ,VARIABLE};
+enum{DIST,DX,DY,DZ,VELVIB,OMEGA,ENGTRANS,ENGVIB,ENGROT,ENGPOT,FORCE,FX,FY,FZ,VARIABLE};
 
 /* ---------------------------------------------------------------------- */
 
@@ -63,6 +63,9 @@ ComputeBondLocal::ComputeBondLocal(LAMMPS *lmp, int narg, char **arg) :
   int iarg;
   for (iarg = 3; iarg < narg; iarg++) {
     if (strcmp(arg[iarg],"dist") == 0) bstyle[nvalues++] = DIST;
+    else if (strcmp(arg[iarg],"dx") == 0) bstyle[nvalues++] = DX;
+    else if (strcmp(arg[iarg],"dy") == 0) bstyle[nvalues++] = DY;
+    else if (strcmp(arg[iarg],"dz") == 0) bstyle[nvalues++] = DZ;
     else if (strcmp(arg[iarg],"engpot") == 0) bstyle[nvalues++] = ENGPOT;
     else if (strcmp(arg[iarg],"force") == 0) bstyle[nvalues++] = FORCE;
     else if (strcmp(arg[iarg],"fx") == 0) bstyle[nvalues++] = FX;
@@ -384,11 +387,23 @@ int ComputeBondLocal::compute_bonds(int flag)
           if (dstr) input->variable->internal_set(dvar,sqrt(rsq));
         }
 
+        // to make sure dx, dy and dz are always from the lower to the higher id
+        double directionCorrection = tag[atom1] > tag[atom2] ? -1.0 : 1.0;
+
         for (int n = 0; n < nvalues; n++) {
           switch (bstyle[n]) {
           case DIST:
             ptr[n] = sqrt(rsq);
             break;
+          case DX:
+            ptr[n] = dx*directionCorrection;
+            break;
+          case DY:
+            ptr[n] = dy*directionCorrection;
+            break;
+          case DZ:
+            ptr[n] = dz*directionCorrection;
+            break;
           case ENGPOT:
             ptr[n] = engpot;
             break;
diff --git a/src/compute_centroid_stress_atom.cpp b/src/compute_centroid_stress_atom.cpp
index a050c8bb6a..7e52e0a920 100644
--- a/src/compute_centroid_stress_atom.cpp
+++ b/src/compute_centroid_stress_atom.cpp
@@ -16,6 +16,7 @@
 #include "angle.h"
 #include "atom.h"
 #include "bond.h"
+#include "citeme.h"
 #include "comm.h"
 #include "dihedral.h"
 #include "error.h"
@@ -34,6 +35,36 @@ using namespace LAMMPS_NS;
 
 enum { NOBIAS, BIAS };
 
+static const char cite_centroid_angle_improper_dihedral[] =
+    "compute centroid/stress/atom for angles, impropers and dihedrals:\n\n"
+    "@article{PhysRevE.99.051301,\n"
+    " title = {Application of atomic stress to compute heat flux via molecular dynamics for "
+    "systems with many-body interactions},\n"
+    " author = {Surblys, Donatas and Matsubara, Hiroki and Kikugawa, Gota and Ohara, Taku},\n"
+    " journal = {Physical Review E},\n"
+    " volume = {99},\n"
+    " issue = {5},\n"
+    " pages = {051301},\n"
+    " year = {2019},\n"
+    " doi = {10.1103/PhysRevE.99.051301},\n"
+    " url = {https://link.aps.org/doi/10.1103/PhysRevE.99.051301}\n"
+    "}\n\n";
+
+static const char cite_centroid_shake_rigid[] =
+    "compute centroid/stress/atom for constrained dynamics:\n\n"
+    "@article{doi:10.1063/5.0070930,\n"
+    " author = {Surblys, Donatas and Matsubara, Hiroki and Kikugawa, Gota and Ohara, Taku},\n"
+    " journal = {Journal of Applied Physics},\n"
+    " title = {Methodology and meaning of computing heat flux via atomic stress in systems with "
+    "constraint dynamics},\n"
+    " volume = {130},\n"
+    " number = {21},\n"
+    " pages = {215104},\n"
+    " year = {2021},\n"
+    " doi = {10.1063/5.0070930},\n"
+    " url = {https://doi.org/10.1063/5.0070930},\n"
+    "}\n\n";
+
 /* ---------------------------------------------------------------------- */
 
 ComputeCentroidStressAtom::ComputeCentroidStressAtom(LAMMPS *lmp, int narg, char **arg) :
@@ -105,6 +136,12 @@ ComputeCentroidStressAtom::ComputeCentroidStressAtom(LAMMPS *lmp, int narg, char
   }
 
   nmax = 0;
+
+  if (lmp->citeme) {
+    if (angleflag || dihedralflag || improperflag)
+      lmp->citeme->add(cite_centroid_angle_improper_dihedral);
+    if (fixflag) lmp->citeme->add(cite_centroid_shake_rigid);
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -268,19 +305,26 @@ void ComputeCentroidStressAtom::compute_peratom()
   // possible during setup phase if fix has not initialized its vatom yet
   // e.g. fix ave/spatial defined before fix shake,
   //   and fix ave/spatial uses a per-atom stress from this compute as input
-  // fix styles are CENTROID_SAME or CENTROID_NOTAVAIL
+  // fix styles are CENTROID_SAME, CENTROID_AVAIL or CENTROID_NOTAVAIL
 
   if (fixflag) {
     Fix **fix = modify->fix;
     int nfix = modify->nfix;
     for (int ifix = 0; ifix < nfix; ifix++)
       if (fix[ifix]->virial_peratom_flag && fix[ifix]->thermo_virial) {
-        double **vatom = fix[ifix]->vatom;
-        if (vatom)
-          for (i = 0; i < nlocal; i++) {
-            for (j = 0; j < 6; j++) stress[i][j] += vatom[i][j];
-            for (j = 6; j < 9; j++) stress[i][j] += vatom[i][j - 3];
-          }
+        if (modify->fix[ifix]->centroidstressflag == CENTROID_AVAIL) {
+          double **cvatom = modify->fix[ifix]->cvatom;
+          if (cvatom)
+            for (i = 0; i < nlocal; i++)
+              for (j = 0; j < 9; j++) stress[i][j] += cvatom[i][j];
+        } else {
+          double **vatom = modify->fix[ifix]->vatom;
+          if (vatom)
+            for (i = 0; i < nlocal; i++) {
+              for (j = 0; j < 6; j++) stress[i][j] += vatom[i][j];
+              for (j = 6; j < 9; j++) stress[i][j] += vatom[i][j - 3];
+            }
+        }
       }
   }
 
diff --git a/src/compute_pair_local.cpp b/src/compute_pair_local.cpp
index af0f20014c..ff9acdc4ef 100644
--- a/src/compute_pair_local.cpp
+++ b/src/compute_pair_local.cpp
@@ -31,7 +31,7 @@ using namespace LAMMPS_NS;
 
 #define DELTA 10000
 
-enum{DIST,ENG,FORCE,FX,FY,FZ,PN};
+enum{DIST,ENG,FORCE,FX,FY,FZ,PN,DX,DY,DZ};
 enum{TYPE,RADIUS};
 
 /* ---------------------------------------------------------------------- */
@@ -56,6 +56,9 @@ ComputePairLocal::ComputePairLocal(LAMMPS *lmp, int narg, char **arg) :
     else if (strcmp(arg[iarg],"fx") == 0) pstyle[nvalues++] = FX;
     else if (strcmp(arg[iarg],"fy") == 0) pstyle[nvalues++] = FY;
     else if (strcmp(arg[iarg],"fz") == 0) pstyle[nvalues++] = FZ;
+    else if (strcmp(arg[iarg],"dx") == 0) pstyle[nvalues++] = DX;
+    else if (strcmp(arg[iarg],"dy") == 0) pstyle[nvalues++] = DY;
+    else if (strcmp(arg[iarg],"dz") == 0) pstyle[nvalues++] = DZ;
     else if (arg[iarg][0] == 'p') {
       int n = atoi(&arg[iarg][1]);
       if (n <= 0) error->all(FLERR,
@@ -92,7 +95,7 @@ ComputePairLocal::ComputePairLocal(LAMMPS *lmp, int narg, char **arg) :
 
   singleflag = 0;
   for (int i = 0; i < nvalues; i++)
-    if (pstyle[i] != DIST) singleflag = 1;
+    if (pstyle[i] != DIST && pstyle[i] != DX && pstyle[i] != DY && pstyle[i] != DZ) singleflag = 1;
 
   if (nvalues == 1) size_local_cols = 0;
   else size_local_cols = nvalues;
@@ -264,11 +267,20 @@ int ComputePairLocal::compute_pairs(int flag)
         if (nvalues == 1) ptr = &vlocal[m];
         else ptr = alocal[m];
 
+        // to make sure dx, dy and dz are always from the lower to the higher id
+        double directionCorrection = itag > jtag ? -1.0 : 1.0;
+
         for (n = 0; n < nvalues; n++) {
           switch (pstyle[n]) {
           case DIST:
             ptr[n] = sqrt(rsq);
             break;
+          case DX:
+            ptr[n] = delx*directionCorrection;
+          case DY:
+            ptr[n] = dely*directionCorrection;
+          case DZ:
+            ptr[n] = delz*directionCorrection;
           case ENG:
             ptr[n] = eng;
             break;
diff --git a/src/delete_atoms.cpp b/src/delete_atoms.cpp
index 55b05e3d98..5e06fc22f3 100644
--- a/src/delete_atoms.cpp
+++ b/src/delete_atoms.cpp
@@ -237,9 +237,9 @@ void DeleteAtoms::delete_region(int narg, char **arg)
 {
   if (narg < 2) error->all(FLERR,"Illegal delete_atoms command");
 
-  int iregion = domain->find_region(arg[1]);
-  if (iregion == -1) error->all(FLERR,"Could not find delete_atoms region ID");
-  domain->regions[iregion]->prematch();
+  auto iregion = domain->get_region_by_id(arg[1]);
+  if (!iregion) error->all(FLERR,"Could not find delete_atoms region ID");
+  iregion->prematch();
 
   options(narg-2,&arg[2]);
 
@@ -252,7 +252,7 @@ void DeleteAtoms::delete_region(int narg, char **arg)
   double **x = atom->x;
 
   for (int i = 0; i < nlocal; i++)
-    if (domain->regions[iregion]->match(x[i][0],x[i][1],x[i][2])) dlist[i] = 1;
+    if (iregion->match(x[i][0],x[i][1],x[i][2])) dlist[i] = 1;
 }
 
 /* ----------------------------------------------------------------------
@@ -422,15 +422,18 @@ void DeleteAtoms::delete_overlap(int narg, char **arg)
 
 void DeleteAtoms::delete_porosity(int narg, char **arg)
 {
-  if (narg < 4) error->all(FLERR,"Illegal delete_atoms command");
+  if (narg < 5) error->all(FLERR,"Illegal delete_atoms command");
 
-  int iregion = domain->find_region(arg[1]);
-  if (iregion == -1) error->all(FLERR,"Could not find delete_atoms region ID");
-  domain->regions[iregion]->prematch();
+  int igroup = group->find(arg[1]);
+  if (igroup == -1) error->all(FLERR,"Could not find delete_atoms group ID");
 
-  double porosity_fraction = utils::numeric(FLERR,arg[2],false,lmp);
-  int seed = utils::inumeric(FLERR,arg[3],false,lmp);
-  options(narg-4,&arg[4]);
+  auto iregion = domain->get_region_by_id(arg[2]);
+  if (!iregion && (strcmp(arg[2],"NULL") != 0))
+    error->all(FLERR,"Could not find delete_atoms region ID");
+
+  double porosity_fraction = utils::numeric(FLERR,arg[3],false,lmp);
+  int seed = utils::inumeric(FLERR,arg[4],false,lmp);
+  options(narg-5,&arg[5]);
 
   RanMars *random = new RanMars(lmp,seed + comm->me);
 
@@ -440,11 +443,19 @@ void DeleteAtoms::delete_porosity(int narg, char **arg)
   memory->create(dlist,nlocal,"delete_atoms:dlist");
   for (int i = 0; i < nlocal; i++) dlist[i] = 0;
 
-  double **x = atom->x;
+  // delete fraction of atoms which are in both group and region
 
-  for (int i = 0; i < nlocal; i++)
-    if (domain->regions[iregion]->match(x[i][0],x[i][1],x[i][2]))
-      if (random->uniform() <= porosity_fraction) dlist[i] = 1;
+  double **x = atom->x;
+  int *mask = atom->mask;
+
+  int groupbit = group->bitmask[igroup];
+  if (iregion) iregion->prematch();
+
+  for (int i = 0; i < nlocal; i++) {
+    if (!(mask[i] & groupbit)) continue;
+    if (iregion && !iregion->match(x[i][0],x[i][1],x[i][2])) continue;
+    if (random->uniform() <= porosity_fraction) dlist[i] = 1;
+  }
 
   delete random;
 }
diff --git a/src/dump.cpp b/src/dump.cpp
index 7eca4a938c..a1f905d974 100644
--- a/src/dump.cpp
+++ b/src/dump.cpp
@@ -128,8 +128,7 @@ Dump::Dump(LAMMPS *lmp, int /*narg*/, char **arg) : Pointers(lmp)
   char *ptr;
   if ((ptr = strchr(filename,'%'))) {
     if (strstr(style,"mpiio"))
-      error->all(FLERR,
-                 "Dump file MPI-IO output not allowed with % in filename");
+      error->all(FLERR,"Dump file MPI-IO output not allowed with % in filename");
     multiproc = 1;
     nclusterprocs = 1;
     filewriter = 1;
@@ -228,7 +227,7 @@ void Dump::init()
   if (sort_flag) {
     if (multiproc > 1)
       error->all(FLERR,
-                 "Cannot dump sort when multiple dump files are written");
+                 "Cannot dump sort when 'nfile' or 'fileper' keywords are set to non-default values");
     if (sortcol == 0 && atom->tag_enable == 0)
       error->all(FLERR,"Cannot dump sort on atom IDs with no atom IDs defined");
     if (sortcol && sortcol > size_one)
@@ -237,8 +236,6 @@ void Dump::init()
       irregular = new Irregular(lmp);
 
     bigint size = group->count(igroup);
-    if (size > MAXSMALLINT) error->all(FLERR,"Too many atoms to dump sort");
-    int isize = static_cast<int> (size);
 
     // set reorderflag = 1 if can simply reorder local atoms rather than sort
     // criteria: sorting by ID, atom IDs are consecutive from 1 to Natoms
@@ -268,7 +265,7 @@ void Dump::init()
       MPI_Allreduce(&min,&minall,1,MPI_LMP_TAGINT,MPI_MIN,world);
       MPI_Allreduce(&max,&maxall,1,MPI_LMP_TAGINT,MPI_MAX,world);
 
-      if (maxall-minall+1 == isize) {
+      if (maxall-minall+1 == size) {
         reorderflag = 1;
         double range = maxall-minall + EPSILON;
         idlo = static_cast<tagint> (range*me/nprocs + minall);
@@ -284,7 +281,7 @@ void Dump::init()
         else if (me+1 != hi) idhi++;
 
         nme_reorder = idhi-idlo;
-        ntotal_reorder = isize;
+        ntotal_reorder = size;
       }
     }
   }
@@ -369,16 +366,6 @@ void Dump::write()
   if (multiproc != nprocs) MPI_Allreduce(&nme,&nmax,1,MPI_INT,MPI_MAX,world);
   else nmax = nme;
 
-  // write timestep header
-  // for multiproc,
-  //   nheader = # of lines in this file via Allreduce on clustercomm
-
-  bigint nheader = ntotal;
-  if (multiproc)
-    MPI_Allreduce(&bnme,&nheader,1,MPI_LMP_BIGINT,MPI_SUM,clustercomm);
-
-  if (filewriter && write_header_flag) write_header(nheader);
-
   // insure buf is sized for packing and communicating
   // use nmax to insure filewriter proc can receive info from others
   // limit nmax*size_one to int since used as arg in MPI calls
@@ -431,6 +418,19 @@ void Dump::write()
   else pack(nullptr);
   if (sort_flag) sort();
 
+  // write timestep header
+  // for multiproc,
+  //   nheader = # of lines in this file via Allreduce on clustercomm
+  //   must come after sort, which can change nme
+
+  bigint nheader = ntotal;
+  if (multiproc) {
+    bnme = nme;
+    MPI_Allreduce(&bnme,&nheader,1,MPI_LMP_BIGINT,MPI_SUM,clustercomm);
+  }
+
+  if (filewriter && write_header_flag) write_header(nheader);
+
   // if buffering, convert doubles into strings
   // insure sbuf is sized for communicating
   // cannot buffer if output is to binary file
@@ -918,11 +918,6 @@ void Dump::modify_params(int narg, char **arg)
       else delay_flag = 0;
       iarg += 2;
 
-    } else if (strcmp(arg[iarg],"header") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command");
-      write_header_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
-      iarg += 2;
-
     } else if (strcmp(arg[iarg],"every") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command");
       int idump;
@@ -937,9 +932,28 @@ void Dump::modify_params(int narg, char **arg)
         n = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
         if (n <= 0) error->all(FLERR,"Illegal dump_modify command");
       }
+      output->mode_dump[idump] = 0;
       output->every_dump[idump] = n;
       iarg += 2;
 
+    } else if (strcmp(arg[iarg],"every/time") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command");
+      int idump;
+      for (idump = 0; idump < output->ndump; idump++)
+        if (strcmp(id,output->dump[idump]->id) == 0) break;
+      double delta;
+      if (strstr(arg[iarg+1],"v_") == arg[iarg+1]) {
+        delete [] output->var_dump[idump];
+        output->var_dump[idump] = utils::strdup(&arg[iarg+1][2]);
+        delta = 0.0;
+      } else {
+        delta = utils::numeric(FLERR,arg[iarg+1],false,lmp);
+        if (delta <= 0.0) error->all(FLERR,"Illegal dump_modify command");
+      }
+      output->mode_dump[idump] = 1;
+      output->every_time_dump[idump] = delta;
+      iarg += 2;
+
     } else if (strcmp(arg[iarg],"fileper") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command");
       if (!multiproc)
@@ -1007,6 +1021,11 @@ void Dump::modify_params(int narg, char **arg)
         iarg += n;
       }
 
+    } else if (strcmp(arg[iarg],"header") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command");
+      write_header_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
+      iarg += 2;
+
     } else if (strcmp(arg[iarg],"maxfiles") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command");
       if (!multifile)
@@ -1113,6 +1132,7 @@ double Dump::compute_time()
 {
   return update->atime + (update->ntimestep - update->atimestep)*update->dt;
 }
+
 /* ----------------------------------------------------------------------
    return # of bytes of allocated memory
 ------------------------------------------------------------------------- */
diff --git a/src/dump.h b/src/dump.h
index 730d4c9ca9..35da154d7c 100644
--- a/src/dump.h
+++ b/src/dump.h
@@ -26,7 +26,7 @@ class Dump : protected Pointers {
   int igroup, groupbit;    // group that Dump is performed on
 
   int first_flag;    // 0 if no initial dump, 1 if yes initial dump
-  int clearstep;     // 1 if dump invokes computes, 0 if not
+  int clearstep;     // 1 if dump can invoke computes, 0 if not
 
   int comm_forward;    // size of forward communication (0 if none)
   int comm_reverse;    // size of reverse communication (0 if none)
@@ -75,7 +75,7 @@ class Dump : protected Pointers {
   int sortcol;              // 0 to sort on ID, 1-N on columns
   int sortcolm1;            // sortcol - 1
   int sortorder;            // ASCEND or DESCEND
-  int time_flag;            // 1 if output accumulated time
+  int time_flag;            // 1 if output simulation time
   int unit_flag;            // 1 if dump should contain unit information
   int unit_count;           // # of times the unit information was written
   int delay_flag;           // 1 if delay output until delaystep
@@ -116,7 +116,7 @@ class Dump : protected Pointers {
 
   bigint ntotal;         // total # of per-atom lines in snapshot
   int reorderflag;       // 1 if OK to reorder instead of sort
-  int ntotal_reorder;    // # of atoms that must be in snapshot
+  bigint ntotal_reorder;    // # of atoms that must be in snapshot
   int nme_reorder;       // # of atoms I must own in snapshot
   tagint idlo;           // lowest ID I own when reordering
 
@@ -173,10 +173,9 @@ E: Dump file MPI-IO output not allowed with % in filename
 This is because a % signifies one file per processor and MPI-IO
 creates one large file for all processors.
 
-E: Cannot dump sort when multiple dump files are written
+E: Cannot dump sort when 'nfile' or 'fileper' keywords are set to non-default values
 
-In this mode, each processor dumps its atoms to a file, so
-no sorting is allowed.
+Can only dump sort when the number of dump file pieces using % in filename equals the number of processors
 
 E: Cannot dump sort on atom IDs with no atom IDs defined
 
@@ -186,10 +185,6 @@ E: Dump sort column is invalid
 
 Self-explanatory.
 
-E: Too many atoms to dump sort
-
-Cannot sort when running with more than 2^31 atoms.
-
 E: Dump could not find refresh compute ID
 
 UNDOCUMENTED
diff --git a/src/dump_xyz.cpp b/src/dump_xyz.cpp
index e009937959..e90243fd1f 100644
--- a/src/dump_xyz.cpp
+++ b/src/dump_xyz.cpp
@@ -130,8 +130,10 @@ int DumpXYZ::modify_param(int narg, char **arg)
 void DumpXYZ::write_header(bigint n)
 {
   if (me == 0) {
-    fprintf(fp,BIGINT_FORMAT "\n",n);
-    fprintf(fp,"Atoms. Timestep: " BIGINT_FORMAT "\n",update->ntimestep);
+    if (time_flag) {
+      double tcurrent = update->atime + (update->ntimestep-update->atimestep) + update->dt;
+      fmt::print(fp,"{}\n Atoms. Timestep: {} Time: {:.6f}\n", n, update->ntimestep, tcurrent);
+    } else fmt::print(fp,"{}\n Atoms. Timestep: {}\n", n, update->ntimestep);
   }
 }
 
@@ -159,7 +161,6 @@ void DumpXYZ::pack(tagint *ids)
     }
 }
 
-
 /* ----------------------------------------------------------------------
    convert mybuf of doubles to one big formatted string in sbuf
    return -1 if strlen exceeds an int, since used as arg in MPI calls in Dump
diff --git a/src/error.cpp b/src/error.cpp
index 5338f41cca..e2162cf661 100644
--- a/src/error.cpp
+++ b/src/error.cpp
@@ -196,7 +196,7 @@ void Error::one(const std::string &file, int line, const std::string &str)
   MPI_Comm_rank(world,&me);
 
   if (input && input->line) lastcmd = input->line;
-  std::string mesg = fmt::format("ERROR on proc {}: {} ({}:{})\n",
+  std::string mesg = fmt::format("ERROR on proc {}: {} ({}:{})\nLast command: {}\n",
                                  me,str,truncpath(file),line,lastcmd);
   utils::logmesg(lmp,mesg);
 
diff --git a/src/fix.cpp b/src/fix.cpp
index 996cd9b7d5..83e0650483 100644
--- a/src/fix.cpp
+++ b/src/fix.cpp
@@ -35,7 +35,8 @@ int Fix::instance_total = 0;
 Fix::Fix(LAMMPS *lmp, int /*narg*/, char **arg) :
   Pointers(lmp),
   id(nullptr), style(nullptr), extlist(nullptr), vector_atom(nullptr), array_atom(nullptr),
-  vector_local(nullptr), array_local(nullptr), eatom(nullptr), vatom(nullptr)
+  vector_local(nullptr), array_local(nullptr), eatom(nullptr), vatom(nullptr),
+  cvatom(nullptr)
 {
   instance_me = instance_total++;
 
@@ -97,8 +98,8 @@ Fix::Fix(LAMMPS *lmp, int /*narg*/, char **arg) :
   // set vflag_atom = 0 b/c some fixes grow vatom in grow_arrays()
   //   which may occur outside of timestepping
 
-  maxeatom = maxvatom = 0;
-  vflag_atom = 0;
+  maxeatom = maxvatom = maxcvatom = 0;
+  vflag_atom = cvflag_atom = 0;
   centroidstressflag = CENTROID_SAME;
 
   // KOKKOS per-fix data masks
@@ -122,6 +123,7 @@ Fix::~Fix()
   delete [] style;
   memory->destroy(eatom);
   memory->destroy(vatom);
+  memory->destroy(cvatom);
 }
 
 /* ----------------------------------------------------------------------
@@ -197,7 +199,13 @@ void Fix::ev_setup(int eflag, int vflag)
   else {
     vflag_either = vflag;
     vflag_global = vflag & (VIRIAL_PAIR | VIRIAL_FDOTR);
-    vflag_atom = vflag & (VIRIAL_ATOM | VIRIAL_CENTROID);
+    if (centroidstressflag != CENTROID_AVAIL) {
+      vflag_atom = vflag & (VIRIAL_ATOM | VIRIAL_CENTROID);
+      cvflag_atom = 0;
+    } else {
+      vflag_atom = vflag & VIRIAL_ATOM;
+      cvflag_atom = vflag & VIRIAL_CENTROID;
+    }
   }
 
   // reallocate per-atom arrays if necessary
@@ -212,6 +220,11 @@ void Fix::ev_setup(int eflag, int vflag)
     memory->destroy(vatom);
     memory->create(vatom,maxvatom,6,"fix:vatom");
   }
+  if (cvflag_atom && atom->nlocal > maxcvatom) {
+    maxcvatom = atom->nmax;
+    memory->destroy(cvatom);
+    memory->create(cvatom,maxcvatom,9,"fix:cvatom");
+  }
 
   // zero accumulators
   // no global energy variable to zero (unlike pair,bond,angle,etc)
@@ -233,6 +246,20 @@ void Fix::ev_setup(int eflag, int vflag)
       vatom[i][5] = 0.0;
     }
   }
+  if (cvflag_atom) {
+    n = atom->nlocal;
+    for (i = 0; i < n; i++) {
+      cvatom[i][0] = 0.0;
+      cvatom[i][1] = 0.0;
+      cvatom[i][2] = 0.0;
+      cvatom[i][3] = 0.0;
+      cvatom[i][4] = 0.0;
+      cvatom[i][5] = 0.0;
+      cvatom[i][6] = 0.0;
+      cvatom[i][7] = 0.0;
+      cvatom[i][8] = 0.0;
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -248,7 +275,13 @@ void Fix::v_setup(int vflag)
 
   evflag = 1;
   vflag_global = vflag & (VIRIAL_PAIR | VIRIAL_FDOTR);
-  vflag_atom = vflag & (VIRIAL_ATOM | VIRIAL_CENTROID);
+  if (centroidstressflag != CENTROID_AVAIL) {
+    vflag_atom = vflag & (VIRIAL_ATOM | VIRIAL_CENTROID);
+    cvflag_atom = 0;
+  } else {
+    vflag_atom = vflag & VIRIAL_ATOM;
+    cvflag_atom = vflag & VIRIAL_CENTROID;
+  }
 
   // reallocate per-atom array if necessary
 
@@ -257,6 +290,11 @@ void Fix::v_setup(int vflag)
     memory->destroy(vatom);
     memory->create(vatom,maxvatom,6,"fix:vatom");
   }
+  if (cvflag_atom && atom->nlocal > maxcvatom) {
+    maxcvatom = atom->nmax;
+    memory->destroy(cvatom);
+    memory->create(cvatom,maxcvatom,9,"fix:cvatom");
+  }
 
   // zero accumulators
 
@@ -272,6 +310,20 @@ void Fix::v_setup(int vflag)
       vatom[i][5] = 0.0;
     }
   }
+  if (cvflag_atom) {
+    n = atom->nlocal;
+    for (i = 0; i < n; i++) {
+      cvatom[i][0] = 0.0;
+      cvatom[i][1] = 0.0;
+      cvatom[i][2] = 0.0;
+      cvatom[i][3] = 0.0;
+      cvatom[i][4] = 0.0;
+      cvatom[i][5] = 0.0;
+      cvatom[i][6] = 0.0;
+      cvatom[i][7] = 0.0;
+      cvatom[i][8] = 0.0;
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -338,6 +390,110 @@ void Fix::v_tally(int n, int *list, double total, double *v)
   }
 }
 
+/* ----------------------------------------------------------------------
+   tally virial into global and per-atom accumulators
+   n = # of local owned atoms involved, with local indices in list
+   vtot = total virial for the interaction involving total atoms
+   rlist = list of positional vectors
+   flist = list of force vectors
+   center = centroid coordinate
+   increment global virial by n/total fraction
+   increment per-atom virial of each atom in list by 1/total fraction
+   add centroid form atomic virial contribution for each atom if available
+   this method can be used when fix computes forces in post_force()
+   and only total forces on each atom in group are easily available
+     e.g. fix rigid/small: compute virial only on owned atoms
+       whether newton_bond is on or off
+     other procs will tally left-over fractions for atoms they own
+------------------------------------------------------------------------- */
+
+void Fix::v_tally(int n, int *list, double total, double *vtot,
+    double rlist[][3], double flist[][3], double center[])
+{
+
+  v_tally(n, list, total, vtot);
+
+  if (cvflag_atom) {
+    for (int i = 0; i< n; i++) {
+      const double ri0[3] = {
+        rlist[i][0]-center[0],
+        rlist[i][1]-center[1],
+        rlist[i][2]-center[2],
+      };
+      cvatom[list[i]][0] += ri0[0]*flist[i][0];
+      cvatom[list[i]][1] += ri0[1]*flist[i][1];
+      cvatom[list[i]][2] += ri0[2]*flist[i][2];
+      cvatom[list[i]][3] += ri0[0]*flist[i][1];
+      cvatom[list[i]][4] += ri0[0]*flist[i][2];
+      cvatom[list[i]][5] += ri0[1]*flist[i][2];
+      cvatom[list[i]][6] += ri0[1]*flist[i][0];
+      cvatom[list[i]][7] += ri0[2]*flist[i][0];
+      cvatom[list[i]][8] += ri0[2]*flist[i][1];
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   tally virial into global and per-atom accumulators
+   n = # of local owned atoms involved, with local indices in list
+   vtot = total virial for the interaction involving total atoms
+   npair = # of atom pairs with forces beween them
+   pairlist = indice list of pairs
+   fpairlist = forces between pairs
+   dellist = displacement vectors between pairs
+   increment global virial by n/total fraction
+   increment per-atom virial of each atom in list by 1/total fraction
+   add centroid form atomic virial contribution for each atom if available
+   this method can be used when fix computes forces in post_force()
+     e.g. fix shake, fix rigid: compute virial only on owned atoms
+       whether newton_bond is on or off
+     other procs will tally left-over fractions for atoms they own
+------------------------------------------------------------------------- */
+
+void Fix::v_tally(int n, int *list, double total, double *vtot, int nlocal,
+    int npair, int pairlist[][2], double *fpairlist, double dellist[][3])
+{
+
+  v_tally(n, list, total, vtot);
+
+  if (cvflag_atom) {
+    double v[6];
+    for (int i = 0; i < npair; i++) {
+      v[0] = 0.5*dellist[i][0]*dellist[i][0]*fpairlist[i];
+      v[1] = 0.5*dellist[i][1]*dellist[i][1]*fpairlist[i];
+      v[2] = 0.5*dellist[i][2]*dellist[i][2]*fpairlist[i];
+      v[3] = 0.5*dellist[i][0]*dellist[i][1]*fpairlist[i];
+      v[4] = 0.5*dellist[i][0]*dellist[i][2]*fpairlist[i];
+      v[5] = 0.5*dellist[i][1]*dellist[i][2]*fpairlist[i];
+      const int i0 = pairlist[i][0];
+      const int i1 = pairlist[i][1];
+      if (i0 < nlocal) {
+        cvatom[i0][0] += v[0];
+        cvatom[i0][1] += v[1];
+        cvatom[i0][2] += v[2];
+        cvatom[i0][3] += v[3];
+        cvatom[i0][4] += v[4];
+        cvatom[i0][5] += v[5];
+        cvatom[i0][6] += v[3];
+        cvatom[i0][7] += v[4];
+        cvatom[i0][8] += v[5];
+      }
+      if (i1 < nlocal) {
+        cvatom[i1][0] += v[0];
+        cvatom[i1][1] += v[1];
+        cvatom[i1][2] += v[2];
+        cvatom[i1][3] += v[3];
+        cvatom[i1][4] += v[4];
+        cvatom[i1][5] += v[5];
+        cvatom[i1][6] += v[3];
+        cvatom[i1][7] += v[4];
+        cvatom[i1][8] += v[5];
+      }
+    }
+  }
+}
+
 /* ----------------------------------------------------------------------
    tally virial into global and per-atom accumulators
    i = local index of atom
diff --git a/src/fix.h b/src/fix.h
index 69fff154dc..339da03734 100644
--- a/src/fix.h
+++ b/src/fix.h
@@ -113,6 +113,7 @@ class Fix : protected Pointers {
 
   double virial[6];          // virial for this timestep
   double *eatom, **vatom;    // per-atom energy/virial for this timestep
+  double **cvatom;           // per-atom centroid virial for this timestep
 
   int centroidstressflag;    // centroid stress compared to two-body stress
                              // CENTROID_SAME = same as two-body stress
@@ -249,8 +250,8 @@ class Fix : protected Pointers {
 
   int evflag;
   int eflag_either, eflag_global, eflag_atom;
-  int vflag_either, vflag_global, vflag_atom;
-  int maxeatom, maxvatom;
+  int vflag_either, vflag_global, vflag_atom, cvflag_atom;
+  int maxeatom, maxvatom, maxcvatom;
 
   int copymode;    // if set, do not deallocate during destruction
                    // required when classes are used as functors by Kokkos
@@ -263,7 +264,7 @@ class Fix : protected Pointers {
       ev_setup(eflag, vflag);
     else
       evflag = eflag_either = eflag_global = eflag_atom = vflag_either = vflag_global = vflag_atom =
-          0;
+          cvflag_atom = 0;
   }
   void ev_setup(int, int);
   void ev_tally(int, int *, double, double, double *);
@@ -273,10 +274,12 @@ class Fix : protected Pointers {
     if (vflag && thermo_virial)
       v_setup(vflag);
     else
-      evflag = vflag_either = vflag_global = vflag_atom = 0;
+      evflag = vflag_either = vflag_global = vflag_atom = cvflag_atom = 0;
   }
   void v_setup(int);
   void v_tally(int, int *, double, double *);
+  void v_tally(int,int*,double,double*,int,int,int[][2],double*,double[][3]);
+  void v_tally(int,int*,double,double*,double[][3],double[][3],double[]);
   void v_tally(int, double *);
   void v_tally(int, int, double);
 };
diff --git a/src/fix_ave_atom.cpp b/src/fix_ave_atom.cpp
index 1e30761274..a5661b1f52 100644
--- a/src/fix_ave_atom.cpp
+++ b/src/fix_ave_atom.cpp
@@ -41,6 +41,7 @@ FixAveAtom::FixAveAtom(LAMMPS *lmp, int narg, char **arg) :
   nevery = utils::inumeric(FLERR,arg[3],false,lmp);
   nrepeat = utils::inumeric(FLERR,arg[4],false,lmp);
   peratom_freq = utils::inumeric(FLERR,arg[5],false,lmp);
+  time_depend = 1;
 
   nvalues = narg - 6;
 
@@ -278,11 +279,8 @@ void FixAveAtom::end_of_step()
   int i,j,m,n;
 
   // skip if not step which requires doing something
-  // error check if timestep was reset in an invalid manner
 
   bigint ntimestep = update->ntimestep;
-  if (ntimestep < nvalid_last || ntimestep > nvalid)
-    error->all(FLERR,"Invalid timestep reset for fix ave/atom");
   if (ntimestep != nvalid) return;
   nvalid_last = nvalid;
 
diff --git a/src/fix_ave_chunk.cpp b/src/fix_ave_chunk.cpp
index 36c9eaf6fc..b6cb17b8c9 100644
--- a/src/fix_ave_chunk.cpp
+++ b/src/fix_ave_chunk.cpp
@@ -61,6 +61,7 @@ FixAveChunk::FixAveChunk(LAMMPS *lmp, int narg, char **arg) :
 
   global_freq = nfreq;
   no_change_box = 1;
+  time_depend = 1;
 
   char * group = arg[1];
 
@@ -541,11 +542,8 @@ void FixAveChunk::end_of_step()
   int i,j,m,n,index;
 
   // skip if not step which requires doing something
-  // error check if timestep was reset in an invalid manner
 
   bigint ntimestep = update->ntimestep;
-  if (ntimestep < nvalid_last || ntimestep > nvalid)
-    error->all(FLERR,"Invalid timestep reset for fix ave/chunk");
   if (ntimestep != nvalid) return;
   nvalid_last = nvalid;
 
diff --git a/src/fix_ave_correlate.cpp b/src/fix_ave_correlate.cpp
index df6e33288e..486b991b77 100644
--- a/src/fix_ave_correlate.cpp
+++ b/src/fix_ave_correlate.cpp
@@ -52,6 +52,7 @@ FixAveCorrelate::FixAveCorrelate(LAMMPS * lmp, int narg, char **arg):
   nrepeat = utils::inumeric(FLERR,arg[4],false,lmp);
   nfreq = utils::inumeric(FLERR,arg[5],false,lmp);
 
+  time_depend = 1;
   global_freq = nfreq;
 
   // expand args if any have wildcard character "*"
@@ -390,11 +391,8 @@ void FixAveCorrelate::end_of_step()
   double scalar;
 
   // skip if not step which requires doing something
-  // error check if timestep was reset in an invalid manner
 
   bigint ntimestep = update->ntimestep;
-  if (ntimestep < nvalid_last || ntimestep > nvalid)
-    error->all(FLERR,"Invalid timestep reset for fix ave/correlate");
   if (ntimestep != nvalid) return;
   nvalid_last = nvalid;
 
diff --git a/src/fix_ave_histo.cpp b/src/fix_ave_histo.cpp
index 8c55337074..5a7c6d4ccb 100644
--- a/src/fix_ave_histo.cpp
+++ b/src/fix_ave_histo.cpp
@@ -61,6 +61,7 @@ FixAveHisto::FixAveHisto(LAMMPS *lmp, int narg, char **arg) :
   size_array_cols = 3;
   extarray = 0;
   dynamic_group_allow = 1;
+  time_depend = 1;
 
   lo = utils::numeric(FLERR,arg[6],false,lmp);
   hi = utils::numeric(FLERR,arg[7],false,lmp);
@@ -579,11 +580,8 @@ void FixAveHisto::end_of_step()
   int i,j,m;
 
   // skip if not step which requires doing something
-  // error check if timestep was reset in an invalid manner
 
   bigint ntimestep = update->ntimestep;
-  if (ntimestep < nvalid_last || ntimestep > nvalid)
-    error->all(FLERR,"Invalid timestep reset for fix ave/histo");
   if (ntimestep != nvalid) return;
   nvalid_last = nvalid;
 
diff --git a/src/fix_ave_histo_weight.cpp b/src/fix_ave_histo_weight.cpp
index 7866490840..16a64e093e 100644
--- a/src/fix_ave_histo_weight.cpp
+++ b/src/fix_ave_histo_weight.cpp
@@ -94,11 +94,8 @@ void FixAveHistoWeight::end_of_step()
   int i,j,m;
 
   // skip if not step which requires doing something
-  // error check if timestep was reset in an invalid manner
 
   bigint ntimestep = update->ntimestep;
-  if (ntimestep < nvalid_last || ntimestep > nvalid)
-    error->all(FLERR,"Invalid timestep reset for fix ave/histo");
   if (ntimestep != nvalid) return;
   nvalid_last = nvalid;
 
diff --git a/src/fix_ave_time.cpp b/src/fix_ave_time.cpp
index 836121fdc2..98b066ef0b 100644
--- a/src/fix_ave_time.cpp
+++ b/src/fix_ave_time.cpp
@@ -56,6 +56,7 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
   global_freq = nfreq;
 
   dynamic_group_allow = 1;
+  time_depend = 1;
 
   // scan values to count them
   // then read options so know mode = SCALAR/VECTOR before re-reading values
@@ -523,11 +524,8 @@ void FixAveTime::setup(int /*vflag*/)
 void FixAveTime::end_of_step()
 {
   // skip if not step which requires doing something
-  // error check if timestep was reset in an invalid manner
 
   bigint ntimestep = update->ntimestep;
-  if (ntimestep < nvalid_last || ntimestep > nvalid)
-    error->all(FLERR,"Invalid timestep reset for fix ave/time");
   if (ntimestep != nvalid) return;
   nvalid_last = nvalid;
 
diff --git a/src/fix_deposit.cpp b/src/fix_deposit.cpp
index c18a0e23dd..034cecd55a 100644
--- a/src/fix_deposit.cpp
+++ b/src/fix_deposit.cpp
@@ -569,10 +569,12 @@ void FixDeposit::pre_exchange()
     //   coord is new position of geometric center of mol, not COM
     // FixShake::set_molecule stores shake info for molecule
 
-    if (rigidflag)
-      fixrigid->set_molecule(nlocalprev,maxtag_all,imol,coord,vnew,quat);
-    else if (shakeflag)
-      fixshake->set_molecule(nlocalprev,maxtag_all,imol,coord,vnew,quat);
+    if (mode == MOLECULE) {
+      if (rigidflag)
+        fixrigid->set_molecule(nlocalprev,maxtag_all,imol,coord,vnew,quat);
+      else if (shakeflag)
+        fixshake->set_molecule(nlocalprev,maxtag_all,imol,coord,vnew,quat);
+    }
 
     success = 1;
     break;
diff --git a/src/fix_dt_reset.cpp b/src/fix_dt_reset.cpp
index c80c976504..adb0082fc8 100644
--- a/src/fix_dt_reset.cpp
+++ b/src/fix_dt_reset.cpp
@@ -121,14 +121,6 @@ void FixDtReset::init()
   respaflag = 0;
   if (utils::strmatch(update->integrate_style, "^respa")) respaflag = 1;
 
-  // check for DCD or XTC dumps
-
-  for (int i = 0; i < output->ndump; i++)
-    if ((strcmp(output->dump[i]->style, "dcd") == 0 ||
-         strcmp(output->dump[i]->style, "xtc") == 0) &&
-        comm->me == 0)
-      error->warning(FLERR, "Dump dcd/xtc timestamp may be wrong with fix dt/reset");
-
   ftm2v = force->ftm2v;
   mvv2e = force->mvv2e;
   dt = update->dt;
@@ -197,12 +189,16 @@ void FixDtReset::end_of_step()
 
   laststep = update->ntimestep;
 
+  // calls to other classes that need to know timestep size changed
+  // similar logic is in Input::timestep()
+
   update->update_time();
   update->dt = dt;
   update->dt_default = 0;
   if (respaflag) update->integrate->reset_dt();
   if (force->pair) force->pair->reset_dt();
   for (int i = 0; i < modify->nfix; i++) modify->fix[i]->reset_dt();
+  output->reset_dt();
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/fix_nve_sphere.cpp b/src/fix_nve_sphere.cpp
index 543b94b46b..5cc9167ed5 100644
--- a/src/fix_nve_sphere.cpp
+++ b/src/fix_nve_sphere.cpp
@@ -294,7 +294,6 @@ void FixNVESphere::final_integrate()
   // update v,omega for all particles
   // d_omega/dt = torque / inertia
 
-  double rke = 0.0;
   for (int i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
       dtfm = dtf / rmass[i];
@@ -306,8 +305,5 @@ void FixNVESphere::final_integrate()
       omega[i][0] += dtirotate * torque[i][0];
       omega[i][1] += dtirotate * torque[i][1];
       omega[i][2] += dtirotate * torque[i][2];
-      rke += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] +
-              omega[i][2]*omega[i][2])*radius[i]*radius[i]*rmass[i];
     }
-
 }
diff --git a/src/fmt/args.h b/src/fmt/args.h
new file mode 100644
index 0000000000..9a8e4ed2ce
--- /dev/null
+++ b/src/fmt/args.h
@@ -0,0 +1,234 @@
+// Formatting library for C++ - dynamic format arguments
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_ARGS_H_
+#define FMT_ARGS_H_
+
+#include <functional>  // std::reference_wrapper
+#include <memory>      // std::unique_ptr
+#include <vector>
+
+#include "core.h"
+
+FMT_BEGIN_NAMESPACE
+
+namespace detail {
+
+template <typename T> struct is_reference_wrapper : std::false_type {};
+template <typename T>
+struct is_reference_wrapper<std::reference_wrapper<T>> : std::true_type {};
+
+template <typename T> const T& unwrap(const T& v) { return v; }
+template <typename T> const T& unwrap(const std::reference_wrapper<T>& v) {
+  return static_cast<const T&>(v);
+}
+
+class dynamic_arg_list {
+  // Workaround for clang's -Wweak-vtables. Unlike for regular classes, for
+  // templates it doesn't complain about inability to deduce single translation
+  // unit for placing vtable. So storage_node_base is made a fake template.
+  template <typename = void> struct node {
+    virtual ~node() = default;
+    std::unique_ptr<node<>> next;
+  };
+
+  template <typename T> struct typed_node : node<> {
+    T value;
+
+    template <typename Arg>
+    FMT_CONSTEXPR typed_node(const Arg& arg) : value(arg) {}
+
+    template <typename Char>
+    FMT_CONSTEXPR typed_node(const basic_string_view<Char>& arg)
+        : value(arg.data(), arg.size()) {}
+  };
+
+  std::unique_ptr<node<>> head_;
+
+ public:
+  template <typename T, typename Arg> const T& push(const Arg& arg) {
+    auto new_node = std::unique_ptr<typed_node<T>>(new typed_node<T>(arg));
+    auto& value = new_node->value;
+    new_node->next = std::move(head_);
+    head_ = std::move(new_node);
+    return value;
+  }
+};
+}  // namespace detail
+
+/**
+  \rst
+  A dynamic version of `fmt::format_arg_store`.
+  It's equipped with a storage to potentially temporary objects which lifetimes
+  could be shorter than the format arguments object.
+
+  It can be implicitly converted into `~fmt::basic_format_args` for passing
+  into type-erased formatting functions such as `~fmt::vformat`.
+  \endrst
+ */
+template <typename Context>
+class dynamic_format_arg_store
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+    // Workaround a GCC template argument substitution bug.
+    : public basic_format_args<Context>
+#endif
+{
+ private:
+  using char_type = typename Context::char_type;
+
+  template <typename T> struct need_copy {
+    static constexpr detail::type mapped_type =
+        detail::mapped_type_constant<T, Context>::value;
+
+    enum {
+      value = !(detail::is_reference_wrapper<T>::value ||
+                std::is_same<T, basic_string_view<char_type>>::value ||
+                std::is_same<T, detail::std_string_view<char_type>>::value ||
+                (mapped_type != detail::type::cstring_type &&
+                 mapped_type != detail::type::string_type &&
+                 mapped_type != detail::type::custom_type))
+    };
+  };
+
+  template <typename T>
+  using stored_type = conditional_t<detail::is_string<T>::value &&
+                                        !has_formatter<T, Context>::value &&
+                                        !detail::is_reference_wrapper<T>::value,
+                                    std::basic_string<char_type>, T>;
+
+  // Storage of basic_format_arg must be contiguous.
+  std::vector<basic_format_arg<Context>> data_;
+  std::vector<detail::named_arg_info<char_type>> named_info_;
+
+  // Storage of arguments not fitting into basic_format_arg must grow
+  // without relocation because items in data_ refer to it.
+  detail::dynamic_arg_list dynamic_args_;
+
+  friend class basic_format_args<Context>;
+
+  unsigned long long get_types() const {
+    return detail::is_unpacked_bit | data_.size() |
+           (named_info_.empty()
+                ? 0ULL
+                : static_cast<unsigned long long>(detail::has_named_args_bit));
+  }
+
+  const basic_format_arg<Context>* data() const {
+    return named_info_.empty() ? data_.data() : data_.data() + 1;
+  }
+
+  template <typename T> void emplace_arg(const T& arg) {
+    data_.emplace_back(detail::make_arg<Context>(arg));
+  }
+
+  template <typename T>
+  void emplace_arg(const detail::named_arg<char_type, T>& arg) {
+    if (named_info_.empty()) {
+      constexpr const detail::named_arg_info<char_type>* zero_ptr{nullptr};
+      data_.insert(data_.begin(), {zero_ptr, 0});
+    }
+    data_.emplace_back(detail::make_arg<Context>(detail::unwrap(arg.value)));
+    auto pop_one = [](std::vector<basic_format_arg<Context>>* data) {
+      data->pop_back();
+    };
+    std::unique_ptr<std::vector<basic_format_arg<Context>>, decltype(pop_one)>
+        guard{&data_, pop_one};
+    named_info_.push_back({arg.name, static_cast<int>(data_.size() - 2u)});
+    data_[0].value_.named_args = {named_info_.data(), named_info_.size()};
+    guard.release();
+  }
+
+ public:
+  constexpr dynamic_format_arg_store() = default;
+
+  /**
+    \rst
+    Adds an argument into the dynamic store for later passing to a formatting
+    function.
+
+    Note that custom types and string types (but not string views) are copied
+    into the store dynamically allocating memory if necessary.
+
+    **Example**::
+
+      fmt::dynamic_format_arg_store<fmt::format_context> store;
+      store.push_back(42);
+      store.push_back("abc");
+      store.push_back(1.5f);
+      std::string result = fmt::vformat("{} and {} and {}", store);
+    \endrst
+  */
+  template <typename T> void push_back(const T& arg) {
+    if (detail::const_check(need_copy<T>::value))
+      emplace_arg(dynamic_args_.push<stored_type<T>>(arg));
+    else
+      emplace_arg(detail::unwrap(arg));
+  }
+
+  /**
+    \rst
+    Adds a reference to the argument into the dynamic store for later passing to
+    a formatting function.
+
+    **Example**::
+
+      fmt::dynamic_format_arg_store<fmt::format_context> store;
+      char band[] = "Rolling Stones";
+      store.push_back(std::cref(band));
+      band[9] = 'c'; // Changing str affects the output.
+      std::string result = fmt::vformat("{}", store);
+      // result == "Rolling Scones"
+    \endrst
+  */
+  template <typename T> void push_back(std::reference_wrapper<T> arg) {
+    static_assert(
+        need_copy<T>::value,
+        "objects of built-in types and string views are always copied");
+    emplace_arg(arg.get());
+  }
+
+  /**
+    Adds named argument into the dynamic store for later passing to a formatting
+    function. ``std::reference_wrapper`` is supported to avoid copying of the
+    argument. The name is always copied into the store.
+  */
+  template <typename T>
+  void push_back(const detail::named_arg<char_type, T>& arg) {
+    const char_type* arg_name =
+        dynamic_args_.push<std::basic_string<char_type>>(arg.name).c_str();
+    if (detail::const_check(need_copy<T>::value)) {
+      emplace_arg(
+          fmt::arg(arg_name, dynamic_args_.push<stored_type<T>>(arg.value)));
+    } else {
+      emplace_arg(fmt::arg(arg_name, arg.value));
+    }
+  }
+
+  /** Erase all elements from the store */
+  void clear() {
+    data_.clear();
+    named_info_.clear();
+    dynamic_args_ = detail::dynamic_arg_list();
+  }
+
+  /**
+    \rst
+    Reserves space to store at least *new_cap* arguments including
+    *new_cap_named* named arguments.
+    \endrst
+  */
+  void reserve(size_t new_cap, size_t new_cap_named) {
+    FMT_ASSERT(new_cap >= new_cap_named,
+               "Set of arguments includes set of named arguments");
+    data_.reserve(new_cap);
+    named_info_.reserve(new_cap_named);
+  }
+};
+
+FMT_END_NAMESPACE
+
+#endif  // FMT_ARGS_H_
diff --git a/src/fmt/chrono.h b/src/fmt/chrono.h
index 1a3b8d5e5c..908999ab5f 100644
--- a/src/fmt/chrono.h
+++ b/src/fmt/chrono.h
@@ -8,16 +8,32 @@
 #ifndef FMT_CHRONO_H_
 #define FMT_CHRONO_H_
 
+#include <algorithm>
 #include <chrono>
 #include <ctime>
+#include <iterator>
 #include <locale>
-#include <sstream>
+#include <ostream>
+#include <type_traits>
 
 #include "format.h"
-#include "locale.h"
 
 FMT_BEGIN_NAMESPACE
 
+// Enable tzset.
+#ifndef FMT_USE_TZSET
+// UWP doesn't provide _tzset.
+#  if FMT_HAS_INCLUDE("winapifamily.h")
+#    include <winapifamily.h>
+#  endif
+#  if defined(_WIN32) && (!defined(WINAPI_FAMILY) || \
+                          (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP))
+#    define FMT_USE_TZSET 1
+#  else
+#    define FMT_USE_TZSET 0
+#  endif
+#endif
+
 // Enable safe chrono durations, unless explicitly disabled.
 #ifndef FMT_SAFE_DURATION_CAST
 #  define FMT_SAFE_DURATION_CAST 1
@@ -44,7 +60,7 @@ FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
   static_assert(T::is_integer, "To must be integral");
 
   // A and B are both signed, or both unsigned.
-  if (F::digits <= T::digits) {
+  if (detail::const_check(F::digits <= T::digits)) {
     // From fits in To without any problem.
   } else {
     // From does not always fit in To, resort to a dynamic check.
@@ -79,14 +95,15 @@ FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
       return {};
     }
     // From is positive. Can it always fit in To?
-    if (F::digits > T::digits &&
+    if (detail::const_check(F::digits > T::digits) &&
         from > static_cast<From>(detail::max_value<To>())) {
       ec = 1;
       return {};
     }
   }
 
-  if (!F::is_signed && T::is_signed && F::digits >= T::digits &&
+  if (detail::const_check(!F::is_signed && T::is_signed &&
+                          F::digits >= T::digits) &&
       from > static_cast<From>(detail::max_value<To>())) {
     ec = 1;
     return {};
@@ -243,7 +260,7 @@ To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
   }
 
   // multiply with Factor::num without overflow or underflow
-  if (Factor::num != 1) {
+  if (detail::const_check(Factor::num != 1)) {
     constexpr auto max1 = detail::max_value<IntermediateRep>() /
                           static_cast<IntermediateRep>(Factor::num);
     if (count > max1) {
@@ -260,7 +277,7 @@ To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
   }
 
   // this can't go wrong, right? den>0 is checked earlier.
-  if (Factor::den != 1) {
+  if (detail::const_check(Factor::den != 1)) {
     using common_t = typename std::common_type<IntermediateRep, intmax_t>::type;
     count /= static_cast<common_t>(Factor::den);
   }
@@ -282,13 +299,154 @@ To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
 #define FMT_NOMACRO
 
 namespace detail {
+template <typename T = void> struct null {};
 inline null<> localtime_r FMT_NOMACRO(...) { return null<>(); }
 inline null<> localtime_s(...) { return null<>(); }
 inline null<> gmtime_r(...) { return null<>(); }
 inline null<> gmtime_s(...) { return null<>(); }
+
+inline const std::locale& get_classic_locale() {
+  static const auto& locale = std::locale::classic();
+  return locale;
+}
+
+template <typename CodeUnit> struct codecvt_result {
+  static constexpr const size_t max_size = 32;
+  CodeUnit buf[max_size];
+  CodeUnit* end;
+};
+template <typename CodeUnit>
+constexpr const size_t codecvt_result<CodeUnit>::max_size;
+
+template <typename CodeUnit>
+void write_codecvt(codecvt_result<CodeUnit>& out, string_view in_buf,
+                   const std::locale& loc) {
+  using codecvt = std::codecvt<CodeUnit, char, std::mbstate_t>;
+#if FMT_CLANG_VERSION
+#  pragma clang diagnostic push
+#  pragma clang diagnostic ignored "-Wdeprecated"
+  auto& f = std::use_facet<codecvt>(loc);
+#  pragma clang diagnostic pop
+#else
+  auto& f = std::use_facet<codecvt>(loc);
+#endif
+  auto mb = std::mbstate_t();
+  const char* from_next = nullptr;
+  auto result = f.in(mb, in_buf.begin(), in_buf.end(), from_next,
+                     std::begin(out.buf), std::end(out.buf), out.end);
+  if (result != std::codecvt_base::ok)
+    FMT_THROW(format_error("failed to format time"));
+}
+
+template <typename OutputIt>
+auto write_encoded_tm_str(OutputIt out, string_view in, const std::locale& loc)
+    -> OutputIt {
+  if (detail::is_utf8() && loc != get_classic_locale()) {
+    // char16_t and char32_t codecvts are broken in MSVC (linkage errors) and
+    // gcc-4.
+#if FMT_MSC_VER != 0 || \
+    (defined(__GLIBCXX__) && !defined(_GLIBCXX_USE_DUAL_ABI))
+    // The _GLIBCXX_USE_DUAL_ABI macro is always defined in libstdc++ from gcc-5
+    // and newer.
+    using code_unit = wchar_t;
+#else
+    using code_unit = char32_t;
+#endif
+
+    using unit_t = codecvt_result<code_unit>;
+    unit_t unit;
+    write_codecvt(unit, in, loc);
+    // In UTF-8 is used one to four one-byte code units.
+    auto&& buf = basic_memory_buffer<char, unit_t::max_size * 4>();
+    for (code_unit* p = unit.buf; p != unit.end; ++p) {
+      uint32_t c = static_cast<uint32_t>(*p);
+      if (sizeof(code_unit) == 2 && c >= 0xd800 && c <= 0xdfff) {
+        // surrogate pair
+        ++p;
+        if (p == unit.end || (c & 0xfc00) != 0xd800 ||
+            (*p & 0xfc00) != 0xdc00) {
+          FMT_THROW(format_error("failed to format time"));
+        }
+        c = (c << 10) + static_cast<uint32_t>(*p) - 0x35fdc00;
+      }
+      if (c < 0x80) {
+        buf.push_back(static_cast<char>(c));
+      } else if (c < 0x800) {
+        buf.push_back(static_cast<char>(0xc0 | (c >> 6)));
+        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
+      } else if ((c >= 0x800 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xffff)) {
+        buf.push_back(static_cast<char>(0xe0 | (c >> 12)));
+        buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
+        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
+      } else if (c >= 0x10000 && c <= 0x10ffff) {
+        buf.push_back(static_cast<char>(0xf0 | (c >> 18)));
+        buf.push_back(static_cast<char>(0x80 | ((c & 0x3ffff) >> 12)));
+        buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
+        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
+      } else {
+        FMT_THROW(format_error("failed to format time"));
+      }
+    }
+    return copy_str<char>(buf.data(), buf.data() + buf.size(), out);
+  }
+  return copy_str<char>(in.data(), in.data() + in.size(), out);
+}
+
+template <typename Char, typename OutputIt,
+          FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+auto write_tm_str(OutputIt out, string_view sv, const std::locale& loc)
+    -> OutputIt {
+  codecvt_result<Char> unit;
+  write_codecvt(unit, sv, loc);
+  return copy_str<Char>(unit.buf, unit.end, out);
+}
+
+template <typename Char, typename OutputIt,
+          FMT_ENABLE_IF(std::is_same<Char, char>::value)>
+auto write_tm_str(OutputIt out, string_view sv, const std::locale& loc)
+    -> OutputIt {
+  return write_encoded_tm_str(out, sv, loc);
+}
+
+template <typename Char>
+inline void do_write(buffer<Char>& buf, const std::tm& time,
+                     const std::locale& loc, char format, char modifier) {
+  auto&& format_buf = formatbuf<std::basic_streambuf<Char>>(buf);
+  auto&& os = std::basic_ostream<Char>(&format_buf);
+  os.imbue(loc);
+  using iterator = std::ostreambuf_iterator<Char>;
+  const auto& facet = std::use_facet<std::time_put<Char, iterator>>(loc);
+  auto end = facet.put(os, os, Char(' '), &time, format, modifier);
+  if (end.failed()) FMT_THROW(format_error("failed to format time"));
+}
+
+template <typename Char, typename OutputIt,
+          FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+auto write(OutputIt out, const std::tm& time, const std::locale& loc,
+           char format, char modifier = 0) -> OutputIt {
+  auto&& buf = get_buffer<Char>(out);
+  do_write<Char>(buf, time, loc, format, modifier);
+  return buf.out();
+}
+
+template <typename Char, typename OutputIt,
+          FMT_ENABLE_IF(std::is_same<Char, char>::value)>
+auto write(OutputIt out, const std::tm& time, const std::locale& loc,
+           char format, char modifier = 0) -> OutputIt {
+  auto&& buf = basic_memory_buffer<Char>();
+  do_write<char>(buf, time, loc, format, modifier);
+  return write_encoded_tm_str(out, string_view(buf.data(), buf.size()), loc);
+}
+
 }  // namespace detail
 
-// Thread-safe replacement for std::localtime
+FMT_MODULE_EXPORT_BEGIN
+
+/**
+  Converts given time since epoch as ``std::time_t`` value into calendar time,
+  expressed in local time. Unlike ``std::localtime``, this function is
+  thread-safe on most platforms.
+ */
 inline std::tm localtime(std::time_t time) {
   struct dispatcher {
     std::time_t time_;
@@ -330,7 +488,11 @@ inline std::tm localtime(
   return localtime(std::chrono::system_clock::to_time_t(time_point));
 }
 
-// Thread-safe replacement for std::gmtime
+/**
+  Converts given time since epoch as ``std::time_t`` value into calendar time,
+  expressed in Coordinated Universal Time (UTC). Unlike ``std::gmtime``, this
+  function is thread-safe on most platforms.
+ */
 inline std::tm gmtime(std::time_t time) {
   struct dispatcher {
     std::time_t time_;
@@ -371,96 +533,56 @@ inline std::tm gmtime(
   return gmtime(std::chrono::system_clock::to_time_t(time_point));
 }
 
-namespace detail {
-inline size_t strftime(char* str, size_t count, const char* format,
-                       const std::tm* time) {
-  return std::strftime(str, count, format, time);
+FMT_BEGIN_DETAIL_NAMESPACE
+
+// Writes two-digit numbers a, b and c separated by sep to buf.
+// The method by Pavel Novikov based on
+// https://johnnylee-sde.github.io/Fast-unsigned-integer-to-time-string/.
+inline void write_digit2_separated(char* buf, unsigned a, unsigned b,
+                                   unsigned c, char sep) {
+  unsigned long long digits =
+      a | (b << 24) | (static_cast<unsigned long long>(c) << 48);
+  // Convert each value to BCD.
+  // We have x = a * 10 + b and we want to convert it to BCD y = a * 16 + b.
+  // The difference is
+  //   y - x = a * 6
+  // a can be found from x:
+  //   a = floor(x / 10)
+  // then
+  //   y = x + a * 6 = x + floor(x / 10) * 6
+  // floor(x / 10) is (x * 205) >> 11 (needs 16 bits).
+  digits += (((digits * 205) >> 11) & 0x000f00000f00000f) * 6;
+  // Put low nibbles to high bytes and high nibbles to low bytes.
+  digits = ((digits & 0x00f00000f00000f0) >> 4) |
+           ((digits & 0x000f00000f00000f) << 8);
+  auto usep = static_cast<unsigned long long>(sep);
+  // Add ASCII '0' to each digit byte and insert separators.
+  digits |= 0x3030003030003030 | (usep << 16) | (usep << 40);
+  memcpy(buf, &digits, 8);
 }
 
-inline size_t strftime(wchar_t* str, size_t count, const wchar_t* format,
-                       const std::tm* time) {
-  return std::wcsftime(str, count, format, time);
-}
-}  // namespace detail
-
-template <typename Char>
-struct formatter<std::chrono::time_point<std::chrono::system_clock>, Char>
-    : formatter<std::tm, Char> {
-  template <typename FormatContext>
-  auto format(std::chrono::time_point<std::chrono::system_clock> val,
-              FormatContext& ctx) -> decltype(ctx.out()) {
-    std::tm time = localtime(val);
-    return formatter<std::tm, Char>::format(time, ctx);
-  }
-};
-
-template <typename Char> struct formatter<std::tm, Char> {
-  template <typename ParseContext>
-  auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    auto it = ctx.begin();
-    if (it != ctx.end() && *it == ':') ++it;
-    auto end = it;
-    while (end != ctx.end() && *end != '}') ++end;
-    tm_format.reserve(detail::to_unsigned(end - it + 1));
-    tm_format.append(it, end);
-    tm_format.push_back('\0');
-    return end;
-  }
-
-  template <typename FormatContext>
-  auto format(const std::tm& tm, FormatContext& ctx) -> decltype(ctx.out()) {
-    basic_memory_buffer<Char> buf;
-    size_t start = buf.size();
-    for (;;) {
-      size_t size = buf.capacity() - start;
-      size_t count = detail::strftime(&buf[start], size, &tm_format[0], &tm);
-      if (count != 0) {
-        buf.resize(start + count);
-        break;
-      }
-      if (size >= tm_format.size() * 256) {
-        // If the buffer is 256 times larger than the format string, assume
-        // that `strftime` gives an empty result. There doesn't seem to be a
-        // better way to distinguish the two cases:
-        // https://github.com/fmtlib/fmt/issues/367
-        break;
-      }
-      const size_t MIN_GROWTH = 10;
-      buf.reserve(buf.capacity() + (size > MIN_GROWTH ? size : MIN_GROWTH));
-    }
-    return std::copy(buf.begin(), buf.end(), ctx.out());
-  }
-
-  basic_memory_buffer<Char> tm_format;
-};
-
-namespace detail {
-template <typename Period> FMT_CONSTEXPR const char* get_units() {
+template <typename Period> FMT_CONSTEXPR inline const char* get_units() {
+  if (std::is_same<Period, std::atto>::value) return "as";
+  if (std::is_same<Period, std::femto>::value) return "fs";
+  if (std::is_same<Period, std::pico>::value) return "ps";
+  if (std::is_same<Period, std::nano>::value) return "ns";
+  if (std::is_same<Period, std::micro>::value) return "µs";
+  if (std::is_same<Period, std::milli>::value) return "ms";
+  if (std::is_same<Period, std::centi>::value) return "cs";
+  if (std::is_same<Period, std::deci>::value) return "ds";
+  if (std::is_same<Period, std::ratio<1>>::value) return "s";
+  if (std::is_same<Period, std::deca>::value) return "das";
+  if (std::is_same<Period, std::hecto>::value) return "hs";
+  if (std::is_same<Period, std::kilo>::value) return "ks";
+  if (std::is_same<Period, std::mega>::value) return "Ms";
+  if (std::is_same<Period, std::giga>::value) return "Gs";
+  if (std::is_same<Period, std::tera>::value) return "Ts";
+  if (std::is_same<Period, std::peta>::value) return "Ps";
+  if (std::is_same<Period, std::exa>::value) return "Es";
+  if (std::is_same<Period, std::ratio<60>>::value) return "m";
+  if (std::is_same<Period, std::ratio<3600>>::value) return "h";
   return nullptr;
 }
-template <> FMT_CONSTEXPR const char* get_units<std::atto>() { return "as"; }
-template <> FMT_CONSTEXPR const char* get_units<std::femto>() { return "fs"; }
-template <> FMT_CONSTEXPR const char* get_units<std::pico>() { return "ps"; }
-template <> FMT_CONSTEXPR const char* get_units<std::nano>() { return "ns"; }
-template <> FMT_CONSTEXPR const char* get_units<std::micro>() { return "µs"; }
-template <> FMT_CONSTEXPR const char* get_units<std::milli>() { return "ms"; }
-template <> FMT_CONSTEXPR const char* get_units<std::centi>() { return "cs"; }
-template <> FMT_CONSTEXPR const char* get_units<std::deci>() { return "ds"; }
-template <> FMT_CONSTEXPR const char* get_units<std::ratio<1>>() { return "s"; }
-template <> FMT_CONSTEXPR const char* get_units<std::deca>() { return "das"; }
-template <> FMT_CONSTEXPR const char* get_units<std::hecto>() { return "hs"; }
-template <> FMT_CONSTEXPR const char* get_units<std::kilo>() { return "ks"; }
-template <> FMT_CONSTEXPR const char* get_units<std::mega>() { return "Ms"; }
-template <> FMT_CONSTEXPR const char* get_units<std::giga>() { return "Gs"; }
-template <> FMT_CONSTEXPR const char* get_units<std::tera>() { return "Ts"; }
-template <> FMT_CONSTEXPR const char* get_units<std::peta>() { return "Ps"; }
-template <> FMT_CONSTEXPR const char* get_units<std::exa>() { return "Es"; }
-template <> FMT_CONSTEXPR const char* get_units<std::ratio<60>>() {
-  return "m";
-}
-template <> FMT_CONSTEXPR const char* get_units<std::ratio<3600>>() {
-  return "h";
-}
 
 enum class numeric_system {
   standard,
@@ -499,6 +621,22 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
       handler.on_text(tab, tab + 1);
       break;
     }
+    // Year:
+    case 'Y':
+      handler.on_year(numeric_system::standard);
+      break;
+    case 'y':
+      handler.on_short_year(numeric_system::standard);
+      break;
+    case 'C':
+      handler.on_century(numeric_system::standard);
+      break;
+    case 'G':
+      handler.on_iso_week_based_year();
+      break;
+    case 'g':
+      handler.on_iso_week_based_short_year();
+      break;
     // Day of the week:
     case 'a':
       handler.on_abbr_weekday();
@@ -514,11 +652,34 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
       break;
     // Month:
     case 'b':
+    case 'h':
       handler.on_abbr_month();
       break;
     case 'B':
       handler.on_full_month();
       break;
+    case 'm':
+      handler.on_dec_month(numeric_system::standard);
+      break;
+    // Day of the year/month:
+    case 'U':
+      handler.on_dec0_week_of_year(numeric_system::standard);
+      break;
+    case 'W':
+      handler.on_dec1_week_of_year(numeric_system::standard);
+      break;
+    case 'V':
+      handler.on_iso_week_of_year(numeric_system::standard);
+      break;
+    case 'j':
+      handler.on_day_of_year();
+      break;
+    case 'd':
+      handler.on_day_of_month(numeric_system::standard);
+      break;
+    case 'e':
+      handler.on_day_of_month_space(numeric_system::standard);
+      break;
     // Hour, minute, second:
     case 'H':
       handler.on_24_hour(numeric_system::standard);
@@ -577,6 +738,15 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
       if (ptr == end) FMT_THROW(format_error("invalid format"));
       c = *ptr++;
       switch (c) {
+      case 'Y':
+        handler.on_year(numeric_system::alternative);
+        break;
+      case 'y':
+        handler.on_offset_year();
+        break;
+      case 'C':
+        handler.on_century(numeric_system::alternative);
+        break;
       case 'c':
         handler.on_datetime(numeric_system::alternative);
         break;
@@ -595,6 +765,27 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
       if (ptr == end) FMT_THROW(format_error("invalid format"));
       c = *ptr++;
       switch (c) {
+      case 'y':
+        handler.on_short_year(numeric_system::alternative);
+        break;
+      case 'm':
+        handler.on_dec_month(numeric_system::alternative);
+        break;
+      case 'U':
+        handler.on_dec0_week_of_year(numeric_system::alternative);
+        break;
+      case 'W':
+        handler.on_dec1_week_of_year(numeric_system::alternative);
+        break;
+      case 'V':
+        handler.on_iso_week_of_year(numeric_system::alternative);
+        break;
+      case 'd':
+        handler.on_day_of_month(numeric_system::alternative);
+        break;
+      case 'e':
+        handler.on_day_of_month_space(numeric_system::alternative);
+        break;
       case 'w':
         handler.on_dec0_weekday(numeric_system::alternative);
         break;
@@ -626,33 +817,566 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
   return ptr;
 }
 
-struct chrono_format_checker {
-  FMT_NORETURN void report_no_date() { FMT_THROW(format_error("no date")); }
+template <typename Derived> struct null_chrono_spec_handler {
+  FMT_CONSTEXPR void unsupported() {
+    static_cast<Derived*>(this)->unsupported();
+  }
+  FMT_CONSTEXPR void on_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_short_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_offset_year() { unsupported(); }
+  FMT_CONSTEXPR void on_century(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_iso_week_based_year() { unsupported(); }
+  FMT_CONSTEXPR void on_iso_week_based_short_year() { unsupported(); }
+  FMT_CONSTEXPR void on_abbr_weekday() { unsupported(); }
+  FMT_CONSTEXPR void on_full_weekday() { unsupported(); }
+  FMT_CONSTEXPR void on_dec0_weekday(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_dec1_weekday(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_abbr_month() { unsupported(); }
+  FMT_CONSTEXPR void on_full_month() { unsupported(); }
+  FMT_CONSTEXPR void on_dec_month(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_day_of_year() { unsupported(); }
+  FMT_CONSTEXPR void on_day_of_month(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_day_of_month_space(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_24_hour(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_12_hour(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_minute(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_second(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_datetime(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_loc_date(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_loc_time(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_us_date() { unsupported(); }
+  FMT_CONSTEXPR void on_iso_date() { unsupported(); }
+  FMT_CONSTEXPR void on_12_hour_time() { unsupported(); }
+  FMT_CONSTEXPR void on_24_hour_time() { unsupported(); }
+  FMT_CONSTEXPR void on_iso_time() { unsupported(); }
+  FMT_CONSTEXPR void on_am_pm() { unsupported(); }
+  FMT_CONSTEXPR void on_duration_value() { unsupported(); }
+  FMT_CONSTEXPR void on_duration_unit() { unsupported(); }
+  FMT_CONSTEXPR void on_utc_offset() { unsupported(); }
+  FMT_CONSTEXPR void on_tz_name() { unsupported(); }
+};
 
-  template <typename Char> void on_text(const Char*, const Char*) {}
-  FMT_NORETURN void on_abbr_weekday() { report_no_date(); }
-  FMT_NORETURN void on_full_weekday() { report_no_date(); }
-  FMT_NORETURN void on_dec0_weekday(numeric_system) { report_no_date(); }
-  FMT_NORETURN void on_dec1_weekday(numeric_system) { report_no_date(); }
-  FMT_NORETURN void on_abbr_month() { report_no_date(); }
-  FMT_NORETURN void on_full_month() { report_no_date(); }
-  void on_24_hour(numeric_system) {}
-  void on_12_hour(numeric_system) {}
-  void on_minute(numeric_system) {}
-  void on_second(numeric_system) {}
-  FMT_NORETURN void on_datetime(numeric_system) { report_no_date(); }
-  FMT_NORETURN void on_loc_date(numeric_system) { report_no_date(); }
-  FMT_NORETURN void on_loc_time(numeric_system) { report_no_date(); }
-  FMT_NORETURN void on_us_date() { report_no_date(); }
-  FMT_NORETURN void on_iso_date() { report_no_date(); }
-  void on_12_hour_time() {}
-  void on_24_hour_time() {}
-  void on_iso_time() {}
-  void on_am_pm() {}
+struct tm_format_checker : null_chrono_spec_handler<tm_format_checker> {
+  FMT_NORETURN void unsupported() { FMT_THROW(format_error("no format")); }
+
+  template <typename Char>
+  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
+  FMT_CONSTEXPR void on_year(numeric_system) {}
+  FMT_CONSTEXPR void on_short_year(numeric_system) {}
+  FMT_CONSTEXPR void on_offset_year() {}
+  FMT_CONSTEXPR void on_century(numeric_system) {}
+  FMT_CONSTEXPR void on_iso_week_based_year() {}
+  FMT_CONSTEXPR void on_iso_week_based_short_year() {}
+  FMT_CONSTEXPR void on_abbr_weekday() {}
+  FMT_CONSTEXPR void on_full_weekday() {}
+  FMT_CONSTEXPR void on_dec0_weekday(numeric_system) {}
+  FMT_CONSTEXPR void on_dec1_weekday(numeric_system) {}
+  FMT_CONSTEXPR void on_abbr_month() {}
+  FMT_CONSTEXPR void on_full_month() {}
+  FMT_CONSTEXPR void on_dec_month(numeric_system) {}
+  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system) {}
+  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system) {}
+  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system) {}
+  FMT_CONSTEXPR void on_day_of_year() {}
+  FMT_CONSTEXPR void on_day_of_month(numeric_system) {}
+  FMT_CONSTEXPR void on_day_of_month_space(numeric_system) {}
+  FMT_CONSTEXPR void on_24_hour(numeric_system) {}
+  FMT_CONSTEXPR void on_12_hour(numeric_system) {}
+  FMT_CONSTEXPR void on_minute(numeric_system) {}
+  FMT_CONSTEXPR void on_second(numeric_system) {}
+  FMT_CONSTEXPR void on_datetime(numeric_system) {}
+  FMT_CONSTEXPR void on_loc_date(numeric_system) {}
+  FMT_CONSTEXPR void on_loc_time(numeric_system) {}
+  FMT_CONSTEXPR void on_us_date() {}
+  FMT_CONSTEXPR void on_iso_date() {}
+  FMT_CONSTEXPR void on_12_hour_time() {}
+  FMT_CONSTEXPR void on_24_hour_time() {}
+  FMT_CONSTEXPR void on_iso_time() {}
+  FMT_CONSTEXPR void on_am_pm() {}
+  FMT_CONSTEXPR void on_utc_offset() {}
+  FMT_CONSTEXPR void on_tz_name() {}
+};
+
+inline const char* tm_wday_full_name(int wday) {
+  static constexpr const char* full_name_list[] = {
+      "Sunday",   "Monday", "Tuesday", "Wednesday",
+      "Thursday", "Friday", "Saturday"};
+  return wday >= 0 && wday <= 6 ? full_name_list[wday] : "?";
+}
+inline const char* tm_wday_short_name(int wday) {
+  static constexpr const char* short_name_list[] = {"Sun", "Mon", "Tue", "Wed",
+                                                    "Thu", "Fri", "Sat"};
+  return wday >= 0 && wday <= 6 ? short_name_list[wday] : "???";
+}
+
+inline const char* tm_mon_full_name(int mon) {
+  static constexpr const char* full_name_list[] = {
+      "January", "February", "March",     "April",   "May",      "June",
+      "July",    "August",   "September", "October", "November", "December"};
+  return mon >= 0 && mon <= 11 ? full_name_list[mon] : "?";
+}
+inline const char* tm_mon_short_name(int mon) {
+  static constexpr const char* short_name_list[] = {
+      "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+  };
+  return mon >= 0 && mon <= 11 ? short_name_list[mon] : "???";
+}
+
+template <typename T, typename = void>
+struct has_member_data_tm_gmtoff : std::false_type {};
+template <typename T>
+struct has_member_data_tm_gmtoff<T, void_t<decltype(T::tm_gmtoff)>>
+    : std::true_type {};
+
+template <typename T, typename = void>
+struct has_member_data_tm_zone : std::false_type {};
+template <typename T>
+struct has_member_data_tm_zone<T, void_t<decltype(T::tm_zone)>>
+    : std::true_type {};
+
+#if FMT_USE_TZSET
+inline void tzset_once() {
+  static bool init = []() -> bool {
+    _tzset();
+    return true;
+  }();
+  ignore_unused(init);
+}
+#endif
+
+template <typename OutputIt, typename Char> class tm_writer {
+ private:
+  static constexpr int days_per_week = 7;
+
+  const std::locale& loc_;
+  const bool is_classic_;
+  OutputIt out_;
+  const std::tm& tm_;
+
+  auto tm_sec() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_sec >= 0 && tm_.tm_sec <= 61, "");
+    return tm_.tm_sec;
+  }
+  auto tm_min() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_min >= 0 && tm_.tm_min <= 59, "");
+    return tm_.tm_min;
+  }
+  auto tm_hour() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_hour >= 0 && tm_.tm_hour <= 23, "");
+    return tm_.tm_hour;
+  }
+  auto tm_mday() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_mday >= 1 && tm_.tm_mday <= 31, "");
+    return tm_.tm_mday;
+  }
+  auto tm_mon() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_mon >= 0 && tm_.tm_mon <= 11, "");
+    return tm_.tm_mon;
+  }
+  auto tm_year() const noexcept -> long long { return 1900ll + tm_.tm_year; }
+  auto tm_wday() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_wday >= 0 && tm_.tm_wday <= 6, "");
+    return tm_.tm_wday;
+  }
+  auto tm_yday() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_yday >= 0 && tm_.tm_yday <= 365, "");
+    return tm_.tm_yday;
+  }
+
+  auto tm_hour12() const noexcept -> int {
+    const auto h = tm_hour();
+    const auto z = h < 12 ? h : h - 12;
+    return z == 0 ? 12 : z;
+  }
+
+  // POSIX and the C Standard are unclear or inconsistent about what %C and %y
+  // do if the year is negative or exceeds 9999. Use the convention that %C
+  // concatenated with %y yields the same output as %Y, and that %Y contains at
+  // least 4 characters, with more only if necessary.
+  auto split_year_lower(long long year) const noexcept -> int {
+    auto l = year % 100;
+    if (l < 0) l = -l;  // l in [0, 99]
+    return static_cast<int>(l);
+  }
+
+  // Algorithm:
+  // https://en.wikipedia.org/wiki/ISO_week_date#Calculating_the_week_number_from_a_month_and_day_of_the_month_or_ordinal_date
+  auto iso_year_weeks(long long curr_year) const noexcept -> int {
+    const auto prev_year = curr_year - 1;
+    const auto curr_p =
+        (curr_year + curr_year / 4 - curr_year / 100 + curr_year / 400) %
+        days_per_week;
+    const auto prev_p =
+        (prev_year + prev_year / 4 - prev_year / 100 + prev_year / 400) %
+        days_per_week;
+    return 52 + ((curr_p == 4 || prev_p == 3) ? 1 : 0);
+  }
+  auto iso_week_num(int tm_yday, int tm_wday) const noexcept -> int {
+    return (tm_yday + 11 - (tm_wday == 0 ? days_per_week : tm_wday)) /
+           days_per_week;
+  }
+  auto tm_iso_week_year() const noexcept -> long long {
+    const auto year = tm_year();
+    const auto w = iso_week_num(tm_yday(), tm_wday());
+    if (w < 1) return year - 1;
+    if (w > iso_year_weeks(year)) return year + 1;
+    return year;
+  }
+  auto tm_iso_week_of_year() const noexcept -> int {
+    const auto year = tm_year();
+    const auto w = iso_week_num(tm_yday(), tm_wday());
+    if (w < 1) return iso_year_weeks(year - 1);
+    if (w > iso_year_weeks(year)) return 1;
+    return w;
+  }
+
+  void write1(int value) {
+    *out_++ = static_cast<char>('0' + to_unsigned(value) % 10);
+  }
+  void write2(int value) {
+    const char* d = digits2(to_unsigned(value) % 100);
+    *out_++ = *d++;
+    *out_++ = *d;
+  }
+
+  void write_year_extended(long long year) {
+    // At least 4 characters.
+    int width = 4;
+    if (year < 0) {
+      *out_++ = '-';
+      year = 0 - year;
+      --width;
+    }
+    uint32_or_64_or_128_t<long long> n = to_unsigned(year);
+    const int num_digits = count_digits(n);
+    if (width > num_digits) out_ = std::fill_n(out_, width - num_digits, '0');
+    out_ = format_decimal<Char>(out_, n, num_digits).end;
+  }
+  void write_year(long long year) {
+    if (year >= 0 && year < 10000) {
+      write2(static_cast<int>(year / 100));
+      write2(static_cast<int>(year % 100));
+    } else {
+      write_year_extended(year);
+    }
+  }
+
+  void write_utc_offset(long offset) {
+    if (offset < 0) {
+      *out_++ = '-';
+      offset = -offset;
+    } else {
+      *out_++ = '+';
+    }
+    offset /= 60;
+    write2(static_cast<int>(offset / 60));
+    write2(static_cast<int>(offset % 60));
+  }
+  template <typename T, FMT_ENABLE_IF(has_member_data_tm_gmtoff<T>::value)>
+  void format_utc_offset_impl(const T& tm) {
+    write_utc_offset(tm.tm_gmtoff);
+  }
+  template <typename T, FMT_ENABLE_IF(!has_member_data_tm_gmtoff<T>::value)>
+  void format_utc_offset_impl(const T& tm) {
+#if defined(_WIN32) && defined(_UCRT)
+#  if FMT_USE_TZSET
+    tzset_once();
+#  endif
+    long offset = 0;
+    _get_timezone(&offset);
+    if (tm.tm_isdst) {
+      long dstbias = 0;
+      _get_dstbias(&dstbias);
+      offset += dstbias;
+    }
+    write_utc_offset(-offset);
+#else
+    ignore_unused(tm);
+    format_localized('z');
+#endif
+  }
+
+  template <typename T, FMT_ENABLE_IF(has_member_data_tm_zone<T>::value)>
+  void format_tz_name_impl(const T& tm) {
+    if (is_classic_)
+      out_ = write_tm_str<Char>(out_, tm.tm_zone, loc_);
+    else
+      format_localized('Z');
+  }
+  template <typename T, FMT_ENABLE_IF(!has_member_data_tm_zone<T>::value)>
+  void format_tz_name_impl(const T&) {
+    format_localized('Z');
+  }
+
+  void format_localized(char format, char modifier = 0) {
+    out_ = write<Char>(out_, tm_, loc_, format, modifier);
+  }
+
+ public:
+  tm_writer(const std::locale& loc, OutputIt out, const std::tm& tm)
+      : loc_(loc),
+        is_classic_(loc_ == get_classic_locale()),
+        out_(out),
+        tm_(tm) {}
+
+  OutputIt out() const { return out_; }
+
+  FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
+    out_ = copy_str<Char>(begin, end, out_);
+  }
+
+  void on_abbr_weekday() {
+    if (is_classic_)
+      out_ = write(out_, tm_wday_short_name(tm_wday()));
+    else
+      format_localized('a');
+  }
+  void on_full_weekday() {
+    if (is_classic_)
+      out_ = write(out_, tm_wday_full_name(tm_wday()));
+    else
+      format_localized('A');
+  }
+  void on_dec0_weekday(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) return write1(tm_wday());
+    format_localized('w', 'O');
+  }
+  void on_dec1_weekday(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) {
+      auto wday = tm_wday();
+      write1(wday == 0 ? days_per_week : wday);
+    } else {
+      format_localized('u', 'O');
+    }
+  }
+
+  void on_abbr_month() {
+    if (is_classic_)
+      out_ = write(out_, tm_mon_short_name(tm_mon()));
+    else
+      format_localized('b');
+  }
+  void on_full_month() {
+    if (is_classic_)
+      out_ = write(out_, tm_mon_full_name(tm_mon()));
+    else
+      format_localized('B');
+  }
+
+  void on_datetime(numeric_system ns) {
+    if (is_classic_) {
+      on_abbr_weekday();
+      *out_++ = ' ';
+      on_abbr_month();
+      *out_++ = ' ';
+      on_day_of_month_space(numeric_system::standard);
+      *out_++ = ' ';
+      on_iso_time();
+      *out_++ = ' ';
+      on_year(numeric_system::standard);
+    } else {
+      format_localized('c', ns == numeric_system::standard ? '\0' : 'E');
+    }
+  }
+  void on_loc_date(numeric_system ns) {
+    if (is_classic_)
+      on_us_date();
+    else
+      format_localized('x', ns == numeric_system::standard ? '\0' : 'E');
+  }
+  void on_loc_time(numeric_system ns) {
+    if (is_classic_)
+      on_iso_time();
+    else
+      format_localized('X', ns == numeric_system::standard ? '\0' : 'E');
+  }
+  void on_us_date() {
+    char buf[8];
+    write_digit2_separated(buf, to_unsigned(tm_mon() + 1),
+                           to_unsigned(tm_mday()),
+                           to_unsigned(split_year_lower(tm_year())), '/');
+    out_ = copy_str<Char>(std::begin(buf), std::end(buf), out_);
+  }
+  void on_iso_date() {
+    auto year = tm_year();
+    char buf[10];
+    size_t offset = 0;
+    if (year >= 0 && year < 10000) {
+      copy2(buf, digits2(to_unsigned(year / 100)));
+    } else {
+      offset = 4;
+      write_year_extended(year);
+      year = 0;
+    }
+    write_digit2_separated(buf + 2, static_cast<unsigned>(year % 100),
+                           to_unsigned(tm_mon() + 1), to_unsigned(tm_mday()),
+                           '-');
+    out_ = copy_str<Char>(std::begin(buf) + offset, std::end(buf), out_);
+  }
+
+  void on_utc_offset() { format_utc_offset_impl(tm_); }
+  void on_tz_name() { format_tz_name_impl(tm_); }
+
+  void on_year(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write_year(tm_year());
+    format_localized('Y', 'E');
+  }
+  void on_short_year(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(split_year_lower(tm_year()));
+    format_localized('y', 'O');
+  }
+  void on_offset_year() {
+    if (is_classic_) return write2(split_year_lower(tm_year()));
+    format_localized('y', 'E');
+  }
+
+  void on_century(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) {
+      auto year = tm_year();
+      auto upper = year / 100;
+      if (year >= -99 && year < 0) {
+        // Zero upper on negative year.
+        *out_++ = '-';
+        *out_++ = '0';
+      } else if (upper >= 0 && upper < 100) {
+        write2(static_cast<int>(upper));
+      } else {
+        out_ = write<Char>(out_, upper);
+      }
+    } else {
+      format_localized('C', 'E');
+    }
+  }
+
+  void on_dec_month(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_mon() + 1);
+    format_localized('m', 'O');
+  }
+
+  void on_dec0_week_of_year(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2((tm_yday() + days_per_week - tm_wday()) / days_per_week);
+    format_localized('U', 'O');
+  }
+  void on_dec1_week_of_year(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) {
+      auto wday = tm_wday();
+      write2((tm_yday() + days_per_week -
+              (wday == 0 ? (days_per_week - 1) : (wday - 1))) /
+             days_per_week);
+    } else {
+      format_localized('W', 'O');
+    }
+  }
+  void on_iso_week_of_year(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_iso_week_of_year());
+    format_localized('V', 'O');
+  }
+
+  void on_iso_week_based_year() { write_year(tm_iso_week_year()); }
+  void on_iso_week_based_short_year() {
+    write2(split_year_lower(tm_iso_week_year()));
+  }
+
+  void on_day_of_year() {
+    auto yday = tm_yday() + 1;
+    write1(yday / 100);
+    write2(yday % 100);
+  }
+  void on_day_of_month(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) return write2(tm_mday());
+    format_localized('d', 'O');
+  }
+  void on_day_of_month_space(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) {
+      auto mday = to_unsigned(tm_mday()) % 100;
+      const char* d2 = digits2(mday);
+      *out_++ = mday < 10 ? ' ' : d2[0];
+      *out_++ = d2[1];
+    } else {
+      format_localized('e', 'O');
+    }
+  }
+
+  void on_24_hour(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) return write2(tm_hour());
+    format_localized('H', 'O');
+  }
+  void on_12_hour(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_hour12());
+    format_localized('I', 'O');
+  }
+  void on_minute(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) return write2(tm_min());
+    format_localized('M', 'O');
+  }
+  void on_second(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) return write2(tm_sec());
+    format_localized('S', 'O');
+  }
+
+  void on_12_hour_time() {
+    if (is_classic_) {
+      char buf[8];
+      write_digit2_separated(buf, to_unsigned(tm_hour12()),
+                             to_unsigned(tm_min()), to_unsigned(tm_sec()), ':');
+      out_ = copy_str<Char>(std::begin(buf), std::end(buf), out_);
+      *out_++ = ' ';
+      on_am_pm();
+    } else {
+      format_localized('r');
+    }
+  }
+  void on_24_hour_time() {
+    write2(tm_hour());
+    *out_++ = ':';
+    write2(tm_min());
+  }
+  void on_iso_time() {
+    char buf[8];
+    write_digit2_separated(buf, to_unsigned(tm_hour()), to_unsigned(tm_min()),
+                           to_unsigned(tm_sec()), ':');
+    out_ = copy_str<Char>(std::begin(buf), std::end(buf), out_);
+  }
+
+  void on_am_pm() {
+    if (is_classic_) {
+      *out_++ = tm_hour() < 12 ? 'A' : 'P';
+      *out_++ = 'M';
+    } else {
+      format_localized('p');
+    }
+  }
+
+  // These apply to chrono durations but not tm.
   void on_duration_value() {}
   void on_duration_unit() {}
-  FMT_NORETURN void on_utc_offset() { report_no_date(); }
-  FMT_NORETURN void on_tz_name() { report_no_date(); }
+};
+
+struct chrono_format_checker : null_chrono_spec_handler<chrono_format_checker> {
+  FMT_NORETURN void unsupported() { FMT_THROW(format_error("no date")); }
+
+  template <typename Char>
+  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
+  FMT_CONSTEXPR void on_24_hour(numeric_system) {}
+  FMT_CONSTEXPR void on_12_hour(numeric_system) {}
+  FMT_CONSTEXPR void on_minute(numeric_system) {}
+  FMT_CONSTEXPR void on_second(numeric_system) {}
+  FMT_CONSTEXPR void on_12_hour_time() {}
+  FMT_CONSTEXPR void on_24_hour_time() {}
+  FMT_CONSTEXPR void on_iso_time() {}
+  FMT_CONSTEXPR void on_am_pm() {}
+  FMT_CONSTEXPR void on_duration_value() {}
+  FMT_CONSTEXPR void on_duration_unit() {}
 };
 
 template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
@@ -668,25 +1392,20 @@ template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
 inline bool isfinite(T) {
   return true;
 }
-template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
-inline bool isfinite(T value) {
-  return std::isfinite(value);
-}
 
-// Converts value to int and checks that it's in the range [0, upper).
-template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-inline int to_nonnegative_int(T value, int upper) {
-  FMT_ASSERT(value >= 0 && value <= upper, "invalid value");
+// Converts value to Int and checks that it's in the range [0, upper).
+template <typename T, typename Int, FMT_ENABLE_IF(std::is_integral<T>::value)>
+inline Int to_nonnegative_int(T value, Int upper) {
+  FMT_ASSERT(value >= 0 && to_unsigned(value) <= to_unsigned(upper),
+             "invalid value");
   (void)upper;
-  return static_cast<int>(value);
+  return static_cast<Int>(value);
 }
-template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
-inline int to_nonnegative_int(T value, int upper) {
-  FMT_ASSERT(
-      std::isnan(value) || (value >= 0 && value <= static_cast<T>(upper)),
-      "invalid value");
-  (void)upper;
-  return static_cast<int>(value);
+template <typename T, typename Int, FMT_ENABLE_IF(!std::is_integral<T>::value)>
+inline Int to_nonnegative_int(T value, Int upper) {
+  if (value < 0 || value > static_cast<T>(upper))
+    FMT_THROW(format_error("invalid value"));
+  return static_cast<Int>(value);
 }
 
 template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
@@ -743,26 +1462,55 @@ inline std::chrono::duration<Rep, std::milli> get_milliseconds(
 #endif
 }
 
-template <typename Rep, typename Period,
-          FMT_ENABLE_IF(std::is_floating_point<Rep>::value)>
-inline std::chrono::duration<Rep, std::milli> get_milliseconds(
-    std::chrono::duration<Rep, Period> d) {
-  using common_type = typename std::common_type<Rep, std::intmax_t>::type;
-  auto ms = mod(d.count() * static_cast<common_type>(Period::num) /
-                    static_cast<common_type>(Period::den) * 1000,
-                1000);
-  return std::chrono::duration<Rep, std::milli>(static_cast<Rep>(ms));
+// Returns the number of fractional digits in the range [0, 18] according to the
+// C++20 spec. If more than 18 fractional digits are required then returns 6 for
+// microseconds precision.
+constexpr int count_fractional_digits(long long num, long long den, int n = 0) {
+  return num % den == 0
+             ? n
+             : (n > 18 ? 6 : count_fractional_digits(num * 10, den, n + 1));
 }
 
-template <typename Char, typename Rep, typename OutputIt>
-OutputIt format_duration_value(OutputIt out, Rep val, int precision) {
-  const Char pr_f[] = {'{', ':', '.', '{', '}', 'f', '}', 0};
-  if (precision >= 0) return format_to(out, pr_f, val, precision);
-  const Char fp_f[] = {'{', ':', 'g', '}', 0};
-  const Char format[] = {'{', '}', 0};
-  return format_to(out, std::is_floating_point<Rep>::value ? fp_f : format,
-                   val);
+constexpr long long pow10(std::uint32_t n) {
+  return n == 0 ? 1 : 10 * pow10(n - 1);
 }
+
+template <class Rep, class Period,
+          FMT_ENABLE_IF(std::numeric_limits<Rep>::is_signed)>
+constexpr std::chrono::duration<Rep, Period> abs(
+    std::chrono::duration<Rep, Period> d) {
+  // We need to compare the duration using the count() method directly
+  // due to a compiler bug in clang-11 regarding the spaceship operator,
+  // when -Wzero-as-null-pointer-constant is enabled.
+  // In clang-12 the bug has been fixed. See
+  // https://bugs.llvm.org/show_bug.cgi?id=46235 and the reproducible example:
+  // https://www.godbolt.org/z/Knbb5joYx.
+  return d.count() >= d.zero().count() ? d : -d;
+}
+
+template <class Rep, class Period,
+          FMT_ENABLE_IF(!std::numeric_limits<Rep>::is_signed)>
+constexpr std::chrono::duration<Rep, Period> abs(
+    std::chrono::duration<Rep, Period> d) {
+  return d;
+}
+
+template <typename Char, typename Rep, typename OutputIt,
+          FMT_ENABLE_IF(std::is_integral<Rep>::value)>
+OutputIt format_duration_value(OutputIt out, Rep val, int) {
+  return write<Char>(out, val);
+}
+
+template <typename Char, typename Rep, typename OutputIt,
+          FMT_ENABLE_IF(std::is_floating_point<Rep>::value)>
+OutputIt format_duration_value(OutputIt out, Rep val, int precision) {
+  auto specs = basic_format_specs<Char>();
+  specs.precision = precision;
+  specs.type = precision >= 0 ? presentation_type::fixed_lower
+                              : presentation_type::general_lower;
+  return write<Char>(out, val, specs);
+}
+
 template <typename Char, typename OutputIt>
 OutputIt copy_unit(string_view unit, OutputIt out, Char) {
   return std::copy(unit.begin(), unit.end(), out);
@@ -780,18 +1528,44 @@ template <typename Char, typename Period, typename OutputIt>
 OutputIt format_duration_unit(OutputIt out) {
   if (const char* unit = get_units<Period>())
     return copy_unit(string_view(unit), out, Char());
-  const Char num_f[] = {'[', '{', '}', ']', 's', 0};
-  if (const_check(Period::den == 1)) return format_to(out, num_f, Period::num);
-  const Char num_def_f[] = {'[', '{', '}', '/', '{', '}', ']', 's', 0};
-  return format_to(out, num_def_f, Period::num, Period::den);
+  *out++ = '[';
+  out = write<Char>(out, Period::num);
+  if (const_check(Period::den != 1)) {
+    *out++ = '/';
+    out = write<Char>(out, Period::den);
+  }
+  *out++ = ']';
+  *out++ = 's';
+  return out;
 }
 
+class get_locale {
+ private:
+  union {
+    std::locale locale_;
+  };
+  bool has_locale_ = false;
+
+ public:
+  get_locale(bool localized, locale_ref loc) : has_locale_(localized) {
+    if (localized)
+      ::new (&locale_) std::locale(loc.template get<std::locale>());
+  }
+  ~get_locale() {
+    if (has_locale_) locale_.~locale();
+  }
+  operator const std::locale&() const {
+    return has_locale_ ? locale_ : get_classic_locale();
+  }
+};
+
 template <typename FormatContext, typename OutputIt, typename Rep,
           typename Period>
 struct chrono_formatter {
   FormatContext& context;
   OutputIt out;
   int precision;
+  bool localized = false;
   // rep is unsigned to avoid overflow.
   using rep =
       conditional_t<std::is_integral<Rep>::value && sizeof(Rep) < sizeof(int),
@@ -803,9 +1577,10 @@ struct chrono_formatter {
   bool negative;
 
   using char_type = typename FormatContext::char_type;
+  using tm_writer_type = tm_writer<OutputIt, char_type>;
 
-  explicit chrono_formatter(FormatContext& ctx, OutputIt o,
-                            std::chrono::duration<Rep, Period> d)
+  chrono_formatter(FormatContext& ctx, OutputIt o,
+                   std::chrono::duration<Rep, Period> d)
       : context(ctx),
         out(o),
         val(static_cast<rep>(d.count())),
@@ -880,19 +1655,48 @@ struct chrono_formatter {
     out = format_decimal<char_type>(out, n, num_digits).end;
   }
 
+  template <class Duration> void write_fractional_seconds(Duration d) {
+    constexpr auto num_fractional_digits =
+        count_fractional_digits(Duration::period::num, Duration::period::den);
+
+    using subsecond_precision = std::chrono::duration<
+        typename std::common_type<typename Duration::rep,
+                                  std::chrono::seconds::rep>::type,
+        std::ratio<1, detail::pow10(num_fractional_digits)>>;
+    if (std::ratio_less<typename subsecond_precision::period,
+                        std::chrono::seconds::period>::value) {
+      *out++ = '.';
+      // Don't convert long double to integer seconds to avoid overflow.
+      using sec = conditional_t<
+          std::is_same<typename Duration::rep, long double>::value,
+          std::chrono::duration<long double>, std::chrono::seconds>;
+      auto fractional = detail::abs(d) - std::chrono::duration_cast<sec>(d);
+      const auto subseconds =
+          std::chrono::treat_as_floating_point<
+              typename subsecond_precision::rep>::value
+              ? fractional.count()
+              : std::chrono::duration_cast<subsecond_precision>(fractional)
+                    .count();
+      uint32_or_64_or_128_t<long long> n =
+          to_unsigned(to_nonnegative_int(subseconds, max_value<long long>()));
+      int num_digits = detail::count_digits(n);
+      if (num_fractional_digits > num_digits)
+        out = std::fill_n(out, num_fractional_digits - num_digits, '0');
+      out = format_decimal<char_type>(out, n, num_digits).end;
+    }
+  }
+
   void write_nan() { std::copy_n("nan", 3, out); }
   void write_pinf() { std::copy_n("inf", 3, out); }
   void write_ninf() { std::copy_n("-inf", 4, out); }
 
-  void format_localized(const tm& time, char format, char modifier = 0) {
+  template <typename Callback, typename... Args>
+  void format_tm(const tm& time, Callback cb, Args... args) {
     if (isnan(val)) return write_nan();
-    auto locale = context.locale().template get<std::locale>();
-    auto& facet = std::use_facet<std::time_put<char_type>>(locale);
-    std::basic_ostringstream<char_type> os;
-    os.imbue(locale);
-    facet.put(os, os, ' ', &time, format, modifier);
-    auto str = os.str();
-    std::copy(str.begin(), str.end(), out);
+    get_locale loc(localized, context.locale());
+    auto w = tm_writer_type(loc, out, time);
+    (w.*cb)(args...);
+    out = w.out();
   }
 
   void on_text(const char_type* begin, const char_type* end) {
@@ -913,6 +1717,19 @@ struct chrono_formatter {
   void on_iso_date() {}
   void on_utc_offset() {}
   void on_tz_name() {}
+  void on_year(numeric_system) {}
+  void on_short_year(numeric_system) {}
+  void on_offset_year() {}
+  void on_century(numeric_system) {}
+  void on_iso_week_based_year() {}
+  void on_iso_week_based_short_year() {}
+  void on_dec_month(numeric_system) {}
+  void on_dec0_week_of_year(numeric_system) {}
+  void on_dec1_week_of_year(numeric_system) {}
+  void on_iso_week_of_year(numeric_system) {}
+  void on_day_of_year() {}
+  void on_day_of_month(numeric_system) {}
+  void on_day_of_month_space(numeric_system) {}
 
   void on_24_hour(numeric_system ns) {
     if (handle_nan_inf()) return;
@@ -920,7 +1737,7 @@ struct chrono_formatter {
     if (ns == numeric_system::standard) return write(hour(), 2);
     auto time = tm();
     time.tm_hour = to_nonnegative_int(hour(), 24);
-    format_localized(time, 'H', 'O');
+    format_tm(time, &tm_writer_type::on_24_hour, ns);
   }
 
   void on_12_hour(numeric_system ns) {
@@ -929,7 +1746,7 @@ struct chrono_formatter {
     if (ns == numeric_system::standard) return write(hour12(), 2);
     auto time = tm();
     time.tm_hour = to_nonnegative_int(hour12(), 12);
-    format_localized(time, 'I', 'O');
+    format_tm(time, &tm_writer_type::on_12_hour, ns);
   }
 
   void on_minute(numeric_system ns) {
@@ -938,7 +1755,7 @@ struct chrono_formatter {
     if (ns == numeric_system::standard) return write(minute(), 2);
     auto time = tm();
     time.tm_min = to_nonnegative_int(minute(), 60);
-    format_localized(time, 'M', 'O');
+    format_tm(time, &tm_writer_type::on_minute, ns);
   }
 
   void on_second(numeric_system ns) {
@@ -946,29 +1763,17 @@ struct chrono_formatter {
 
     if (ns == numeric_system::standard) {
       write(second(), 2);
-#if FMT_SAFE_DURATION_CAST
-      // convert rep->Rep
-      using duration_rep = std::chrono::duration<rep, Period>;
-      using duration_Rep = std::chrono::duration<Rep, Period>;
-      auto tmpval = fmt_safe_duration_cast<duration_Rep>(duration_rep{val});
-#else
-      auto tmpval = std::chrono::duration<Rep, Period>(val);
-#endif
-      auto ms = get_milliseconds(tmpval);
-      if (ms != std::chrono::milliseconds(0)) {
-        *out++ = '.';
-        write(ms.count(), 3);
-      }
+      write_fractional_seconds(std::chrono::duration<rep, Period>{val});
       return;
     }
     auto time = tm();
     time.tm_sec = to_nonnegative_int(second(), 60);
-    format_localized(time, 'S', 'O');
+    format_tm(time, &tm_writer_type::on_second, ns);
   }
 
   void on_12_hour_time() {
     if (handle_nan_inf()) return;
-    format_localized(time(), 'r');
+    format_tm(time(), &tm_writer_type::on_12_hour_time);
   }
 
   void on_24_hour_time() {
@@ -987,12 +1792,12 @@ struct chrono_formatter {
     on_24_hour_time();
     *out++ = ':';
     if (handle_nan_inf()) return;
-    write(second(), 2);
+    on_second(numeric_system::standard);
   }
 
   void on_am_pm() {
     if (handle_nan_inf()) return;
-    format_localized(time(), 'p');
+    format_tm(time(), &tm_writer_type::on_am_pm);
   }
 
   void on_duration_value() {
@@ -1005,17 +1810,64 @@ struct chrono_formatter {
     out = format_duration_unit<char_type, Period>(out);
   }
 };
-}  // namespace detail
+
+FMT_END_DETAIL_NAMESPACE
+
+#if defined(__cpp_lib_chrono) && __cpp_lib_chrono >= 201907
+using weekday = std::chrono::weekday;
+#else
+// A fallback version of weekday.
+class weekday {
+ private:
+  unsigned char value;
+
+ public:
+  weekday() = default;
+  explicit constexpr weekday(unsigned wd) noexcept
+      : value(static_cast<unsigned char>(wd != 7 ? wd : 0)) {}
+  constexpr unsigned c_encoding() const noexcept { return value; }
+};
+
+class year_month_day {};
+#endif
+
+// A rudimentary weekday formatter.
+template <typename Char> struct formatter<weekday, Char> {
+ private:
+  bool localized = false;
+
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto begin = ctx.begin(), end = ctx.end();
+    if (begin != end && *begin == 'L') {
+      ++begin;
+      localized = true;
+    }
+    return begin;
+  }
+
+  template <typename FormatContext>
+  auto format(weekday wd, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto time = std::tm();
+    time.tm_wday = static_cast<int>(wd.c_encoding());
+    detail::get_locale loc(localized, ctx.locale());
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
+    w.on_abbr_weekday();
+    return w.out();
+  }
+};
 
 template <typename Rep, typename Period, typename Char>
 struct formatter<std::chrono::duration<Rep, Period>, Char> {
  private:
   basic_format_specs<Char> specs;
-  int precision;
+  int precision = -1;
   using arg_ref_type = detail::arg_ref<Char>;
   arg_ref_type width_ref;
   arg_ref_type precision_ref;
-  mutable basic_string_view<Char> format_str;
+  bool localized = false;
+  basic_string_view<Char> format_str;
   using duration = std::chrono::duration<Rep, Period>;
 
   struct spec_handler {
@@ -1038,17 +1890,21 @@ struct formatter<std::chrono::duration<Rep, Period>, Char> {
     }
 
     void on_error(const char* msg) { FMT_THROW(format_error(msg)); }
-    void on_fill(basic_string_view<Char> fill) { f.specs.fill = fill; }
-    void on_align(align_t align) { f.specs.align = align; }
-    void on_width(int width) { f.specs.width = width; }
-    void on_precision(int _precision) { f.precision = _precision; }
-    void end_precision() {}
+    FMT_CONSTEXPR void on_fill(basic_string_view<Char> fill) {
+      f.specs.fill = fill;
+    }
+    FMT_CONSTEXPR void on_align(align_t align) { f.specs.align = align; }
+    FMT_CONSTEXPR void on_width(int width) { f.specs.width = width; }
+    FMT_CONSTEXPR void on_precision(int _precision) {
+      f.precision = _precision;
+    }
+    FMT_CONSTEXPR void end_precision() {}
 
-    template <typename Id> void on_dynamic_width(Id arg_id) {
+    template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
       f.width_ref = make_arg_ref(arg_id);
     }
 
-    template <typename Id> void on_dynamic_precision(Id arg_id) {
+    template <typename Id> FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) {
       f.precision_ref = make_arg_ref(arg_id);
     }
   };
@@ -1073,13 +1929,16 @@ struct formatter<std::chrono::duration<Rep, Period>, Char> {
       else
         handler.on_error("precision not allowed for this argument type");
     }
-    end = parse_chrono_format(begin, end, detail::chrono_format_checker());
+    if (begin != end && *begin == 'L') {
+      ++begin;
+      localized = true;
+    }
+    end = detail::parse_chrono_format(begin, end,
+                                      detail::chrono_format_checker());
     return {begin, end};
   }
 
  public:
-  formatter() : precision(-1) {}
-
   FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
       -> decltype(ctx.begin()) {
     auto range = do_parse(ctx);
@@ -1089,30 +1948,112 @@ struct formatter<std::chrono::duration<Rep, Period>, Char> {
   }
 
   template <typename FormatContext>
-  auto format(const duration& d, FormatContext& ctx) -> decltype(ctx.out()) {
+  auto format(const duration& d, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    auto specs_copy = specs;
+    auto precision_copy = precision;
     auto begin = format_str.begin(), end = format_str.end();
     // As a possible future optimization, we could avoid extra copying if width
     // is not specified.
     basic_memory_buffer<Char> buf;
     auto out = std::back_inserter(buf);
-    detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref,
-                                                       ctx);
-    detail::handle_dynamic_spec<detail::precision_checker>(precision,
+    detail::handle_dynamic_spec<detail::width_checker>(specs_copy.width,
+                                                       width_ref, ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(precision_copy,
                                                            precision_ref, ctx);
     if (begin == end || *begin == '}') {
-      out = detail::format_duration_value<Char>(out, d.count(), precision);
+      out = detail::format_duration_value<Char>(out, d.count(), precision_copy);
       detail::format_duration_unit<Char, Period>(out);
     } else {
       detail::chrono_formatter<FormatContext, decltype(out), Rep, Period> f(
           ctx, out, d);
-      f.precision = precision;
-      parse_chrono_format(begin, end, f);
+      f.precision = precision_copy;
+      f.localized = localized;
+      detail::parse_chrono_format(begin, end, f);
     }
     return detail::write(
-        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs);
+        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs_copy);
   }
 };
 
+template <typename Char, typename Duration>
+struct formatter<std::chrono::time_point<std::chrono::system_clock, Duration>,
+                 Char> : formatter<std::tm, Char> {
+  FMT_CONSTEXPR formatter() {
+    this->do_parse(default_specs,
+                   default_specs + sizeof(default_specs) / sizeof(Char));
+  }
+
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return this->do_parse(ctx.begin(), ctx.end(), true);
+  }
+
+  template <typename FormatContext>
+  auto format(std::chrono::time_point<std::chrono::system_clock> val,
+              FormatContext& ctx) const -> decltype(ctx.out()) {
+    return formatter<std::tm, Char>::format(localtime(val), ctx);
+  }
+
+  static constexpr const Char default_specs[] = {'%', 'F', ' ', '%', 'T'};
+};
+
+template <typename Char, typename Duration>
+constexpr const Char
+    formatter<std::chrono::time_point<std::chrono::system_clock, Duration>,
+              Char>::default_specs[];
+
+template <typename Char> struct formatter<std::tm, Char> {
+ private:
+  enum class spec {
+    unknown,
+    year_month_day,
+    hh_mm_ss,
+  };
+  spec spec_ = spec::unknown;
+  basic_string_view<Char> specs;
+
+ protected:
+  template <typename It>
+  FMT_CONSTEXPR auto do_parse(It begin, It end, bool with_default = false)
+      -> It {
+    if (begin != end && *begin == ':') ++begin;
+    end = detail::parse_chrono_format(begin, end, detail::tm_format_checker());
+    if (!with_default || end != begin)
+      specs = {begin, detail::to_unsigned(end - begin)};
+    // basic_string_view<>::compare isn't constexpr before C++17.
+    if (specs.size() == 2 && specs[0] == Char('%')) {
+      if (specs[1] == Char('F'))
+        spec_ = spec::year_month_day;
+      else if (specs[1] == Char('T'))
+        spec_ = spec::hh_mm_ss;
+    }
+    return end;
+  }
+
+ public:
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return this->do_parse(ctx.begin(), ctx.end());
+  }
+
+  template <typename FormatContext>
+  auto format(const std::tm& tm, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    const auto loc_ref = ctx.locale();
+    detail::get_locale loc(static_cast<bool>(loc_ref), loc_ref);
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), tm);
+    if (spec_ == spec::year_month_day)
+      w.on_iso_date();
+    else if (spec_ == spec::hh_mm_ss)
+      w.on_iso_time();
+    else
+      detail::parse_chrono_format(specs.begin(), specs.end(), w);
+    return w.out();
+  }
+};
+
+FMT_MODULE_EXPORT_END
 FMT_END_NAMESPACE
 
 #endif  // FMT_CHRONO_H_
diff --git a/src/fmt/color.h b/src/fmt/color.h
index 94e3419d1d..dfbe482938 100644
--- a/src/fmt/color.h
+++ b/src/fmt/color.h
@@ -10,7 +10,15 @@
 
 #include "format.h"
 
+// __declspec(deprecated) is broken in some MSVC versions.
+#if FMT_MSC_VER
+#  define FMT_DEPRECATED_NONMSVC
+#else
+#  define FMT_DEPRECATED_NONMSVC FMT_DEPRECATED
+#endif
+
 FMT_BEGIN_NAMESPACE
+FMT_MODULE_EXPORT_BEGIN
 
 enum class color : uint32_t {
   alice_blue = 0xF0F8FF,               // rgb(240,248,255)
@@ -177,9 +185,13 @@ enum class terminal_color : uint8_t {
 
 enum class emphasis : uint8_t {
   bold = 1,
-  italic = 1 << 1,
-  underline = 1 << 2,
-  strikethrough = 1 << 3
+  faint = 1 << 1,
+  italic = 1 << 2,
+  underline = 1 << 3,
+  blink = 1 << 4,
+  reverse = 1 << 5,
+  conceal = 1 << 6,
+  strikethrough = 1 << 7,
 };
 
 // rgb is a struct for red, green and blue colors.
@@ -198,7 +210,7 @@ struct rgb {
   uint8_t b;
 };
 
-namespace detail {
+FMT_BEGIN_DETAIL_NAMESPACE
 
 // color is a struct of either a rgb color or a terminal color.
 struct color_type {
@@ -221,9 +233,10 @@ struct color_type {
     uint32_t rgb_color;
   } value;
 };
-}  // namespace detail
 
-// Experimental text formatting support.
+FMT_END_DETAIL_NAMESPACE
+
+/** A text style consisting of foreground and background colors and emphasis. */
 class text_style {
  public:
   FMT_CONSTEXPR text_style(emphasis em = emphasis()) FMT_NOEXCEPT
@@ -260,33 +273,14 @@ class text_style {
     return lhs |= rhs;
   }
 
-  FMT_CONSTEXPR text_style& operator&=(const text_style& rhs) {
-    if (!set_foreground_color) {
-      set_foreground_color = rhs.set_foreground_color;
-      foreground_color = rhs.foreground_color;
-    } else if (rhs.set_foreground_color) {
-      if (!foreground_color.is_rgb || !rhs.foreground_color.is_rgb)
-        FMT_THROW(format_error("can't AND a terminal color"));
-      foreground_color.value.rgb_color &= rhs.foreground_color.value.rgb_color;
-    }
-
-    if (!set_background_color) {
-      set_background_color = rhs.set_background_color;
-      background_color = rhs.background_color;
-    } else if (rhs.set_background_color) {
-      if (!background_color.is_rgb || !rhs.background_color.is_rgb)
-        FMT_THROW(format_error("can't AND a terminal color"));
-      background_color.value.rgb_color &= rhs.background_color.value.rgb_color;
-    }
-
-    ems = static_cast<emphasis>(static_cast<uint8_t>(ems) &
-                                static_cast<uint8_t>(rhs.ems));
-    return *this;
+  FMT_DEPRECATED_NONMSVC FMT_CONSTEXPR text_style& operator&=(
+      const text_style& rhs) {
+    return and_assign(rhs);
   }
 
-  friend FMT_CONSTEXPR text_style operator&(text_style lhs,
-                                            const text_style& rhs) {
-    return lhs &= rhs;
+  FMT_DEPRECATED_NONMSVC friend FMT_CONSTEXPR text_style
+  operator&(text_style lhs, const text_style& rhs) {
+    return lhs.and_assign(rhs);
   }
 
   FMT_CONSTEXPR bool has_foreground() const FMT_NOEXCEPT {
@@ -326,8 +320,34 @@ class text_style {
     }
   }
 
+  // DEPRECATED!
+  FMT_CONSTEXPR text_style& and_assign(const text_style& rhs) {
+    if (!set_foreground_color) {
+      set_foreground_color = rhs.set_foreground_color;
+      foreground_color = rhs.foreground_color;
+    } else if (rhs.set_foreground_color) {
+      if (!foreground_color.is_rgb || !rhs.foreground_color.is_rgb)
+        FMT_THROW(format_error("can't AND a terminal color"));
+      foreground_color.value.rgb_color &= rhs.foreground_color.value.rgb_color;
+    }
+
+    if (!set_background_color) {
+      set_background_color = rhs.set_background_color;
+      background_color = rhs.background_color;
+    } else if (rhs.set_background_color) {
+      if (!background_color.is_rgb || !rhs.background_color.is_rgb)
+        FMT_THROW(format_error("can't AND a terminal color"));
+      background_color.value.rgb_color &= rhs.background_color.value.rgb_color;
+    }
+
+    ems = static_cast<emphasis>(static_cast<uint8_t>(ems) &
+                                static_cast<uint8_t>(rhs.ems));
+    return *this;
+  }
+
   friend FMT_CONSTEXPR_DECL text_style fg(detail::color_type foreground)
       FMT_NOEXCEPT;
+
   friend FMT_CONSTEXPR_DECL text_style bg(detail::color_type background)
       FMT_NOEXCEPT;
 
@@ -338,19 +358,22 @@ class text_style {
   emphasis ems;
 };
 
-FMT_CONSTEXPR text_style fg(detail::color_type foreground) FMT_NOEXCEPT {
-  return text_style(/*is_foreground=*/true, foreground);
+/** Creates a text style from the foreground (text) color. */
+FMT_CONSTEXPR inline text_style fg(detail::color_type foreground) FMT_NOEXCEPT {
+  return text_style(true, foreground);
 }
 
-FMT_CONSTEXPR text_style bg(detail::color_type background) FMT_NOEXCEPT {
-  return text_style(/*is_foreground=*/false, background);
+/** Creates a text style from the background color. */
+FMT_CONSTEXPR inline text_style bg(detail::color_type background) FMT_NOEXCEPT {
+  return text_style(false, background);
 }
 
-FMT_CONSTEXPR text_style operator|(emphasis lhs, emphasis rhs) FMT_NOEXCEPT {
+FMT_CONSTEXPR inline text_style operator|(emphasis lhs,
+                                          emphasis rhs) FMT_NOEXCEPT {
   return text_style(lhs) | rhs;
 }
 
-namespace detail {
+FMT_BEGIN_DETAIL_NAMESPACE
 
 template <typename Char> struct ansi_color_escape {
   FMT_CONSTEXPR ansi_color_escape(detail::color_type text_color,
@@ -358,7 +381,7 @@ template <typename Char> struct ansi_color_escape {
     // If we have a terminal color, we need to output another escape code
     // sequence.
     if (!text_color.is_rgb) {
-      bool is_background = esc == detail::data::background_color;
+      bool is_background = esc == string_view("\x1b[48;2;");
       uint32_t value = text_color.value.term_color;
       // Background ASCII codes are the same as the foreground ones but with
       // 10 more.
@@ -390,16 +413,18 @@ template <typename Char> struct ansi_color_escape {
     buffer[19] = static_cast<Char>(0);
   }
   FMT_CONSTEXPR ansi_color_escape(emphasis em) FMT_NOEXCEPT {
-    uint8_t em_codes[4] = {};
-    uint8_t em_bits = static_cast<uint8_t>(em);
-    if (em_bits & static_cast<uint8_t>(emphasis::bold)) em_codes[0] = 1;
-    if (em_bits & static_cast<uint8_t>(emphasis::italic)) em_codes[1] = 3;
-    if (em_bits & static_cast<uint8_t>(emphasis::underline)) em_codes[2] = 4;
-    if (em_bits & static_cast<uint8_t>(emphasis::strikethrough))
-      em_codes[3] = 9;
+    uint8_t em_codes[num_emphases] = {};
+    if (has_emphasis(em, emphasis::bold)) em_codes[0] = 1;
+    if (has_emphasis(em, emphasis::faint)) em_codes[1] = 2;
+    if (has_emphasis(em, emphasis::italic)) em_codes[2] = 3;
+    if (has_emphasis(em, emphasis::underline)) em_codes[3] = 4;
+    if (has_emphasis(em, emphasis::blink)) em_codes[4] = 5;
+    if (has_emphasis(em, emphasis::reverse)) em_codes[5] = 7;
+    if (has_emphasis(em, emphasis::conceal)) em_codes[6] = 8;
+    if (has_emphasis(em, emphasis::strikethrough)) em_codes[7] = 9;
 
     size_t index = 0;
-    for (int i = 0; i < 4; ++i) {
+    for (size_t i = 0; i < num_emphases; ++i) {
       if (!em_codes[i]) continue;
       buffer[index++] = static_cast<Char>('\x1b');
       buffer[index++] = static_cast<Char>('[');
@@ -411,12 +436,13 @@ template <typename Char> struct ansi_color_escape {
   FMT_CONSTEXPR operator const Char*() const FMT_NOEXCEPT { return buffer; }
 
   FMT_CONSTEXPR const Char* begin() const FMT_NOEXCEPT { return buffer; }
-  FMT_CONSTEXPR const Char* end() const FMT_NOEXCEPT {
+  FMT_CONSTEXPR_CHAR_TRAITS const Char* end() const FMT_NOEXCEPT {
     return buffer + std::char_traits<Char>::length(buffer);
   }
 
  private:
-  Char buffer[7u + 3u * 4u + 1u];
+  static constexpr size_t num_emphases = 8;
+  Char buffer[7u + 3u * num_emphases + 1u];
 
   static FMT_CONSTEXPR void to_esc(uint8_t c, Char* out,
                                    char delimiter) FMT_NOEXCEPT {
@@ -425,18 +451,22 @@ template <typename Char> struct ansi_color_escape {
     out[2] = static_cast<Char>('0' + c % 10);
     out[3] = static_cast<Char>(delimiter);
   }
+  static FMT_CONSTEXPR bool has_emphasis(emphasis em,
+                                         emphasis mask) FMT_NOEXCEPT {
+    return static_cast<uint8_t>(em) & static_cast<uint8_t>(mask);
+  }
 };
 
 template <typename Char>
 FMT_CONSTEXPR ansi_color_escape<Char> make_foreground_color(
     detail::color_type foreground) FMT_NOEXCEPT {
-  return ansi_color_escape<Char>(foreground, detail::data::foreground_color);
+  return ansi_color_escape<Char>(foreground, "\x1b[38;2;");
 }
 
 template <typename Char>
 FMT_CONSTEXPR ansi_color_escape<Char> make_background_color(
     detail::color_type background) FMT_NOEXCEPT {
-  return ansi_color_escape<Char>(background, detail::data::background_color);
+  return ansi_color_escape<Char>(background, "\x1b[48;2;");
 }
 
 template <typename Char>
@@ -455,18 +485,17 @@ inline void fputs<wchar_t>(const wchar_t* chars, FILE* stream) FMT_NOEXCEPT {
 }
 
 template <typename Char> inline void reset_color(FILE* stream) FMT_NOEXCEPT {
-  fputs(detail::data::reset_color, stream);
+  fputs("\x1b[0m", stream);
 }
 
 template <> inline void reset_color<wchar_t>(FILE* stream) FMT_NOEXCEPT {
-  fputs(detail::data::wreset_color, stream);
+  fputs(L"\x1b[0m", stream);
 }
 
 template <typename Char>
 inline void reset_color(buffer<Char>& buffer) FMT_NOEXCEPT {
-  const char* begin = data::reset_color;
-  const char* end = begin + sizeof(data::reset_color) - 1;
-  buffer.append(begin, end);
+  auto reset_color = string_view("\x1b[0m");
+  buffer.append(reset_color.begin(), reset_color.end());
 }
 
 template <typename Char>
@@ -489,10 +518,11 @@ void vformat_to(buffer<Char>& buf, const text_style& ts,
     auto background = detail::make_background_color<Char>(ts.get_background());
     buf.append(background.begin(), background.end());
   }
-  detail::vformat_to(buf, format_str, args);
+  detail::vformat_to(buf, format_str, args, {});
   if (has_style) detail::reset_color<Char>(buf);
 }
-}  // namespace detail
+
+FMT_END_DETAIL_NAMESPACE
 
 template <typename S, typename Char = char_t<S>>
 void vprint(std::FILE* f, const text_style& ts, const S& format,
@@ -523,11 +553,15 @@ void print(std::FILE* f, const text_style& ts, const S& format_str,
 }
 
 /**
+  \rst
   Formats a string and prints it to stdout using ANSI escape sequences to
   specify text formatting.
-  Example:
+
+  **Example**::
+
     fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
                "Elapsed time: {0:.2f} seconds", 1.23);
+  \endrst
  */
 template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_string<S>::value)>
@@ -559,8 +593,8 @@ inline std::basic_string<Char> vformat(
 template <typename S, typename... Args, typename Char = char_t<S>>
 inline std::basic_string<Char> format(const text_style& ts, const S& format_str,
                                       const Args&... args) {
-  return vformat(ts, to_string_view(format_str),
-                 fmt::make_args_checked<Args...>(format_str, args...));
+  return fmt::vformat(ts, to_string_view(format_str),
+                      fmt::make_args_checked<Args...>(format_str, args...));
 }
 
 /**
@@ -571,7 +605,7 @@ template <typename OutputIt, typename Char,
 OutputIt vformat_to(
     OutputIt out, const text_style& ts, basic_string_view<Char> format_str,
     basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
+  auto&& buf = detail::get_buffer<Char>(out);
   detail::vformat_to(buf, ts, format_str, args);
   return detail::get_iterator(buf);
 }
@@ -598,6 +632,7 @@ inline auto format_to(OutputIt out, const text_style& ts, const S& format_str,
                     fmt::make_args_checked<Args...>(format_str, args...));
 }
 
+FMT_MODULE_EXPORT_END
 FMT_END_NAMESPACE
 
 #endif  // FMT_COLOR_H_
diff --git a/src/fmt/compile.h b/src/fmt/compile.h
index 3a33b02014..1dba3ddb52 100644
--- a/src/fmt/compile.h
+++ b/src/fmt/compile.h
@@ -8,13 +8,135 @@
 #ifndef FMT_COMPILE_H_
 #define FMT_COMPILE_H_
 
-#include <vector>
-
 #include "format.h"
 
 FMT_BEGIN_NAMESPACE
 namespace detail {
 
+// An output iterator that counts the number of objects written to it and
+// discards them.
+class counting_iterator {
+ private:
+  size_t count_;
+
+ public:
+  using iterator_category = std::output_iterator_tag;
+  using difference_type = std::ptrdiff_t;
+  using pointer = void;
+  using reference = void;
+  using _Unchecked_type = counting_iterator;  // Mark iterator as checked.
+
+  struct value_type {
+    template <typename T> void operator=(const T&) {}
+  };
+
+  counting_iterator() : count_(0) {}
+
+  size_t count() const { return count_; }
+
+  counting_iterator& operator++() {
+    ++count_;
+    return *this;
+  }
+  counting_iterator operator++(int) {
+    auto it = *this;
+    ++*this;
+    return it;
+  }
+
+  friend counting_iterator operator+(counting_iterator it, difference_type n) {
+    it.count_ += static_cast<size_t>(n);
+    return it;
+  }
+
+  value_type operator*() const { return {}; }
+};
+
+template <typename Char, typename InputIt>
+inline counting_iterator copy_str(InputIt begin, InputIt end,
+                                  counting_iterator it) {
+  return it + (end - begin);
+}
+
+template <typename OutputIt> class truncating_iterator_base {
+ protected:
+  OutputIt out_;
+  size_t limit_;
+  size_t count_ = 0;
+
+  truncating_iterator_base() : out_(), limit_(0) {}
+
+  truncating_iterator_base(OutputIt out, size_t limit)
+      : out_(out), limit_(limit) {}
+
+ public:
+  using iterator_category = std::output_iterator_tag;
+  using value_type = typename std::iterator_traits<OutputIt>::value_type;
+  using difference_type = std::ptrdiff_t;
+  using pointer = void;
+  using reference = void;
+  using _Unchecked_type =
+      truncating_iterator_base;  // Mark iterator as checked.
+
+  OutputIt base() const { return out_; }
+  size_t count() const { return count_; }
+};
+
+// An output iterator that truncates the output and counts the number of objects
+// written to it.
+template <typename OutputIt,
+          typename Enable = typename std::is_void<
+              typename std::iterator_traits<OutputIt>::value_type>::type>
+class truncating_iterator;
+
+template <typename OutputIt>
+class truncating_iterator<OutputIt, std::false_type>
+    : public truncating_iterator_base<OutputIt> {
+  mutable typename truncating_iterator_base<OutputIt>::value_type blackhole_;
+
+ public:
+  using value_type = typename truncating_iterator_base<OutputIt>::value_type;
+
+  truncating_iterator() = default;
+
+  truncating_iterator(OutputIt out, size_t limit)
+      : truncating_iterator_base<OutputIt>(out, limit) {}
+
+  truncating_iterator& operator++() {
+    if (this->count_++ < this->limit_) ++this->out_;
+    return *this;
+  }
+
+  truncating_iterator operator++(int) {
+    auto it = *this;
+    ++*this;
+    return it;
+  }
+
+  value_type& operator*() const {
+    return this->count_ < this->limit_ ? *this->out_ : blackhole_;
+  }
+};
+
+template <typename OutputIt>
+class truncating_iterator<OutputIt, std::true_type>
+    : public truncating_iterator_base<OutputIt> {
+ public:
+  truncating_iterator() = default;
+
+  truncating_iterator(OutputIt out, size_t limit)
+      : truncating_iterator_base<OutputIt>(out, limit) {}
+
+  template <typename T> truncating_iterator& operator=(T val) {
+    if (this->count_++ < this->limit_) *this->out_++ = val;
+    return *this;
+  }
+
+  truncating_iterator& operator++() { return *this; }
+  truncating_iterator& operator++(int) { return *this; }
+  truncating_iterator& operator*() { return *this; }
+};
+
 // A compile-time string which is compiled into fast formatting code.
 class compiled_string {};
 
@@ -34,336 +156,30 @@ struct is_compiled_string : std::is_base_of<compiled_string, S> {};
     std::string s = fmt::format(FMT_COMPILE("{}"), 42);
   \endrst
  */
-#define FMT_COMPILE(s) FMT_STRING_IMPL(s, fmt::detail::compiled_string)
+#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
+#  define FMT_COMPILE(s) \
+    FMT_STRING_IMPL(s, fmt::detail::compiled_string, explicit)
+#else
+#  define FMT_COMPILE(s) FMT_STRING(s)
+#endif
+
+#if FMT_USE_NONTYPE_TEMPLATE_PARAMETERS
+template <typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct udl_compiled_string : compiled_string {
+  using char_type = Char;
+  constexpr operator basic_string_view<char_type>() const {
+    return {Str.data, N - 1};
+  }
+};
+#endif
 
 template <typename T, typename... Tail>
 const T& first(const T& value, const Tail&...) {
   return value;
 }
 
-// Part of a compiled format string. It can be either literal text or a
-// replacement field.
-template <typename Char> struct format_part {
-  enum class kind { arg_index, arg_name, text, replacement };
-
-  struct replacement {
-    arg_ref<Char> arg_id;
-    dynamic_format_specs<Char> specs;
-  };
-
-  kind part_kind;
-  union value {
-    int arg_index;
-    basic_string_view<Char> str;
-    replacement repl;
-
-    FMT_CONSTEXPR value(int index = 0) : arg_index(index) {}
-    FMT_CONSTEXPR value(basic_string_view<Char> s) : str(s) {}
-    FMT_CONSTEXPR value(replacement r) : repl(r) {}
-  } val;
-  // Position past the end of the argument id.
-  const Char* arg_id_end = nullptr;
-
-  FMT_CONSTEXPR format_part(kind k = kind::arg_index, value v = {})
-      : part_kind(k), val(v) {}
-
-  static FMT_CONSTEXPR format_part make_arg_index(int index) {
-    return format_part(kind::arg_index, index);
-  }
-  static FMT_CONSTEXPR format_part make_arg_name(basic_string_view<Char> name) {
-    return format_part(kind::arg_name, name);
-  }
-  static FMT_CONSTEXPR format_part make_text(basic_string_view<Char> text) {
-    return format_part(kind::text, text);
-  }
-  static FMT_CONSTEXPR format_part make_replacement(replacement repl) {
-    return format_part(kind::replacement, repl);
-  }
-};
-
-template <typename Char> struct part_counter {
-  unsigned num_parts = 0;
-
-  FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
-    if (begin != end) ++num_parts;
-  }
-
-  FMT_CONSTEXPR int on_arg_id() { return ++num_parts, 0; }
-  FMT_CONSTEXPR int on_arg_id(int) { return ++num_parts, 0; }
-  FMT_CONSTEXPR int on_arg_id(basic_string_view<Char>) {
-    return ++num_parts, 0;
-  }
-
-  FMT_CONSTEXPR void on_replacement_field(int, const Char*) {}
-
-  FMT_CONSTEXPR const Char* on_format_specs(int, const Char* begin,
-                                            const Char* end) {
-    // Find the matching brace.
-    unsigned brace_counter = 0;
-    for (; begin != end; ++begin) {
-      if (*begin == '{') {
-        ++brace_counter;
-      } else if (*begin == '}') {
-        if (brace_counter == 0u) break;
-        --brace_counter;
-      }
-    }
-    return begin;
-  }
-
-  FMT_CONSTEXPR void on_error(const char*) {}
-};
-
-// Counts the number of parts in a format string.
-template <typename Char>
-FMT_CONSTEXPR unsigned count_parts(basic_string_view<Char> format_str) {
-  part_counter<Char> counter;
-  parse_format_string<true>(format_str, counter);
-  return counter.num_parts;
-}
-
-template <typename Char, typename PartHandler>
-class format_string_compiler : public error_handler {
- private:
-  using part = format_part<Char>;
-
-  PartHandler handler_;
-  part part_;
-  basic_string_view<Char> format_str_;
-  basic_format_parse_context<Char> parse_context_;
-
- public:
-  FMT_CONSTEXPR format_string_compiler(basic_string_view<Char> format_str,
-                                       PartHandler handler)
-      : handler_(handler),
-        format_str_(format_str),
-        parse_context_(format_str) {}
-
-  FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
-    if (begin != end)
-      handler_(part::make_text({begin, to_unsigned(end - begin)}));
-  }
-
-  FMT_CONSTEXPR int on_arg_id() {
-    part_ = part::make_arg_index(parse_context_.next_arg_id());
-    return 0;
-  }
-
-  FMT_CONSTEXPR int on_arg_id(int id) {
-    parse_context_.check_arg_id(id);
-    part_ = part::make_arg_index(id);
-    return 0;
-  }
-
-  FMT_CONSTEXPR int on_arg_id(basic_string_view<Char> id) {
-    part_ = part::make_arg_name(id);
-    return 0;
-  }
-
-  FMT_CONSTEXPR void on_replacement_field(int, const Char* ptr) {
-    part_.arg_id_end = ptr;
-    handler_(part_);
-  }
-
-  FMT_CONSTEXPR const Char* on_format_specs(int, const Char* begin,
-                                            const Char* end) {
-    auto repl = typename part::replacement();
-    dynamic_specs_handler<basic_format_parse_context<Char>> handler(
-        repl.specs, parse_context_);
-    auto it = parse_format_specs(begin, end, handler);
-    if (*it != '}') on_error("missing '}' in format string");
-    repl.arg_id = part_.part_kind == part::kind::arg_index
-                      ? arg_ref<Char>(part_.val.arg_index)
-                      : arg_ref<Char>(part_.val.str);
-    auto part = part::make_replacement(repl);
-    part.arg_id_end = begin;
-    handler_(part);
-    return it;
-  }
-};
-
-// Compiles a format string and invokes handler(part) for each parsed part.
-template <bool IS_CONSTEXPR, typename Char, typename PartHandler>
-FMT_CONSTEXPR void compile_format_string(basic_string_view<Char> format_str,
-                                         PartHandler handler) {
-  parse_format_string<IS_CONSTEXPR>(
-      format_str,
-      format_string_compiler<Char, PartHandler>(format_str, handler));
-}
-
-template <typename OutputIt, typename Context, typename Id>
-void format_arg(
-    basic_format_parse_context<typename Context::char_type>& parse_ctx,
-    Context& ctx, Id arg_id) {
-  ctx.advance_to(visit_format_arg(
-      arg_formatter<OutputIt, typename Context::char_type>(ctx, &parse_ctx),
-      ctx.arg(arg_id)));
-}
-
-// vformat_to is defined in a subnamespace to prevent ADL.
-namespace cf {
-template <typename Context, typename OutputIt, typename CompiledFormat>
-auto vformat_to(OutputIt out, CompiledFormat& cf,
-                basic_format_args<Context> args) -> typename Context::iterator {
-  using char_type = typename Context::char_type;
-  basic_format_parse_context<char_type> parse_ctx(
-      to_string_view(cf.format_str_));
-  Context ctx(out, args);
-
-  const auto& parts = cf.parts();
-  for (auto part_it = std::begin(parts); part_it != std::end(parts);
-       ++part_it) {
-    const auto& part = *part_it;
-    const auto& value = part.val;
-
-    using format_part_t = format_part<char_type>;
-    switch (part.part_kind) {
-    case format_part_t::kind::text: {
-      const auto text = value.str;
-      auto output = ctx.out();
-      auto&& it = reserve(output, text.size());
-      it = std::copy_n(text.begin(), text.size(), it);
-      ctx.advance_to(output);
-      break;
-    }
-
-    case format_part_t::kind::arg_index:
-      advance_to(parse_ctx, part.arg_id_end);
-      detail::format_arg<OutputIt>(parse_ctx, ctx, value.arg_index);
-      break;
-
-    case format_part_t::kind::arg_name:
-      advance_to(parse_ctx, part.arg_id_end);
-      detail::format_arg<OutputIt>(parse_ctx, ctx, value.str);
-      break;
-
-    case format_part_t::kind::replacement: {
-      const auto& arg_id_value = value.repl.arg_id.val;
-      const auto arg = value.repl.arg_id.kind == arg_id_kind::index
-                           ? ctx.arg(arg_id_value.index)
-                           : ctx.arg(arg_id_value.name);
-
-      auto specs = value.repl.specs;
-
-      handle_dynamic_spec<width_checker>(specs.width, specs.width_ref, ctx);
-      handle_dynamic_spec<precision_checker>(specs.precision,
-                                             specs.precision_ref, ctx);
-
-      error_handler h;
-      numeric_specs_checker<error_handler> checker(h, arg.type());
-      if (specs.align == align::numeric) checker.require_numeric_argument();
-      if (specs.sign != sign::none) checker.check_sign();
-      if (specs.alt) checker.require_numeric_argument();
-      if (specs.precision >= 0) checker.check_precision();
-
-      advance_to(parse_ctx, part.arg_id_end);
-      ctx.advance_to(
-          visit_format_arg(arg_formatter<OutputIt, typename Context::char_type>(
-                               ctx, nullptr, &specs),
-                           arg));
-      break;
-    }
-    }
-  }
-  return ctx.out();
-}
-}  // namespace cf
-
-struct basic_compiled_format {};
-
-template <typename S, typename = void>
-struct compiled_format_base : basic_compiled_format {
-  using char_type = char_t<S>;
-  using parts_container = std::vector<detail::format_part<char_type>>;
-
-  parts_container compiled_parts;
-
-  explicit compiled_format_base(basic_string_view<char_type> format_str) {
-    compile_format_string<false>(format_str,
-                                 [this](const format_part<char_type>& part) {
-                                   compiled_parts.push_back(part);
-                                 });
-  }
-
-  const parts_container& parts() const { return compiled_parts; }
-};
-
-template <typename Char, unsigned N> struct format_part_array {
-  format_part<Char> data[N] = {};
-  FMT_CONSTEXPR format_part_array() = default;
-};
-
-template <typename Char, unsigned N>
-FMT_CONSTEXPR format_part_array<Char, N> compile_to_parts(
-    basic_string_view<Char> format_str) {
-  format_part_array<Char, N> parts;
-  unsigned counter = 0;
-  // This is not a lambda for compatibility with older compilers.
-  struct {
-    format_part<Char>* parts;
-    unsigned* counter;
-    FMT_CONSTEXPR void operator()(const format_part<Char>& part) {
-      parts[(*counter)++] = part;
-    }
-  } collector{parts.data, &counter};
-  compile_format_string<true>(format_str, collector);
-  if (counter < N) {
-    parts.data[counter] =
-        format_part<Char>::make_text(basic_string_view<Char>());
-  }
-  return parts;
-}
-
-template <typename T> constexpr const T& constexpr_max(const T& a, const T& b) {
-  return (a < b) ? b : a;
-}
-
-template <typename S>
-struct compiled_format_base<S, enable_if_t<is_compile_string<S>::value>>
-    : basic_compiled_format {
-  using char_type = char_t<S>;
-
-  FMT_CONSTEXPR explicit compiled_format_base(basic_string_view<char_type>) {}
-
-// Workaround for old compilers. Format string compilation will not be
-// performed there anyway.
-#if FMT_USE_CONSTEXPR
-  static FMT_CONSTEXPR_DECL const unsigned num_format_parts =
-      constexpr_max(count_parts(to_string_view(S())), 1u);
-#else
-  static const unsigned num_format_parts = 1;
-#endif
-
-  using parts_container = format_part<char_type>[num_format_parts];
-
-  const parts_container& parts() const {
-    static FMT_CONSTEXPR_DECL const auto compiled_parts =
-        compile_to_parts<char_type, num_format_parts>(
-            detail::to_string_view(S()));
-    return compiled_parts.data;
-  }
-};
-
-template <typename S, typename... Args>
-class compiled_format : private compiled_format_base<S> {
- public:
-  using typename compiled_format_base<S>::char_type;
-
- private:
-  basic_string_view<char_type> format_str_;
-
-  template <typename Context, typename OutputIt, typename CompiledFormat>
-  friend auto cf::vformat_to(OutputIt out, CompiledFormat& cf,
-                             basic_format_args<Context> args) ->
-      typename Context::iterator;
-
- public:
-  compiled_format() = delete;
-  explicit constexpr compiled_format(basic_string_view<char_type> format_str)
-      : compiled_format_base<S>(format_str), format_str_(format_str) {}
-};
-
-#ifdef __cpp_if_constexpr
+#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
 template <typename... Args> struct type_list {};
 
 // Returns a reference to the argument at index N from [first, rest...].
@@ -374,13 +190,20 @@ constexpr const auto& get([[maybe_unused]] const T& first,
   if constexpr (N == 0)
     return first;
   else
-    return get<N - 1>(rest...);
+    return detail::get<N - 1>(rest...);
+}
+
+template <typename Char, typename... Args>
+constexpr int get_arg_index_by_name(basic_string_view<Char> name,
+                                    type_list<Args...>) {
+  return get_arg_index_by_name<Args...>(name);
 }
 
 template <int N, typename> struct get_type_impl;
 
 template <int N, typename... Args> struct get_type_impl<N, type_list<Args...>> {
-  using type = remove_cvref_t<decltype(get<N>(std::declval<Args>()...))>;
+  using type =
+      remove_cvref_t<decltype(detail::get<N>(std::declval<Args>()...))>;
 };
 
 template <int N, typename T>
@@ -393,7 +216,7 @@ template <typename Char> struct text {
   using char_type = Char;
 
   template <typename OutputIt, typename... Args>
-  OutputIt format(OutputIt out, const Args&...) const {
+  constexpr OutputIt format(OutputIt out, const Args&...) const {
     return write<Char>(out, data);
   }
 };
@@ -412,11 +235,22 @@ template <typename Char> struct code_unit {
   using char_type = Char;
 
   template <typename OutputIt, typename... Args>
-  OutputIt format(OutputIt out, const Args&...) const {
+  constexpr OutputIt format(OutputIt out, const Args&...) const {
     return write<Char>(out, value);
   }
 };
 
+// This ensures that the argument type is convertible to `const T&`.
+template <typename T, int N, typename... Args>
+constexpr const T& get_arg_checked(const Args&... args) {
+  const auto& arg = detail::get<N>(args...);
+  if constexpr (detail::is_named_arg<remove_cvref_t<decltype(arg)>>()) {
+    return arg.value;
+  } else {
+    return arg;
+  }
+}
+
 template <typename Char>
 struct is_compiled_format<code_unit<Char>> : std::true_type {};
 
@@ -425,29 +259,58 @@ template <typename Char, typename T, int N> struct field {
   using char_type = Char;
 
   template <typename OutputIt, typename... Args>
-  OutputIt format(OutputIt out, const Args&... args) const {
-    // This ensures that the argument type is convertile to `const T&`.
-    const T& arg = get<N>(args...);
-    return write<Char>(out, arg);
+  constexpr OutputIt format(OutputIt out, const Args&... args) const {
+    return write<Char>(out, get_arg_checked<T, N>(args...));
   }
 };
 
 template <typename Char, typename T, int N>
 struct is_compiled_format<field<Char, T, N>> : std::true_type {};
 
+// A replacement field that refers to argument with name.
+template <typename Char> struct runtime_named_field {
+  using char_type = Char;
+  basic_string_view<Char> name;
+
+  template <typename OutputIt, typename T>
+  constexpr static bool try_format_argument(
+      OutputIt& out,
+      // [[maybe_unused]] due to unused-but-set-parameter warning in GCC 7,8,9
+      [[maybe_unused]] basic_string_view<Char> arg_name, const T& arg) {
+    if constexpr (is_named_arg<typename std::remove_cv<T>::type>::value) {
+      if (arg_name == arg.name) {
+        out = write<Char>(out, arg.value);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  template <typename OutputIt, typename... Args>
+  constexpr OutputIt format(OutputIt out, const Args&... args) const {
+    bool found = (try_format_argument(out, name, args) || ...);
+    if (!found) {
+      FMT_THROW(format_error("argument with specified name is not found"));
+    }
+    return out;
+  }
+};
+
+template <typename Char>
+struct is_compiled_format<runtime_named_field<Char>> : std::true_type {};
+
 // A replacement field that refers to argument N and has format specifiers.
 template <typename Char, typename T, int N> struct spec_field {
   using char_type = Char;
-  mutable formatter<T, Char> fmt;
+  formatter<T, Char> fmt;
 
   template <typename OutputIt, typename... Args>
-  OutputIt format(OutputIt out, const Args&... args) const {
-    // This ensures that the argument type is convertile to `const T&`.
-    const T& arg = get<N>(args...);
+  constexpr FMT_INLINE OutputIt format(OutputIt out,
+                                       const Args&... args) const {
     const auto& vargs =
-        make_format_args<basic_format_context<OutputIt, Char>>(args...);
+        fmt::make_format_args<basic_format_context<OutputIt, Char>>(args...);
     basic_format_context<OutputIt, Char> ctx(out, vargs);
-    return fmt.format(arg, ctx);
+    return fmt.format(get_arg_checked<T, N>(args...), ctx);
   }
 };
 
@@ -460,7 +323,7 @@ template <typename L, typename R> struct concat {
   using char_type = typename L::char_type;
 
   template <typename OutputIt, typename... Args>
-  OutputIt format(OutputIt out, const Args&... args) const {
+  constexpr OutputIt format(OutputIt out, const Args&... args) const {
     out = lhs.format(out, args...);
     return rhs.format(out, args...);
   }
@@ -508,14 +371,79 @@ template <typename T, typename Char> struct parse_specs_result {
   int next_arg_id;
 };
 
+constexpr int manual_indexing_id = -1;
+
 template <typename T, typename Char>
 constexpr parse_specs_result<T, Char> parse_specs(basic_string_view<Char> str,
-                                                  size_t pos, int arg_id) {
+                                                  size_t pos, int next_arg_id) {
   str.remove_prefix(pos);
-  auto ctx = basic_format_parse_context<Char>(str, {}, arg_id + 1);
+  auto ctx = basic_format_parse_context<Char>(str, {}, next_arg_id);
   auto f = formatter<T, Char>();
   auto end = f.parse(ctx);
-  return {f, pos + (end - str.data()) + 1, ctx.next_arg_id()};
+  return {f, pos + fmt::detail::to_unsigned(end - str.data()) + 1,
+          next_arg_id == 0 ? manual_indexing_id : ctx.next_arg_id()};
+}
+
+template <typename Char> struct arg_id_handler {
+  arg_ref<Char> arg_id;
+
+  constexpr int operator()() {
+    FMT_ASSERT(false, "handler cannot be used with automatic indexing");
+    return 0;
+  }
+  constexpr int operator()(int id) {
+    arg_id = arg_ref<Char>(id);
+    return 0;
+  }
+  constexpr int operator()(basic_string_view<Char> id) {
+    arg_id = arg_ref<Char>(id);
+    return 0;
+  }
+
+  constexpr void on_error(const char* message) {
+    FMT_THROW(format_error(message));
+  }
+};
+
+template <typename Char> struct parse_arg_id_result {
+  arg_ref<Char> arg_id;
+  const Char* arg_id_end;
+};
+
+template <int ID, typename Char>
+constexpr auto parse_arg_id(const Char* begin, const Char* end) {
+  auto handler = arg_id_handler<Char>{arg_ref<Char>{}};
+  auto arg_id_end = parse_arg_id(begin, end, handler);
+  return parse_arg_id_result<Char>{handler.arg_id, arg_id_end};
+}
+
+template <typename T, typename Enable = void> struct field_type {
+  using type = remove_cvref_t<T>;
+};
+
+template <typename T>
+struct field_type<T, enable_if_t<detail::is_named_arg<T>::value>> {
+  using type = remove_cvref_t<decltype(T::value)>;
+};
+
+template <typename T, typename Args, size_t END_POS, int ARG_INDEX, int NEXT_ID,
+          typename S>
+constexpr auto parse_replacement_field_then_tail(S format_str) {
+  using char_type = typename S::char_type;
+  constexpr auto str = basic_string_view<char_type>(format_str);
+  constexpr char_type c = END_POS != str.size() ? str[END_POS] : char_type();
+  if constexpr (c == '}') {
+    return parse_tail<Args, END_POS + 1, NEXT_ID>(
+        field<char_type, typename field_type<T>::type, ARG_INDEX>(),
+        format_str);
+  } else if constexpr (c == ':') {
+    constexpr auto result = parse_specs<typename field_type<T>::type>(
+        str, END_POS + 1, NEXT_ID == manual_indexing_id ? 0 : NEXT_ID);
+    return parse_tail<Args, result.end, result.next_arg_id>(
+        spec_field<char_type, typename field_type<T>::type, ARG_INDEX>{
+            result.fmt},
+        format_str);
+  }
 }
 
 // Compiles a non-empty format string and returns the compiled representation
@@ -523,27 +451,59 @@ constexpr parse_specs_result<T, Char> parse_specs(basic_string_view<Char> str,
 template <typename Args, size_t POS, int ID, typename S>
 constexpr auto compile_format_string(S format_str) {
   using char_type = typename S::char_type;
-  constexpr basic_string_view<char_type> str = format_str;
+  constexpr auto str = basic_string_view<char_type>(format_str);
   if constexpr (str[POS] == '{') {
-    if (POS + 1 == str.size())
-      throw format_error("unmatched '{' in format string");
+    if constexpr (POS + 1 == str.size())
+      FMT_THROW(format_error("unmatched '{' in format string"));
     if constexpr (str[POS + 1] == '{') {
       return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), format_str);
-    } else if constexpr (str[POS + 1] == '}') {
-      using type = get_type<ID, Args>;
-      return parse_tail<Args, POS + 2, ID + 1>(field<char_type, type, ID>(),
-                                               format_str);
-    } else if constexpr (str[POS + 1] == ':') {
-      using type = get_type<ID, Args>;
-      constexpr auto result = parse_specs<type>(str, POS + 2, ID);
-      return parse_tail<Args, result.end, result.next_arg_id>(
-          spec_field<char_type, type, ID>{result.fmt}, format_str);
+    } else if constexpr (str[POS + 1] == '}' || str[POS + 1] == ':') {
+      static_assert(ID != manual_indexing_id,
+                    "cannot switch from manual to automatic argument indexing");
+      constexpr auto next_id =
+          ID != manual_indexing_id ? ID + 1 : manual_indexing_id;
+      return parse_replacement_field_then_tail<get_type<ID, Args>, Args,
+                                               POS + 1, ID, next_id>(
+          format_str);
     } else {
-      return unknown_format();
+      constexpr auto arg_id_result =
+          parse_arg_id<ID>(str.data() + POS + 1, str.data() + str.size());
+      constexpr auto arg_id_end_pos = arg_id_result.arg_id_end - str.data();
+      constexpr char_type c =
+          arg_id_end_pos != str.size() ? str[arg_id_end_pos] : char_type();
+      static_assert(c == '}' || c == ':', "missing '}' in format string");
+      if constexpr (arg_id_result.arg_id.kind == arg_id_kind::index) {
+        static_assert(
+            ID == manual_indexing_id || ID == 0,
+            "cannot switch from automatic to manual argument indexing");
+        constexpr auto arg_index = arg_id_result.arg_id.val.index;
+        return parse_replacement_field_then_tail<get_type<arg_index, Args>,
+                                                 Args, arg_id_end_pos,
+                                                 arg_index, manual_indexing_id>(
+            format_str);
+      } else if constexpr (arg_id_result.arg_id.kind == arg_id_kind::name) {
+        constexpr auto arg_index =
+            get_arg_index_by_name(arg_id_result.arg_id.val.name, Args{});
+        if constexpr (arg_index != invalid_arg_index) {
+          constexpr auto next_id =
+              ID != manual_indexing_id ? ID + 1 : manual_indexing_id;
+          return parse_replacement_field_then_tail<
+              decltype(get_type<arg_index, Args>::value), Args, arg_id_end_pos,
+              arg_index, next_id>(format_str);
+        } else {
+          if constexpr (c == '}') {
+            return parse_tail<Args, arg_id_end_pos + 1, ID>(
+                runtime_named_field<char_type>{arg_id_result.arg_id.val.name},
+                format_str);
+          } else if constexpr (c == ':') {
+            return unknown_format();  // no type info for specs parsing
+          }
+        }
+      }
     }
   } else if constexpr (str[POS] == '}') {
-    if (POS + 1 == str.size())
-      throw format_error("unmatched '}' in format string");
+    if constexpr (POS + 1 == str.size())
+      FMT_THROW(format_error("unmatched '}' in format string"));
     return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), format_str);
   } else {
     constexpr auto end = parse_text(str, POS + 1);
@@ -558,144 +518,125 @@ constexpr auto compile_format_string(S format_str) {
 }
 
 template <typename... Args, typename S,
-          FMT_ENABLE_IF(is_compile_string<S>::value ||
-                        detail::is_compiled_string<S>::value)>
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
 constexpr auto compile(S format_str) {
-  constexpr basic_string_view<typename S::char_type> str = format_str;
+  constexpr auto str = basic_string_view<typename S::char_type>(format_str);
   if constexpr (str.size() == 0) {
     return detail::make_text(str, 0, 0);
   } else {
     constexpr auto result =
         detail::compile_format_string<detail::type_list<Args...>, 0, 0>(
             format_str);
-    if constexpr (std::is_same<remove_cvref_t<decltype(result)>,
-                               detail::unknown_format>()) {
-      return detail::compiled_format<S, Args...>(to_string_view(format_str));
-    } else {
-      return result;
-    }
+    return result;
   }
 }
-#else
-template <typename... Args, typename S,
-          FMT_ENABLE_IF(is_compile_string<S>::value)>
-constexpr auto compile(S format_str) -> detail::compiled_format<S, Args...> {
-  return detail::compiled_format<S, Args...>(to_string_view(format_str));
-}
-#endif  // __cpp_if_constexpr
-
-// Compiles the format string which must be a string literal.
-template <typename... Args, typename Char, size_t N>
-auto compile(const Char (&format_str)[N])
-    -> detail::compiled_format<const Char*, Args...> {
-  return detail::compiled_format<const Char*, Args...>(
-      basic_string_view<Char>(format_str, N - 1));
-}
+#endif  // defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
 }  // namespace detail
 
-// DEPRECATED! use FMT_COMPILE instead.
-template <typename... Args>
-FMT_DEPRECATED auto compile(const Args&... args)
-    -> decltype(detail::compile(args...)) {
-  return detail::compile(args...);
-}
+FMT_MODULE_EXPORT_BEGIN
 
-#if FMT_USE_CONSTEXPR
-#  ifdef __cpp_if_constexpr
+#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
 
 template <typename CompiledFormat, typename... Args,
           typename Char = typename CompiledFormat::char_type,
           FMT_ENABLE_IF(detail::is_compiled_format<CompiledFormat>::value)>
 FMT_INLINE std::basic_string<Char> format(const CompiledFormat& cf,
                                           const Args&... args) {
-  basic_memory_buffer<Char> buffer;
-  cf.format(detail::buffer_appender<Char>(buffer), args...);
-  return to_string(buffer);
+  auto s = std::basic_string<Char>();
+  cf.format(std::back_inserter(s), args...);
+  return s;
 }
 
 template <typename OutputIt, typename CompiledFormat, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_format<CompiledFormat>::value)>
-OutputIt format_to(OutputIt out, const CompiledFormat& cf,
-                   const Args&... args) {
+constexpr FMT_INLINE OutputIt format_to(OutputIt out, const CompiledFormat& cf,
+                                        const Args&... args) {
   return cf.format(out, args...);
 }
-#  endif  // __cpp_if_constexpr
-#endif    // FMT_USE_CONSTEXPR
-
-template <typename CompiledFormat, typename... Args,
-          typename Char = typename CompiledFormat::char_type,
-          FMT_ENABLE_IF(std::is_base_of<detail::basic_compiled_format,
-                                        CompiledFormat>::value)>
-std::basic_string<Char> format(const CompiledFormat& cf, const Args&... args) {
-  basic_memory_buffer<Char> buffer;
-  using context = buffer_context<Char>;
-  detail::cf::vformat_to<context>(detail::buffer_appender<Char>(buffer), cf,
-                                  make_format_args<context>(args...));
-  return to_string(buffer);
-}
 
 template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
 FMT_INLINE std::basic_string<typename S::char_type> format(const S&,
                                                            Args&&... args) {
-#ifdef __cpp_if_constexpr
   if constexpr (std::is_same<typename S::char_type, char>::value) {
-    constexpr basic_string_view<typename S::char_type> str = S();
-    if (str.size() == 2 && str[0] == '{' && str[1] == '}')
-      return fmt::to_string(detail::first(args...));
+    constexpr auto str = basic_string_view<typename S::char_type>(S());
+    if constexpr (str.size() == 2 && str[0] == '{' && str[1] == '}') {
+      const auto& first = detail::first(args...);
+      if constexpr (detail::is_named_arg<
+                        remove_cvref_t<decltype(first)>>::value) {
+        return fmt::to_string(first.value);
+      } else {
+        return fmt::to_string(first);
+      }
+    }
   }
+  constexpr auto compiled = detail::compile<Args...>(S());
+  if constexpr (std::is_same<remove_cvref_t<decltype(compiled)>,
+                             detail::unknown_format>()) {
+    return format(static_cast<basic_string_view<typename S::char_type>>(S()),
+                  std::forward<Args>(args)...);
+  } else {
+    return format(compiled, std::forward<Args>(args)...);
+  }
+}
+
+template <typename OutputIt, typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+FMT_CONSTEXPR OutputIt format_to(OutputIt out, const S&, Args&&... args) {
+  constexpr auto compiled = detail::compile<Args...>(S());
+  if constexpr (std::is_same<remove_cvref_t<decltype(compiled)>,
+                             detail::unknown_format>()) {
+    return format_to(out,
+                     static_cast<basic_string_view<typename S::char_type>>(S()),
+                     std::forward<Args>(args)...);
+  } else {
+    return format_to(out, compiled, std::forward<Args>(args)...);
+  }
+}
 #endif
-  constexpr auto compiled = detail::compile<Args...>(S());
-  return format(compiled, std::forward<Args>(args)...);
-}
-
-template <typename OutputIt, typename CompiledFormat, typename... Args,
-          FMT_ENABLE_IF(std::is_base_of<detail::basic_compiled_format,
-                                        CompiledFormat>::value)>
-OutputIt format_to(OutputIt out, const CompiledFormat& cf,
-                   const Args&... args) {
-  using char_type = typename CompiledFormat::char_type;
-  using context = format_context_t<OutputIt, char_type>;
-  return detail::cf::vformat_to<context>(out, cf,
-                                         make_format_args<context>(args...));
-}
 
 template <typename OutputIt, typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-OutputIt format_to(OutputIt out, const S&, const Args&... args) {
-  constexpr auto compiled = detail::compile<Args...>(S());
-  return format_to(out, compiled, args...);
-}
-
-template <typename OutputIt, typename CompiledFormat, typename... Args>
-auto format_to_n(OutputIt out, size_t n, const CompiledFormat& cf,
-                 const Args&... args) ->
-    typename std::enable_if<
-        detail::is_output_iterator<OutputIt,
-                                   typename CompiledFormat::char_type>::value &&
-            std::is_base_of<detail::basic_compiled_format,
-                            CompiledFormat>::value,
-        format_to_n_result<OutputIt>>::type {
-  auto it =
-      format_to(detail::truncating_iterator<OutputIt>(out, n), cf, args...);
+format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n,
+                                         const S& format_str, Args&&... args) {
+  auto it = format_to(detail::truncating_iterator<OutputIt>(out, n), format_str,
+                      std::forward<Args>(args)...);
   return {it.base(), it.count()};
 }
 
-template <typename OutputIt, typename S, typename... Args,
+template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n, const S&,
-                                         const Args&... args) {
-  constexpr auto compiled = detail::compile<Args...>(S());
-  auto it = format_to(detail::truncating_iterator<OutputIt>(out, n), compiled,
-                      args...);
-  return {it.base(), it.count()};
+size_t formatted_size(const S& format_str, const Args&... args) {
+  return format_to(detail::counting_iterator(), format_str, args...).count();
 }
 
-template <typename CompiledFormat, typename... Args>
-size_t formatted_size(const CompiledFormat& cf, const Args&... args) {
-  return format_to(detail::counting_iterator(), cf, args...).count();
+template <typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+void print(std::FILE* f, const S& format_str, const Args&... args) {
+  memory_buffer buffer;
+  format_to(std::back_inserter(buffer), format_str, args...);
+  detail::print(f, {buffer.data(), buffer.size()});
 }
 
+template <typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+void print(const S& format_str, const Args&... args) {
+  print(stdout, format_str, args...);
+}
+
+#if FMT_USE_NONTYPE_TEMPLATE_PARAMETERS
+inline namespace literals {
+template <detail_exported::fixed_string Str>
+constexpr detail::udl_compiled_string<
+    remove_cvref_t<decltype(Str.data[0])>,
+    sizeof(Str.data) / sizeof(decltype(Str.data[0])), Str>
+operator""_cf() {
+  return {};
+}
+}  // namespace literals
+#endif
+
+FMT_MODULE_EXPORT_END
 FMT_END_NAMESPACE
 
 #endif  // FMT_COMPILE_H_
diff --git a/src/fmt/core.h b/src/fmt/core.h
index 7946921d8e..b7f9a960c6 100644
--- a/src/fmt/core.h
+++ b/src/fmt/core.h
@@ -1,4 +1,4 @@
-// Formatting library for C++ - the core API
+// Formatting library for C++ - the core API for char/UTF-8
 //
 // Copyright (c) 2012 - present, Victor Zverovich
 // All rights reserved.
@@ -8,42 +8,47 @@
 #ifndef FMT_CORE_H_
 #define FMT_CORE_H_
 
-#include <cstdio>  // std::FILE
+#include <cstddef>  // std::byte
+#include <cstdio>   // std::FILE
 #include <cstring>
-#include <functional>
 #include <iterator>
-#include <memory>
+#include <limits>
 #include <string>
 #include <type_traits>
-#include <vector>
 
 // The fmt library version in the form major * 10000 + minor * 100 + patch.
-#define FMT_VERSION 70103
+#define FMT_VERSION 80101
 
-#if defined (__clang__ ) && !defined(__ibmxl__)
+#if defined(__clang__) && !defined(__ibmxl__)
 #  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
 #else
 #  define FMT_CLANG_VERSION 0
 #endif
 
-#if defined(__GNUC__) && !defined(__clang__)
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && \
+    !defined(__NVCOMPILER)
 #  define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 #else
 #  define FMT_GCC_VERSION 0
 #endif
 
-#if defined(__INTEL_COMPILER)
+#ifndef FMT_GCC_PRAGMA
+// Workaround _Pragma bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59884.
+#  if FMT_GCC_VERSION >= 504
+#    define FMT_GCC_PRAGMA(arg) _Pragma(arg)
+#  else
+#    define FMT_GCC_PRAGMA(arg)
+#  endif
+#endif
+
+#ifdef __ICL
+#  define FMT_ICC_VERSION __ICL
+#elif defined(__INTEL_COMPILER)
 #  define FMT_ICC_VERSION __INTEL_COMPILER
 #else
 #  define FMT_ICC_VERSION 0
 #endif
 
-#if __cplusplus >= 201103L || defined(__GXX_EXPERIMENTAL_CXX0X__)
-#  define FMT_HAS_GXX_CXX11 FMT_GCC_VERSION
-#else
-#  define FMT_HAS_GXX_CXX11 0
-#endif
-
 #ifdef __NVCC__
 #  define FMT_NVCC __NVCC__
 #else
@@ -52,10 +57,10 @@
 
 #ifdef _MSC_VER
 #  define FMT_MSC_VER _MSC_VER
-#  define FMT_SUPPRESS_MSC_WARNING(n) __pragma(warning(suppress : n))
+#  define FMT_MSC_WARNING(...) __pragma(warning(__VA_ARGS__))
 #else
 #  define FMT_MSC_VER 0
-#  define FMT_SUPPRESS_MSC_WARNING(n)
+#  define FMT_MSC_WARNING(...)
 #endif
 
 #ifdef __has_feature
@@ -64,7 +69,8 @@
 #  define FMT_HAS_FEATURE(x) 0
 #endif
 
-#if defined(__has_include) && !defined(__INTELLISENSE__) && \
+#if defined(__has_include) &&                             \
+    (!defined(__INTELLISENSE__) || FMT_MSC_VER > 1900) && \
     (!FMT_ICC_VERSION || FMT_ICC_VERSION >= 1600)
 #  define FMT_HAS_INCLUDE(x) __has_include(x)
 #else
@@ -77,17 +83,23 @@
 #  define FMT_HAS_CPP_ATTRIBUTE(x) 0
 #endif
 
+#ifdef _MSVC_LANG
+#  define FMT_CPLUSPLUS _MSVC_LANG
+#else
+#  define FMT_CPLUSPLUS __cplusplus
+#endif
+
 #define FMT_HAS_CPP14_ATTRIBUTE(attribute) \
-  (__cplusplus >= 201402L && FMT_HAS_CPP_ATTRIBUTE(attribute))
+  (FMT_CPLUSPLUS >= 201402L && FMT_HAS_CPP_ATTRIBUTE(attribute))
 
 #define FMT_HAS_CPP17_ATTRIBUTE(attribute) \
-  (__cplusplus >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute))
+  (FMT_CPLUSPLUS >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute))
 
 // Check if relaxed C++14 constexpr is supported.
 // GCC doesn't allow throw in constexpr until version 6 (bug 67371).
 #ifndef FMT_USE_CONSTEXPR
 #  define FMT_USE_CONSTEXPR                                           \
-    (FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VER >= 1910 || \
+    (FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VER >= 1912 || \
      (FMT_GCC_VERSION >= 600 && __cplusplus >= 201402L)) &&           \
         !FMT_NVCC && !FMT_ICC_VERSION
 #endif
@@ -95,17 +107,32 @@
 #  define FMT_CONSTEXPR constexpr
 #  define FMT_CONSTEXPR_DECL constexpr
 #else
-#  define FMT_CONSTEXPR inline
+#  define FMT_CONSTEXPR
 #  define FMT_CONSTEXPR_DECL
 #endif
 
-#ifndef FMT_OVERRIDE
-#  if FMT_HAS_FEATURE(cxx_override_control) || \
-      (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1900
-#    define FMT_OVERRIDE override
-#  else
-#    define FMT_OVERRIDE
+#if ((__cplusplus >= 202002L) &&                              \
+     (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE > 9)) || \
+    (__cplusplus >= 201709L && FMT_GCC_VERSION >= 1002)
+#  define FMT_CONSTEXPR20 constexpr
+#else
+#  define FMT_CONSTEXPR20
+#endif
+
+// Check if constexpr std::char_traits<>::compare,length is supported.
+#if defined(__GLIBCXX__)
+#  if __cplusplus >= 201703L && defined(_GLIBCXX_RELEASE) && \
+      _GLIBCXX_RELEASE >= 7  // GCC 7+ libstdc++ has _GLIBCXX_RELEASE.
+#    define FMT_CONSTEXPR_CHAR_TRAITS constexpr
 #  endif
+#elif defined(_LIBCPP_VERSION) && __cplusplus >= 201703L && \
+    _LIBCPP_VERSION >= 4000
+#  define FMT_CONSTEXPR_CHAR_TRAITS constexpr
+#elif FMT_MSC_VER >= 1914 && _MSVC_LANG >= 201703L
+#  define FMT_CONSTEXPR_CHAR_TRAITS constexpr
+#endif
+#ifndef FMT_CONSTEXPR_CHAR_TRAITS
+#  define FMT_CONSTEXPR_CHAR_TRAITS
 #endif
 
 // Check if exceptions are disabled.
@@ -124,7 +151,7 @@
 #endif
 
 #if FMT_USE_NOEXCEPT || FMT_HAS_FEATURE(cxx_noexcept) || \
-    (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1900
+    FMT_GCC_VERSION >= 408 || FMT_MSC_VER >= 1900
 #  define FMT_DETECTED_NOEXCEPT noexcept
 #  define FMT_HAS_CXX11_NOEXCEPT 1
 #else
@@ -149,6 +176,49 @@
 #  define FMT_NORETURN
 #endif
 
+#if __cplusplus == 201103L || __cplusplus == 201402L
+#  if defined(__INTEL_COMPILER) || defined(__PGI)
+#    define FMT_FALLTHROUGH
+#  elif defined(__clang__)
+#    define FMT_FALLTHROUGH [[clang::fallthrough]]
+#  elif FMT_GCC_VERSION >= 700 && \
+      (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520)
+#    define FMT_FALLTHROUGH [[gnu::fallthrough]]
+#  else
+#    define FMT_FALLTHROUGH
+#  endif
+#elif FMT_HAS_CPP17_ATTRIBUTE(fallthrough)
+#  define FMT_FALLTHROUGH [[fallthrough]]
+#else
+#  define FMT_FALLTHROUGH
+#endif
+
+#ifndef FMT_NODISCARD
+#  if FMT_HAS_CPP17_ATTRIBUTE(nodiscard)
+#    define FMT_NODISCARD [[nodiscard]]
+#  else
+#    define FMT_NODISCARD
+#  endif
+#endif
+
+#ifndef FMT_USE_FLOAT
+#  define FMT_USE_FLOAT 1
+#endif
+#ifndef FMT_USE_DOUBLE
+#  define FMT_USE_DOUBLE 1
+#endif
+#ifndef FMT_USE_LONG_DOUBLE
+#  define FMT_USE_LONG_DOUBLE 1
+#endif
+
+#ifndef FMT_INLINE
+#  if FMT_GCC_VERSION || FMT_CLANG_VERSION
+#    define FMT_INLINE inline __attribute__((always_inline))
+#  else
+#    define FMT_INLINE inline
+#  endif
+#endif
+
 #ifndef FMT_DEPRECATED
 #  if FMT_HAS_CPP14_ATTRIBUTE(deprecated) || FMT_MSC_VER >= 1900
 #    define FMT_DEPRECATED [[deprecated]]
@@ -163,82 +233,47 @@
 #  endif
 #endif
 
-// Workaround broken [[deprecated]] in the Intel, PGI and NVCC compilers.
-#if FMT_ICC_VERSION || defined(__PGI) || FMT_NVCC
-#  define FMT_DEPRECATED_ALIAS
-#else
-#  define FMT_DEPRECATED_ALIAS FMT_DEPRECATED
-#endif
-
-#ifndef FMT_INLINE
-#  if FMT_GCC_VERSION || FMT_CLANG_VERSION
-#    define FMT_INLINE inline __attribute__((always_inline))
-#  else
-#    define FMT_INLINE inline
-#  endif
-#endif
-
-#ifndef FMT_USE_INLINE_NAMESPACES
-#  if FMT_HAS_FEATURE(cxx_inline_namespaces) || FMT_GCC_VERSION >= 404 || \
-      (FMT_MSC_VER >= 1900 && !_MANAGED)
-#    define FMT_USE_INLINE_NAMESPACES 1
-#  else
-#    define FMT_USE_INLINE_NAMESPACES 0
-#  endif
-#endif
-
 // LAMMPS customization
-// use 'v7_lmp' namespace instead of 'v7' so that our
+// use 'v8_lmp' namespace instead of 'v8' so that our
 // bundled copy does not collide with linking other code
 // using system wide installations which may be using
 // a different version.
 
 #ifndef FMT_BEGIN_NAMESPACE
-#  if FMT_USE_INLINE_NAMESPACES
-#    define FMT_INLINE_NAMESPACE inline namespace
-#    define FMT_END_NAMESPACE \
-      }                       \
-      }
-#  else
-#    define FMT_INLINE_NAMESPACE namespace
-#    define FMT_END_NAMESPACE \
-      }                       \
-      using namespace v7_lmp;     \
-      }
-#  endif
 #  define FMT_BEGIN_NAMESPACE \
     namespace fmt {           \
-    FMT_INLINE_NAMESPACE v7_lmp {
+    inline namespace v8_lmp {
+#  define FMT_END_NAMESPACE \
+    }                       \
+    }
+#endif
+
+#ifndef FMT_MODULE_EXPORT
+#  define FMT_MODULE_EXPORT
+#  define FMT_MODULE_EXPORT_BEGIN
+#  define FMT_MODULE_EXPORT_END
+#  define FMT_BEGIN_DETAIL_NAMESPACE namespace detail {
+#  define FMT_END_DETAIL_NAMESPACE }
 #endif
 
 #if !defined(FMT_HEADER_ONLY) && defined(_WIN32)
-#  define FMT_CLASS_API FMT_SUPPRESS_MSC_WARNING(4275)
+#  define FMT_CLASS_API FMT_MSC_WARNING(suppress : 4275)
 #  ifdef FMT_EXPORT
 #    define FMT_API __declspec(dllexport)
-#    define FMT_EXTERN_TEMPLATE_API FMT_API
-#    define FMT_EXPORTED
 #  elif defined(FMT_SHARED)
 #    define FMT_API __declspec(dllimport)
-#    define FMT_EXTERN_TEMPLATE_API FMT_API
 #  endif
 #else
 #  define FMT_CLASS_API
+#  if defined(FMT_EXPORT) || defined(FMT_SHARED)
+#    if defined(__GNUC__) || defined(__clang__)
+#      define FMT_API __attribute__((visibility("default")))
+#    endif
+#  endif
 #endif
 #ifndef FMT_API
 #  define FMT_API
 #endif
-#ifndef FMT_EXTERN_TEMPLATE_API
-#  define FMT_EXTERN_TEMPLATE_API
-#endif
-#ifndef FMT_INSTANTIATION_DEF_API
-#  define FMT_INSTANTIATION_DEF_API FMT_API
-#endif
-
-#ifndef FMT_HEADER_ONLY
-#  define FMT_EXTERN extern
-#else
-#  define FMT_EXTERN
-#endif
 
 // libc++ supports string_view in pre-c++17.
 #if (FMT_HAS_INCLUDE(<string_view>) &&                       \
@@ -254,16 +289,43 @@
 #ifndef FMT_UNICODE
 #  define FMT_UNICODE !FMT_MSC_VER
 #endif
-#if FMT_UNICODE && FMT_MSC_VER
-#  pragma execution_character_set("utf-8")
+
+#ifndef FMT_CONSTEVAL
+#  if ((FMT_GCC_VERSION >= 1000 || FMT_CLANG_VERSION >= 1101) &&      \
+       __cplusplus > 201703L && !defined(__apple_build_version__)) || \
+      (defined(__cpp_consteval) &&                                    \
+       (!FMT_MSC_VER || _MSC_FULL_VER >= 193030704))
+// consteval is broken in MSVC before VS2022 and Apple clang 13.
+#    define FMT_CONSTEVAL consteval
+#    define FMT_HAS_CONSTEVAL
+#  else
+#    define FMT_CONSTEVAL
+#  endif
+#endif
+
+#ifndef FMT_USE_NONTYPE_TEMPLATE_PARAMETERS
+#  if defined(__cpp_nontype_template_args) &&                \
+      ((FMT_GCC_VERSION >= 903 && __cplusplus >= 201709L) || \
+       __cpp_nontype_template_args >= 201911L)
+#    define FMT_USE_NONTYPE_TEMPLATE_PARAMETERS 1
+#  else
+#    define FMT_USE_NONTYPE_TEMPLATE_PARAMETERS 0
+#  endif
+#endif
+
+// Enable minimal optimizations for more compact code in debug mode.
+FMT_GCC_PRAGMA("GCC push_options")
+#ifndef __OPTIMIZE__
+FMT_GCC_PRAGMA("GCC optimize(\"Og\")")
 #endif
 
 FMT_BEGIN_NAMESPACE
+FMT_MODULE_EXPORT_BEGIN
 
 // Implementations of enable_if_t and other metafunctions for older systems.
-template <bool B, class T = void>
+template <bool B, typename T = void>
 using enable_if_t = typename std::enable_if<B, T>::type;
-template <bool B, class T, class F>
+template <bool B, typename T, typename F>
 using conditional_t = typename std::conditional<B, T, F>::type;
 template <bool B> using bool_constant = std::integral_constant<bool, B>;
 template <typename T>
@@ -275,17 +337,40 @@ using remove_cvref_t = typename std::remove_cv<remove_reference_t<T>>::type;
 template <typename T> struct type_identity { using type = T; };
 template <typename T> using type_identity_t = typename type_identity<T>::type;
 
-struct monostate {};
+struct monostate {
+  constexpr monostate() {}
+};
 
 // An enable_if helper to be used in template parameters which results in much
 // shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed
 // to workaround a bug in MSVC 2019 (see #1140 and #1186).
-#define FMT_ENABLE_IF(...) enable_if_t<(__VA_ARGS__), int> = 0
+#ifdef FMT_DOC
+#  define FMT_ENABLE_IF(...)
+#else
+#  define FMT_ENABLE_IF(...) enable_if_t<(__VA_ARGS__), int> = 0
+#endif
 
-namespace detail {
+FMT_BEGIN_DETAIL_NAMESPACE
 
-// A helper function to suppress "conditional expression is constant" warnings.
-template <typename T> constexpr T const_check(T value) { return value; }
+// Suppress "unused variable" warnings with the method described in
+// https://herbsutter.com/2009/10/18/mailbag-shutting-up-compiler-warnings/.
+// (void)var does not work on many Intel compilers.
+template <typename... T> FMT_CONSTEXPR void ignore_unused(const T&...) {}
+
+constexpr FMT_INLINE auto is_constant_evaluated(bool default_value = false)
+    FMT_NOEXCEPT -> bool {
+#ifdef __cpp_lib_is_constant_evaluated
+  ignore_unused(default_value);
+  return std::is_constant_evaluated();
+#else
+  return default_value;
+#endif
+}
+
+// A function to suppress "conditional expression is constant" warnings.
+template <typename T> constexpr FMT_INLINE auto const_check(T value) -> T {
+  return value;
+}
 
 FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
                                       const char* message);
@@ -293,7 +378,8 @@ FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
 #ifndef FMT_ASSERT
 #  ifdef NDEBUG
 // FMT_ASSERT is not empty to avoid -Werror=empty-body.
-#    define FMT_ASSERT(condition, message) ((void)0)
+#    define FMT_ASSERT(condition, message) \
+      ::fmt::detail::ignore_unused((condition), (message))
 #  else
 #    define FMT_ASSERT(condition, message)                                    \
       ((condition) /* void() fails with -Winvalid-constexpr on clang 4.0.1 */ \
@@ -302,6 +388,12 @@ FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
 #  endif
 #endif
 
+#ifdef __cpp_lib_byte
+using byte = std::byte;
+#else
+enum class byte : unsigned char {};
+#endif
+
 #if defined(FMT_USE_STRING_VIEW)
 template <typename Char> using std_string_view = std::basic_string_view<Char>;
 #elif defined(FMT_USE_EXPERIMENTAL_STRING_VIEW)
@@ -318,38 +410,39 @@ template <typename T> struct std_string_view {};
 #  define FMT_USE_INT128 1
 using int128_t = __int128_t;
 using uint128_t = __uint128_t;
+template <typename T> inline auto convert_for_visit(T value) -> T {
+  return value;
+}
 #else
 #  define FMT_USE_INT128 0
 #endif
 #if !FMT_USE_INT128
-struct int128_t {};
-struct uint128_t {};
+enum class int128_t {};
+enum class uint128_t {};
+// Reduce template instantiations.
+template <typename T> inline auto convert_for_visit(T) -> monostate {
+  return {};
+}
 #endif
 
 // Casts a nonnegative integer to unsigned.
 template <typename Int>
-FMT_CONSTEXPR typename std::make_unsigned<Int>::type to_unsigned(Int value) {
+FMT_CONSTEXPR auto to_unsigned(Int value) ->
+    typename std::make_unsigned<Int>::type {
   FMT_ASSERT(value >= 0, "negative value");
   return static_cast<typename std::make_unsigned<Int>::type>(value);
 }
 
-FMT_SUPPRESS_MSC_WARNING(4566) constexpr unsigned char micro[] = "\u00B5";
+FMT_MSC_WARNING(suppress : 4566) constexpr unsigned char micro[] = "\u00B5";
 
-template <typename Char> constexpr bool is_unicode() {
-  return FMT_UNICODE || sizeof(Char) != 1 ||
-         (sizeof(micro) == 3 && micro[0] == 0xC2 && micro[1] == 0xB5);
+constexpr auto is_utf8() -> bool {
+  // Avoid buggy sign extensions in MSVC's constant evaluation mode.
+  // https://developercommunity.visualstudio.com/t/C-difference-in-behavior-for-unsigned/1233612
+  using uchar = unsigned char;
+  return FMT_UNICODE || (sizeof(micro) == 3 && uchar(micro[0]) == 0xC2 &&
+                         uchar(micro[1]) == 0xB5);
 }
-
-#ifdef __cpp_char8_t
-using char8_type = char8_t;
-#else
-enum char8_type : unsigned char {};
-#endif
-}  // namespace detail
-
-#ifdef FMT_USE_INTERNAL
-namespace internal = detail;  // DEPRECATED
-#endif
+FMT_END_DETAIL_NAMESPACE
 
 /**
   An implementation of ``std::basic_string_view`` for pre-C++17. It provides a
@@ -380,11 +473,14 @@ template <typename Char> class basic_string_view {
     the size with ``std::char_traits<Char>::length``.
     \endrst
    */
-#if __cplusplus >= 201703L  // C++17's char_traits::length() is constexpr.
-  FMT_CONSTEXPR
-#endif
+  FMT_CONSTEXPR_CHAR_TRAITS
+  FMT_INLINE
   basic_string_view(const Char* s)
-      : data_(s), size_(std::char_traits<Char>::length(s)) {}
+      : data_(s),
+        size_(detail::const_check(std::is_same<Char, char>::value &&
+                                  !detail::is_constant_evaluated(true))
+                  ? std::strlen(reinterpret_cast<const char*>(s))
+                  : std::char_traits<Char>::length(s)) {}
 
   /** Constructs a string reference from a ``std::basic_string`` object. */
   template <typename Traits, typename Alloc>
@@ -399,23 +495,25 @@ template <typename Char> class basic_string_view {
                                                       size_(s.size()) {}
 
   /** Returns a pointer to the string data. */
-  constexpr const Char* data() const { return data_; }
+  constexpr auto data() const FMT_NOEXCEPT -> const Char* { return data_; }
 
   /** Returns the string size. */
-  constexpr size_t size() const { return size_; }
+  constexpr auto size() const FMT_NOEXCEPT -> size_t { return size_; }
 
-  constexpr iterator begin() const { return data_; }
-  constexpr iterator end() const { return data_ + size_; }
+  constexpr auto begin() const FMT_NOEXCEPT -> iterator { return data_; }
+  constexpr auto end() const FMT_NOEXCEPT -> iterator { return data_ + size_; }
 
-  constexpr const Char& operator[](size_t pos) const { return data_[pos]; }
+  constexpr auto operator[](size_t pos) const FMT_NOEXCEPT -> const Char& {
+    return data_[pos];
+  }
 
-  FMT_CONSTEXPR void remove_prefix(size_t n) {
+  FMT_CONSTEXPR void remove_prefix(size_t n) FMT_NOEXCEPT {
     data_ += n;
     size_ -= n;
   }
 
   // Lexicographically compare this string reference to other.
-  int compare(basic_string_view other) const {
+  FMT_CONSTEXPR_CHAR_TRAITS auto compare(basic_string_view other) const -> int {
     size_t str_size = size_ < other.size_ ? size_ : other.size_;
     int result = std::char_traits<Char>::compare(data_, other.data_, str_size);
     if (result == 0)
@@ -423,72 +521,53 @@ template <typename Char> class basic_string_view {
     return result;
   }
 
-  friend bool operator==(basic_string_view lhs, basic_string_view rhs) {
+  FMT_CONSTEXPR_CHAR_TRAITS friend auto operator==(basic_string_view lhs,
+                                                   basic_string_view rhs)
+      -> bool {
     return lhs.compare(rhs) == 0;
   }
-  friend bool operator!=(basic_string_view lhs, basic_string_view rhs) {
+  friend auto operator!=(basic_string_view lhs, basic_string_view rhs) -> bool {
     return lhs.compare(rhs) != 0;
   }
-  friend bool operator<(basic_string_view lhs, basic_string_view rhs) {
+  friend auto operator<(basic_string_view lhs, basic_string_view rhs) -> bool {
     return lhs.compare(rhs) < 0;
   }
-  friend bool operator<=(basic_string_view lhs, basic_string_view rhs) {
+  friend auto operator<=(basic_string_view lhs, basic_string_view rhs) -> bool {
     return lhs.compare(rhs) <= 0;
   }
-  friend bool operator>(basic_string_view lhs, basic_string_view rhs) {
+  friend auto operator>(basic_string_view lhs, basic_string_view rhs) -> bool {
     return lhs.compare(rhs) > 0;
   }
-  friend bool operator>=(basic_string_view lhs, basic_string_view rhs) {
+  friend auto operator>=(basic_string_view lhs, basic_string_view rhs) -> bool {
     return lhs.compare(rhs) >= 0;
   }
 };
 
 using string_view = basic_string_view<char>;
-using wstring_view = basic_string_view<wchar_t>;
 
 /** Specifies if ``T`` is a character type. Can be specialized by users. */
 template <typename T> struct is_char : std::false_type {};
 template <> struct is_char<char> : std::true_type {};
-template <> struct is_char<wchar_t> : std::true_type {};
-template <> struct is_char<detail::char8_type> : std::true_type {};
-template <> struct is_char<char16_t> : std::true_type {};
-template <> struct is_char<char32_t> : std::true_type {};
 
-/**
-  \rst
-  Returns a string view of `s`. In order to add custom string type support to
-  {fmt} provide an overload of `to_string_view` for it in the same namespace as
-  the type for the argument-dependent lookup to work.
-
-  **Example**::
-
-    namespace my_ns {
-    inline string_view to_string_view(const my_string& s) {
-      return {s.data(), s.length()};
-    }
-    }
-    std::string message = fmt::format(my_string("The answer is {}"), 42);
-  \endrst
- */
+// Returns a string view of `s`.
 template <typename Char, FMT_ENABLE_IF(is_char<Char>::value)>
-inline basic_string_view<Char> to_string_view(const Char* s) {
+FMT_INLINE auto to_string_view(const Char* s) -> basic_string_view<Char> {
   return s;
 }
-
 template <typename Char, typename Traits, typename Alloc>
-inline basic_string_view<Char> to_string_view(
-    const std::basic_string<Char, Traits, Alloc>& s) {
+inline auto to_string_view(const std::basic_string<Char, Traits, Alloc>& s)
+    -> basic_string_view<Char> {
   return s;
 }
-
 template <typename Char>
-inline basic_string_view<Char> to_string_view(basic_string_view<Char> s) {
+constexpr auto to_string_view(basic_string_view<Char> s)
+    -> basic_string_view<Char> {
   return s;
 }
-
 template <typename Char,
           FMT_ENABLE_IF(!std::is_empty<detail::std_string_view<Char>>::value)>
-inline basic_string_view<Char> to_string_view(detail::std_string_view<Char> s) {
+inline auto to_string_view(detail::std_string_view<Char> s)
+    -> basic_string_view<Char> {
   return s;
 }
 
@@ -500,15 +579,15 @@ template <typename S>
 struct is_compile_string : std::is_base_of<compile_string, S> {};
 
 template <typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
-constexpr basic_string_view<typename S::char_type> to_string_view(const S& s) {
-  return s;
+constexpr auto to_string_view(const S& s)
+    -> basic_string_view<typename S::char_type> {
+  return basic_string_view<typename S::char_type>(s);
 }
 
-// LAMMPS customization using 'v7_lmp' instead of 'v7'
+FMT_BEGIN_DETAIL_NAMESPACE
 
-namespace detail {
 void to_string_view(...);
-using fmt::v7_lmp::to_string_view;
+using fmt::to_string_view;
 
 // Specifies whether S is a string type convertible to fmt::basic_string_view.
 // It should be a constexpr function but MSVC 2017 fails to compile it in
@@ -535,6 +614,8 @@ FMT_INLINE void check_format_string(const S&) {
 template <typename..., typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
 void check_format_string(S);
 
+FMT_NORETURN FMT_API void throw_format_error(const char* message);
+
 struct error_handler {
   constexpr error_handler() = default;
   constexpr error_handler(const error_handler&) = default;
@@ -542,7 +623,7 @@ struct error_handler {
   // This function is intentionally not constexpr to give a compile-time error.
   FMT_NORETURN FMT_API void on_error(const char* message);
 };
-}  // namespace detail
+FMT_END_DETAIL_NAMESPACE
 
 /** String's character type. */
 template <typename S> using char_t = typename detail::char_t_impl<S>::type;
@@ -551,16 +632,7 @@ template <typename S> using char_t = typename detail::char_t_impl<S>::type;
   \rst
   Parsing context consisting of a format string range being parsed and an
   argument counter for automatic indexing.
-
-  You can use one of the following type aliases for common character types:
-
-  +-----------------------+-------------------------------------+
-  | Type                  | Definition                          |
-  +=======================+=====================================+
-  | format_parse_context  | basic_format_parse_context<char>    |
-  +-----------------------+-------------------------------------+
-  | wformat_parse_context | basic_format_parse_context<wchar_t> |
-  +-----------------------+-------------------------------------+
+  You can use the ``format_parse_context`` type alias for ``char`` instead.
   \endrst
  */
 template <typename Char, typename ErrorHandler = detail::error_handler>
@@ -582,12 +654,16 @@ class basic_format_parse_context : private ErrorHandler {
     Returns an iterator to the beginning of the format string range being
     parsed.
    */
-  constexpr iterator begin() const FMT_NOEXCEPT { return format_str_.begin(); }
+  constexpr auto begin() const FMT_NOEXCEPT -> iterator {
+    return format_str_.begin();
+  }
 
   /**
     Returns an iterator past the end of the format string range being parsed.
    */
-  constexpr iterator end() const FMT_NOEXCEPT { return format_str_.end(); }
+  constexpr auto end() const FMT_NOEXCEPT -> iterator {
+    return format_str_.end();
+  }
 
   /** Advances the begin iterator to ``it``. */
   FMT_CONSTEXPR void advance_to(iterator it) {
@@ -598,7 +674,7 @@ class basic_format_parse_context : private ErrorHandler {
     Reports an error if using the manual argument indexing; otherwise returns
     the next argument index and switches to the automatic indexing.
    */
-  FMT_CONSTEXPR int next_arg_id() {
+  FMT_CONSTEXPR auto next_arg_id() -> int {
     // Don't check if the argument id is valid to avoid overhead and because it
     // will be checked during formatting anyway.
     if (next_arg_id_ >= 0) return next_arg_id_++;
@@ -623,11 +699,10 @@ class basic_format_parse_context : private ErrorHandler {
     ErrorHandler::on_error(message);
   }
 
-  constexpr ErrorHandler error_handler() const { return *this; }
+  constexpr auto error_handler() const -> ErrorHandler { return *this; }
 };
 
 using format_parse_context = basic_format_parse_context<char>;
-using wformat_parse_context = basic_format_parse_context<wchar_t>;
 
 template <typename Context> class basic_format_arg;
 template <typename Context> class basic_format_args;
@@ -651,11 +726,30 @@ template <typename T> struct is_contiguous : std::false_type {};
 template <typename Char>
 struct is_contiguous<std::basic_string<Char>> : std::true_type {};
 
-namespace detail {
+class appender;
+
+FMT_BEGIN_DETAIL_NAMESPACE
+
+template <typename Context, typename T>
+constexpr auto has_const_formatter_impl(T*)
+    -> decltype(typename Context::template formatter_type<T>().format(
+                    std::declval<const T&>(), std::declval<Context&>()),
+                true) {
+  return true;
+}
+template <typename Context>
+constexpr auto has_const_formatter_impl(...) -> bool {
+  return false;
+}
+template <typename T, typename Context>
+constexpr auto has_const_formatter() -> bool {
+  return has_const_formatter_impl<Context>(static_cast<T*>(nullptr));
+}
 
 // Extracts a reference to the container from back_insert_iterator.
 template <typename Container>
-inline Container& get_container(std::back_insert_iterator<Container> it) {
+inline auto get_container(std::back_insert_iterator<Container> it)
+    -> Container& {
   using bi_iterator = std::back_insert_iterator<Container>;
   struct accessor : bi_iterator {
     accessor(bi_iterator iter) : bi_iterator(iter) {}
@@ -664,6 +758,23 @@ inline Container& get_container(std::back_insert_iterator<Container> it) {
   return *accessor(it).container;
 }
 
+template <typename Char, typename InputIt, typename OutputIt>
+FMT_CONSTEXPR auto copy_str(InputIt begin, InputIt end, OutputIt out)
+    -> OutputIt {
+  while (begin != end) *out++ = static_cast<Char>(*begin++);
+  return out;
+}
+
+template <typename Char, typename T, typename U,
+          FMT_ENABLE_IF(
+              std::is_same<remove_const_t<T>, U>::value&& is_char<U>::value)>
+FMT_CONSTEXPR auto copy_str(T* begin, T* end, U* out) -> U* {
+  if (is_constant_evaluated()) return copy_str<Char, T*, U*>(begin, end, out);
+  auto size = to_unsigned(end - begin);
+  memcpy(out, begin, size * sizeof(U));
+  return out + size;
+}
+
 /**
   \rst
   A contiguous memory buffer with an optional growing ability. It is an internal
@@ -678,24 +789,25 @@ template <typename T> class buffer {
 
  protected:
   // Don't initialize ptr_ since it is not accessed to save a few cycles.
-  FMT_SUPPRESS_MSC_WARNING(26495)
+  FMT_MSC_WARNING(suppress : 26495)
   buffer(size_t sz) FMT_NOEXCEPT : size_(sz), capacity_(sz) {}
 
-  buffer(T* p = nullptr, size_t sz = 0, size_t cap = 0) FMT_NOEXCEPT
-      : ptr_(p),
-        size_(sz),
-        capacity_(cap) {}
+  FMT_CONSTEXPR20 buffer(T* p = nullptr, size_t sz = 0,
+                         size_t cap = 0) FMT_NOEXCEPT : ptr_(p),
+                                                        size_(sz),
+                                                        capacity_(cap) {}
 
-  ~buffer() = default;
+  FMT_CONSTEXPR20 ~buffer() = default;
+  buffer(buffer&&) = default;
 
   /** Sets the buffer data and capacity. */
-  void set(T* buf_data, size_t buf_capacity) FMT_NOEXCEPT {
+  FMT_CONSTEXPR void set(T* buf_data, size_t buf_capacity) FMT_NOEXCEPT {
     ptr_ = buf_data;
     capacity_ = buf_capacity;
   }
 
   /** Increases the buffer capacity to hold at least *capacity* elements. */
-  virtual void grow(size_t capacity) = 0;
+  virtual FMT_CONSTEXPR20 void grow(size_t capacity) = 0;
 
  public:
   using value_type = T;
@@ -704,30 +816,30 @@ template <typename T> class buffer {
   buffer(const buffer&) = delete;
   void operator=(const buffer&) = delete;
 
-  T* begin() FMT_NOEXCEPT { return ptr_; }
-  T* end() FMT_NOEXCEPT { return ptr_ + size_; }
+  auto begin() FMT_NOEXCEPT -> T* { return ptr_; }
+  auto end() FMT_NOEXCEPT -> T* { return ptr_ + size_; }
 
-  const T* begin() const FMT_NOEXCEPT { return ptr_; }
-  const T* end() const FMT_NOEXCEPT { return ptr_ + size_; }
+  auto begin() const FMT_NOEXCEPT -> const T* { return ptr_; }
+  auto end() const FMT_NOEXCEPT -> const T* { return ptr_ + size_; }
 
   /** Returns the size of this buffer. */
-  size_t size() const FMT_NOEXCEPT { return size_; }
+  constexpr auto size() const FMT_NOEXCEPT -> size_t { return size_; }
 
   /** Returns the capacity of this buffer. */
-  size_t capacity() const FMT_NOEXCEPT { return capacity_; }
+  constexpr auto capacity() const FMT_NOEXCEPT -> size_t { return capacity_; }
 
   /** Returns a pointer to the buffer data. */
-  T* data() FMT_NOEXCEPT { return ptr_; }
+  FMT_CONSTEXPR auto data() FMT_NOEXCEPT -> T* { return ptr_; }
 
   /** Returns a pointer to the buffer data. */
-  const T* data() const FMT_NOEXCEPT { return ptr_; }
+  FMT_CONSTEXPR auto data() const FMT_NOEXCEPT -> const T* { return ptr_; }
 
   /** Clears this buffer. */
   void clear() { size_ = 0; }
 
   // Tries resizing the buffer to contain *count* elements. If T is a POD type
   // the new elements may not be initialized.
-  void try_resize(size_t count) {
+  FMT_CONSTEXPR20 void try_resize(size_t count) {
     try_reserve(count);
     size_ = count <= capacity_ ? count : capacity_;
   }
@@ -736,11 +848,11 @@ template <typename T> class buffer {
   // capacity by a smaller amount than requested but guarantees there is space
   // for at least one additional element either by increasing the capacity or by
   // flushing the buffer if it is full.
-  void try_reserve(size_t new_capacity) {
+  FMT_CONSTEXPR20 void try_reserve(size_t new_capacity) {
     if (new_capacity > capacity_) grow(new_capacity);
   }
 
-  void push_back(const T& value) {
+  FMT_CONSTEXPR20 void push_back(const T& value) {
     try_reserve(size_ + 1);
     ptr_[size_++] = value;
   }
@@ -748,16 +860,19 @@ template <typename T> class buffer {
   /** Appends data to the end of the buffer. */
   template <typename U> void append(const U* begin, const U* end);
 
-  template <typename I> T& operator[](I index) { return ptr_[index]; }
-  template <typename I> const T& operator[](I index) const {
+  template <typename I> FMT_CONSTEXPR auto operator[](I index) -> T& {
+    return ptr_[index];
+  }
+  template <typename I>
+  FMT_CONSTEXPR auto operator[](I index) const -> const T& {
     return ptr_[index];
   }
 };
 
 struct buffer_traits {
   explicit buffer_traits(size_t) {}
-  size_t count() const { return 0; }
-  size_t limit(size_t size) { return size; }
+  auto count() const -> size_t { return 0; }
+  auto limit(size_t size) -> size_t { return size; }
 };
 
 class fixed_buffer_traits {
@@ -767,8 +882,8 @@ class fixed_buffer_traits {
 
  public:
   explicit fixed_buffer_traits(size_t limit) : limit_(limit) {}
-  size_t count() const { return count_; }
-  size_t limit(size_t size) {
+  auto count() const -> size_t { return count_; }
+  auto limit(size_t size) -> size_t {
     size_t n = limit_ > count_ ? limit_ - count_ : 0;
     count_ += size;
     return size < n ? size : n;
@@ -784,33 +899,84 @@ class iterator_buffer final : public Traits, public buffer<T> {
   T data_[buffer_size];
 
  protected:
-  void grow(size_t) final FMT_OVERRIDE {
+  FMT_CONSTEXPR20 void grow(size_t) override {
     if (this->size() == buffer_size) flush();
   }
-  void flush();
+
+  void flush() {
+    auto size = this->size();
+    this->clear();
+    out_ = copy_str<T>(data_, data_ + this->limit(size), out_);
+  }
 
  public:
   explicit iterator_buffer(OutputIt out, size_t n = buffer_size)
-      : Traits(n),
-        buffer<T>(data_, 0, buffer_size),
-        out_(out) {}
+      : Traits(n), buffer<T>(data_, 0, buffer_size), out_(out) {}
+  iterator_buffer(iterator_buffer&& other)
+      : Traits(other), buffer<T>(data_, 0, buffer_size), out_(other.out_) {}
   ~iterator_buffer() { flush(); }
 
-  OutputIt out() {
+  auto out() -> OutputIt {
     flush();
     return out_;
   }
-  size_t count() const { return Traits::count() + this->size(); }
+  auto count() const -> size_t { return Traits::count() + this->size(); }
+};
+
+template <typename T>
+class iterator_buffer<T*, T, fixed_buffer_traits> final
+    : public fixed_buffer_traits,
+      public buffer<T> {
+ private:
+  T* out_;
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+
+ protected:
+  FMT_CONSTEXPR20 void grow(size_t) override {
+    if (this->size() == this->capacity()) flush();
+  }
+
+  void flush() {
+    size_t n = this->limit(this->size());
+    if (this->data() == out_) {
+      out_ += n;
+      this->set(data_, buffer_size);
+    }
+    this->clear();
+  }
+
+ public:
+  explicit iterator_buffer(T* out, size_t n = buffer_size)
+      : fixed_buffer_traits(n), buffer<T>(out, 0, n), out_(out) {}
+  iterator_buffer(iterator_buffer&& other)
+      : fixed_buffer_traits(other),
+        buffer<T>(std::move(other)),
+        out_(other.out_) {
+    if (this->data() != out_) {
+      this->set(data_, buffer_size);
+      this->clear();
+    }
+  }
+  ~iterator_buffer() { flush(); }
+
+  auto out() -> T* {
+    flush();
+    return out_;
+  }
+  auto count() const -> size_t {
+    return fixed_buffer_traits::count() + this->size();
+  }
 };
 
 template <typename T> class iterator_buffer<T*, T> final : public buffer<T> {
  protected:
-  void grow(size_t) final FMT_OVERRIDE {}
+  FMT_CONSTEXPR20 void grow(size_t) override {}
 
  public:
   explicit iterator_buffer(T* out, size_t = 0) : buffer<T>(out, 0, ~size_t()) {}
 
-  T* out() { return &*this->end(); }
+  auto out() -> T* { return &*this->end(); }
 };
 
 // A buffer that writes to a container with the contiguous storage.
@@ -823,7 +989,7 @@ class iterator_buffer<std::back_insert_iterator<Container>,
   Container& container_;
 
  protected:
-  void grow(size_t capacity) final FMT_OVERRIDE {
+  FMT_CONSTEXPR20 void grow(size_t capacity) override {
     container_.resize(capacity);
     this->set(&container_[0], capacity);
   }
@@ -833,7 +999,7 @@ class iterator_buffer<std::back_insert_iterator<Container>,
       : buffer<typename Container::value_type>(c.size()), container_(c) {}
   explicit iterator_buffer(std::back_insert_iterator<Container> out, size_t = 0)
       : iterator_buffer(get_container(out)) {}
-  std::back_insert_iterator<Container> out() {
+  auto out() -> std::back_insert_iterator<Container> {
     return std::back_inserter(container_);
   }
 };
@@ -846,7 +1012,7 @@ template <typename T = char> class counting_buffer final : public buffer<T> {
   size_t count_ = 0;
 
  protected:
-  void grow(size_t) final FMT_OVERRIDE {
+  FMT_CONSTEXPR20 void grow(size_t) override {
     if (this->size() != buffer_size) return;
     count_ += this->size();
     this->clear();
@@ -855,48 +1021,24 @@ template <typename T = char> class counting_buffer final : public buffer<T> {
  public:
   counting_buffer() : buffer<T>(data_, 0, buffer_size) {}
 
-  size_t count() { return count_ + this->size(); }
+  auto count() -> size_t { return count_ + this->size(); }
 };
 
-// An output iterator that appends to the buffer.
-// It is used to reduce symbol sizes for the common case.
 template <typename T>
-class buffer_appender : public std::back_insert_iterator<buffer<T>> {
-  using base = std::back_insert_iterator<buffer<T>>;
+using buffer_appender = conditional_t<std::is_same<T, char>::value, appender,
+                                      std::back_insert_iterator<buffer<T>>>;
 
- public:
-  explicit buffer_appender(buffer<T>& buf) : base(buf) {}
-  buffer_appender(base it) : base(it) {}
-
-  buffer_appender& operator++() {
-    base::operator++();
-    return *this;
-  }
-
-  buffer_appender operator++(int) {
-    buffer_appender tmp = *this;
-    ++*this;
-    return tmp;
-  }
-};
-
-// Maps an output iterator into a buffer.
+// Maps an output iterator to a buffer.
 template <typename T, typename OutputIt>
-iterator_buffer<OutputIt, T> get_buffer(OutputIt);
-template <typename T> buffer<T>& get_buffer(buffer_appender<T>);
-
-template <typename OutputIt> OutputIt get_buffer_init(OutputIt out) {
-  return out;
-}
-template <typename T> buffer<T>& get_buffer_init(buffer_appender<T> out) {
-  return get_container(out);
+auto get_buffer(OutputIt out) -> iterator_buffer<OutputIt, T> {
+  return iterator_buffer<OutputIt, T>(out);
 }
 
 template <typename Buffer>
 auto get_iterator(Buffer& buf) -> decltype(buf.out()) {
   return buf.out();
 }
-template <typename T> buffer_appender<T> get_iterator(buffer<T>& buf) {
+template <typename T> auto get_iterator(buffer<T>& buf) -> buffer_appender<T> {
   return buffer_appender<T>(buf);
 }
 
@@ -906,9 +1048,9 @@ struct fallback_formatter {
 };
 
 // Specifies if T has an enabled fallback_formatter specialization.
-template <typename T, typename Context>
+template <typename T, typename Char>
 using has_fallback_formatter =
-    std::is_constructible<fallback_formatter<T, typename Context::char_type>>;
+    std::is_constructible<fallback_formatter<T, Char>>;
 
 struct view {};
 
@@ -933,8 +1075,8 @@ struct arg_data {
   template <typename... U>
   arg_data(const U&... init) : args_{T(named_args_, NUM_NAMED_ARGS), init...} {}
   arg_data(const arg_data& other) = delete;
-  const T* args() const { return args_ + 1; }
-  named_arg_info<Char>* named_args() { return named_args_; }
+  auto args() const -> const T* { return args_ + 1; }
+  auto named_args() -> named_arg_info<Char>* { return named_args_; }
 };
 
 template <typename T, typename Char, size_t NUM_ARGS>
@@ -943,45 +1085,55 @@ struct arg_data<T, Char, NUM_ARGS, 0> {
   T args_[NUM_ARGS != 0 ? NUM_ARGS : +1];
 
   template <typename... U>
-  FMT_INLINE arg_data(const U&... init) : args_{init...} {}
-  FMT_INLINE const T* args() const { return args_; }
-  FMT_INLINE std::nullptr_t named_args() { return nullptr; }
+  FMT_CONSTEXPR FMT_INLINE arg_data(const U&... init) : args_{init...} {}
+  FMT_CONSTEXPR FMT_INLINE auto args() const -> const T* { return args_; }
+  FMT_CONSTEXPR FMT_INLINE auto named_args() -> std::nullptr_t {
+    return nullptr;
+  }
 };
 
 template <typename Char>
 inline void init_named_args(named_arg_info<Char>*, int, int) {}
 
-template <typename Char, typename T, typename... Tail>
+template <typename T> struct is_named_arg : std::false_type {};
+template <typename T> struct is_statically_named_arg : std::false_type {};
+
+template <typename T, typename Char>
+struct is_named_arg<named_arg<Char, T>> : std::true_type {};
+
+template <typename Char, typename T, typename... Tail,
+          FMT_ENABLE_IF(!is_named_arg<T>::value)>
 void init_named_args(named_arg_info<Char>* named_args, int arg_count,
                      int named_arg_count, const T&, const Tail&... args) {
   init_named_args(named_args, arg_count + 1, named_arg_count, args...);
 }
 
-template <typename Char, typename T, typename... Tail>
+template <typename Char, typename T, typename... Tail,
+          FMT_ENABLE_IF(is_named_arg<T>::value)>
 void init_named_args(named_arg_info<Char>* named_args, int arg_count,
-                     int named_arg_count, const named_arg<Char, T>& arg,
-                     const Tail&... args) {
+                     int named_arg_count, const T& arg, const Tail&... args) {
   named_args[named_arg_count++] = {arg.name, arg_count};
   init_named_args(named_args, arg_count + 1, named_arg_count, args...);
 }
 
 template <typename... Args>
-FMT_INLINE void init_named_args(std::nullptr_t, int, int, const Args&...) {}
+FMT_CONSTEXPR FMT_INLINE void init_named_args(std::nullptr_t, int, int,
+                                              const Args&...) {}
 
-template <typename T> struct is_named_arg : std::false_type {};
-
-template <typename T, typename Char>
-struct is_named_arg<named_arg<Char, T>> : std::true_type {};
-
-template <bool B = false> constexpr size_t count() { return B ? 1 : 0; }
-template <bool B1, bool B2, bool... Tail> constexpr size_t count() {
+template <bool B = false> constexpr auto count() -> size_t { return B ? 1 : 0; }
+template <bool B1, bool B2, bool... Tail> constexpr auto count() -> size_t {
   return (B1 ? 1 : 0) + count<B2, Tail...>();
 }
 
-template <typename... Args> constexpr size_t count_named_args() {
+template <typename... Args> constexpr auto count_named_args() -> size_t {
   return count<is_named_arg<Args>::value...>();
 }
 
+template <typename... Args>
+constexpr auto count_statically_named_args() -> size_t {
+  return count<is_statically_named_arg<Args>::value...>();
+}
+
 enum class type {
   none_type,
   // Integer types should go first,
@@ -1037,6 +1189,11 @@ constexpr bool is_arithmetic_type(type t) {
   return t > type::none_type && t <= type::last_numeric_type;
 }
 
+struct unformattable {};
+struct unformattable_char : unformattable {};
+struct unformattable_const : unformattable {};
+struct unformattable_pointer : unformattable {};
+
 template <typename Char> struct string_value {
   const Char* data;
   size_t size;
@@ -1049,8 +1206,8 @@ template <typename Char> struct named_arg_value {
 
 template <typename Context> struct custom_value {
   using parse_context = typename Context::parse_context_type;
-  const void* value;
-  void (*format)(const void* arg, parse_context& parse_ctx, Context& ctx);
+  void* value;
+  void (*format)(void* arg, parse_context& parse_ctx, Context& ctx);
 };
 
 // A formatting argument value.
@@ -1059,6 +1216,7 @@ template <typename Context> class value {
   using char_type = typename Context::char_type;
 
   union {
+    monostate no_value;
     int int_value;
     unsigned uint_value;
     long long long_long_value;
@@ -1076,19 +1234,23 @@ template <typename Context> class value {
     named_arg_value<char_type> named_args;
   };
 
-  constexpr FMT_INLINE value(int val = 0) : int_value(val) {}
+  constexpr FMT_INLINE value() : no_value() {}
+  constexpr FMT_INLINE value(int val) : int_value(val) {}
   constexpr FMT_INLINE value(unsigned val) : uint_value(val) {}
-  FMT_INLINE value(long long val) : long_long_value(val) {}
-  FMT_INLINE value(unsigned long long val) : ulong_long_value(val) {}
+  constexpr FMT_INLINE value(long long val) : long_long_value(val) {}
+  constexpr FMT_INLINE value(unsigned long long val) : ulong_long_value(val) {}
   FMT_INLINE value(int128_t val) : int128_value(val) {}
   FMT_INLINE value(uint128_t val) : uint128_value(val) {}
-  FMT_INLINE value(float val) : float_value(val) {}
-  FMT_INLINE value(double val) : double_value(val) {}
+  constexpr FMT_INLINE value(float val) : float_value(val) {}
+  constexpr FMT_INLINE value(double val) : double_value(val) {}
   FMT_INLINE value(long double val) : long_double_value(val) {}
-  FMT_INLINE value(bool val) : bool_value(val) {}
-  FMT_INLINE value(char_type val) : char_value(val) {}
-  FMT_INLINE value(const char_type* val) { string.data = val; }
-  FMT_INLINE value(basic_string_view<char_type> val) {
+  constexpr FMT_INLINE value(bool val) : bool_value(val) {}
+  constexpr FMT_INLINE value(char_type val) : char_value(val) {}
+  FMT_CONSTEXPR FMT_INLINE value(const char_type* val) {
+    string.data = val;
+    if (is_constant_evaluated()) string.size = {};
+  }
+  FMT_CONSTEXPR FMT_INLINE value(basic_string_view<char_type> val) {
     string.data = val.data();
     string.size = val.size();
   }
@@ -1096,31 +1258,39 @@ template <typename Context> class value {
   FMT_INLINE value(const named_arg_info<char_type>* args, size_t size)
       : named_args{args, size} {}
 
-  template <typename T> FMT_INLINE value(const T& val) {
-    custom.value = &val;
+  template <typename T> FMT_CONSTEXPR FMT_INLINE value(T& val) {
+    using value_type = remove_cvref_t<T>;
+    custom.value = const_cast<value_type*>(&val);
     // Get the formatter type through the context to allow different contexts
     // have different extension points, e.g. `formatter<T>` for `format` and
     // `printf_formatter<T>` for `printf`.
     custom.format = format_custom_arg<
-        T, conditional_t<has_formatter<T, Context>::value,
-                         typename Context::template formatter_type<T>,
-                         fallback_formatter<T, char_type>>>;
+        value_type,
+        conditional_t<has_formatter<value_type, Context>::value,
+                      typename Context::template formatter_type<value_type>,
+                      fallback_formatter<value_type, char_type>>>;
   }
+  value(unformattable);
+  value(unformattable_char);
+  value(unformattable_const);
+  value(unformattable_pointer);
 
  private:
   // Formats an argument of a custom type, such as a user-defined class.
   template <typename T, typename Formatter>
-  static void format_custom_arg(const void* arg,
+  static void format_custom_arg(void* arg,
                                 typename Context::parse_context_type& parse_ctx,
                                 Context& ctx) {
-    Formatter f;
+    auto f = Formatter();
     parse_ctx.advance_to(f.parse(parse_ctx));
-    ctx.advance_to(f.format(*static_cast<const T*>(arg), ctx));
+    using qualified_type =
+        conditional_t<has_const_formatter<T, Context>(), const T, T>;
+    ctx.advance_to(f.format(*static_cast<qualified_type*>(arg), ctx));
   }
 };
 
 template <typename Context, typename T>
-FMT_CONSTEXPR basic_format_arg<Context> make_arg(const T& value);
+FMT_CONSTEXPR auto make_arg(const T& value) -> basic_format_arg<Context>;
 
 // To minimize the number of types we need to deal with, long is translated
 // either to int or to long long depending on its size.
@@ -1128,52 +1298,84 @@ enum { long_short = sizeof(long) == sizeof(int) };
 using long_type = conditional_t<long_short, int, long long>;
 using ulong_type = conditional_t<long_short, unsigned, unsigned long long>;
 
-struct unformattable {};
-
 // Maps formatting arguments to core types.
+// arg_mapper reports errors by returning unformattable instead of using
+// static_assert because it's used in the is_formattable trait.
 template <typename Context> struct arg_mapper {
   using char_type = typename Context::char_type;
 
-  FMT_CONSTEXPR int map(signed char val) { return val; }
-  FMT_CONSTEXPR unsigned map(unsigned char val) { return val; }
-  FMT_CONSTEXPR int map(short val) { return val; }
-  FMT_CONSTEXPR unsigned map(unsigned short val) { return val; }
-  FMT_CONSTEXPR int map(int val) { return val; }
-  FMT_CONSTEXPR unsigned map(unsigned val) { return val; }
-  FMT_CONSTEXPR long_type map(long val) { return val; }
-  FMT_CONSTEXPR ulong_type map(unsigned long val) { return val; }
-  FMT_CONSTEXPR long long map(long long val) { return val; }
-  FMT_CONSTEXPR unsigned long long map(unsigned long long val) { return val; }
-  FMT_CONSTEXPR int128_t map(int128_t val) { return val; }
-  FMT_CONSTEXPR uint128_t map(uint128_t val) { return val; }
-  FMT_CONSTEXPR bool map(bool val) { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(signed char val) -> int { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(unsigned char val) -> unsigned {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(short val) -> int { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(unsigned short val) -> unsigned {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(int val) -> int { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(unsigned val) -> unsigned { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(long val) -> long_type { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(unsigned long val) -> ulong_type {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(long long val) -> long long { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(unsigned long long val)
+      -> unsigned long long {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(int128_t val) -> int128_t { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(uint128_t val) -> uint128_t { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(bool val) -> bool { return val; }
 
-  template <typename T, FMT_ENABLE_IF(is_char<T>::value)>
-  FMT_CONSTEXPR char_type map(T val) {
-    static_assert(
-        std::is_same<T, char>::value || std::is_same<T, char_type>::value,
-        "mixing character types is disallowed");
+  template <typename T, FMT_ENABLE_IF(std::is_same<T, char>::value ||
+                                      std::is_same<T, char_type>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(T val) -> char_type {
+    return val;
+  }
+  template <typename T, enable_if_t<(std::is_same<T, wchar_t>::value ||
+#ifdef __cpp_char8_t
+                                     std::is_same<T, char8_t>::value ||
+#endif
+                                     std::is_same<T, char16_t>::value ||
+                                     std::is_same<T, char32_t>::value) &&
+                                        !std::is_same<T, char_type>::value,
+                                    int> = 0>
+  FMT_CONSTEXPR FMT_INLINE auto map(T) -> unformattable_char {
+    return {};
+  }
+
+  FMT_CONSTEXPR FMT_INLINE auto map(float val) -> float { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(double val) -> double { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(long double val) -> long double {
     return val;
   }
 
-  FMT_CONSTEXPR float map(float val) { return val; }
-  FMT_CONSTEXPR double map(double val) { return val; }
-  FMT_CONSTEXPR long double map(long double val) { return val; }
-
-  FMT_CONSTEXPR const char_type* map(char_type* val) { return val; }
-  FMT_CONSTEXPR const char_type* map(const char_type* val) { return val; }
-  template <typename T, FMT_ENABLE_IF(is_string<T>::value)>
-  FMT_CONSTEXPR basic_string_view<char_type> map(const T& val) {
-    static_assert(std::is_same<char_type, char_t<T>>::value,
-                  "mixing character types is disallowed");
+  FMT_CONSTEXPR FMT_INLINE auto map(char_type* val) -> const char_type* {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(const char_type* val) -> const char_type* {
+    return val;
+  }
+  template <typename T,
+            FMT_ENABLE_IF(is_string<T>::value && !std::is_pointer<T>::value &&
+                          std::is_same<char_type, char_t<T>>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
+      -> basic_string_view<char_type> {
     return to_string_view(val);
   }
+  template <typename T,
+            FMT_ENABLE_IF(is_string<T>::value && !std::is_pointer<T>::value &&
+                          !std::is_same<char_type, char_t<T>>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T&) -> unformattable_char {
+    return {};
+  }
   template <typename T,
             FMT_ENABLE_IF(
                 std::is_constructible<basic_string_view<char_type>, T>::value &&
                 !is_string<T>::value && !has_formatter<T, Context>::value &&
-                !has_fallback_formatter<T, Context>::value)>
-  FMT_CONSTEXPR basic_string_view<char_type> map(const T& val) {
+                !has_fallback_formatter<T, char_type>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
+      -> basic_string_view<char_type> {
     return basic_string_view<char_type>(val);
   }
   template <
@@ -1182,63 +1384,113 @@ template <typename Context> struct arg_mapper {
           std::is_constructible<std_string_view<char_type>, T>::value &&
           !std::is_constructible<basic_string_view<char_type>, T>::value &&
           !is_string<T>::value && !has_formatter<T, Context>::value &&
-          !has_fallback_formatter<T, Context>::value)>
-  FMT_CONSTEXPR basic_string_view<char_type> map(const T& val) {
+          !has_fallback_formatter<T, char_type>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
+      -> basic_string_view<char_type> {
     return std_string_view<char_type>(val);
   }
-  FMT_CONSTEXPR const char* map(const signed char* val) {
-    static_assert(std::is_same<char_type, char>::value, "invalid string type");
-    return reinterpret_cast<const char*>(val);
+
+  using cstring_result = conditional_t<std::is_same<char_type, char>::value,
+                                       const char*, unformattable_pointer>;
+
+  FMT_DEPRECATED FMT_CONSTEXPR FMT_INLINE auto map(const signed char* val)
+      -> cstring_result {
+    return map(reinterpret_cast<const char*>(val));
   }
-  FMT_CONSTEXPR const char* map(const unsigned char* val) {
-    static_assert(std::is_same<char_type, char>::value, "invalid string type");
-    return reinterpret_cast<const char*>(val);
+  FMT_DEPRECATED FMT_CONSTEXPR FMT_INLINE auto map(const unsigned char* val)
+      -> cstring_result {
+    return map(reinterpret_cast<const char*>(val));
   }
-  FMT_CONSTEXPR const char* map(signed char* val) {
-    const auto* const_val = val;
-    return map(const_val);
+  FMT_DEPRECATED FMT_CONSTEXPR FMT_INLINE auto map(signed char* val)
+      -> cstring_result {
+    return map(reinterpret_cast<const char*>(val));
   }
-  FMT_CONSTEXPR const char* map(unsigned char* val) {
-    const auto* const_val = val;
-    return map(const_val);
+  FMT_DEPRECATED FMT_CONSTEXPR FMT_INLINE auto map(unsigned char* val)
+      -> cstring_result {
+    return map(reinterpret_cast<const char*>(val));
   }
 
-  FMT_CONSTEXPR const void* map(void* val) { return val; }
-  FMT_CONSTEXPR const void* map(const void* val) { return val; }
-  FMT_CONSTEXPR const void* map(std::nullptr_t val) { return val; }
-  template <typename T> FMT_CONSTEXPR int map(const T*) {
-    // Formatting of arbitrary pointers is disallowed. If you want to output
-    // a pointer cast it to "void *" or "const void *". In particular, this
-    // forbids formatting of "[const] volatile char *" which is printed as bool
-    // by iostreams.
-    static_assert(!sizeof(T), "formatting of non-void pointers is disallowed");
-    return 0;
+  FMT_CONSTEXPR FMT_INLINE auto map(void* val) -> const void* { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(const void* val) -> const void* {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(std::nullptr_t val) -> const void* {
+    return val;
+  }
+
+  // We use SFINAE instead of a const T* parameter to avoid conflicting with
+  // the C array overload.
+  template <
+      typename T,
+      FMT_ENABLE_IF(
+          std::is_member_pointer<T>::value ||
+          std::is_function<typename std::remove_pointer<T>::type>::value ||
+          (std::is_convertible<const T&, const void*>::value &&
+           !std::is_convertible<const T&, const char_type*>::value))>
+  FMT_CONSTEXPR auto map(const T&) -> unformattable_pointer {
+    return {};
+  }
+
+  template <typename T, std::size_t N,
+            FMT_ENABLE_IF(!std::is_same<T, wchar_t>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T (&values)[N]) -> const T (&)[N] {
+    return values;
   }
 
   template <typename T,
-            FMT_ENABLE_IF(std::is_enum<T>::value &&
-                          !has_formatter<T, Context>::value &&
-                          !has_fallback_formatter<T, Context>::value)>
-  FMT_CONSTEXPR auto map(const T& val)
+            FMT_ENABLE_IF(
+                std::is_enum<T>::value&& std::is_convertible<T, int>::value &&
+                !has_formatter<T, Context>::value &&
+                !has_fallback_formatter<T, char_type>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
       -> decltype(std::declval<arg_mapper>().map(
           static_cast<typename std::underlying_type<T>::type>(val))) {
     return map(static_cast<typename std::underlying_type<T>::type>(val));
   }
-  template <typename T,
-            FMT_ENABLE_IF(!is_string<T>::value && !is_char<T>::value &&
-                          (has_formatter<T, Context>::value ||
-                           has_fallback_formatter<T, Context>::value))>
-  FMT_CONSTEXPR const T& map(const T& val) {
+
+  FMT_CONSTEXPR FMT_INLINE auto map(detail::byte val) -> unsigned {
+    return map(static_cast<unsigned char>(val));
+  }
+
+  template <typename T, typename U = remove_cvref_t<T>>
+  struct formattable
+      : bool_constant<has_const_formatter<U, Context>() ||
+                      !std::is_const<remove_reference_t<T>>::value ||
+                      has_fallback_formatter<U, char_type>::value> {};
+
+#if FMT_MSC_VER != 0 && FMT_MSC_VER < 1910
+  // Workaround a bug in MSVC.
+  template <typename T> FMT_CONSTEXPR FMT_INLINE auto do_map(T&& val) -> T& {
     return val;
   }
+#else
+  template <typename T, FMT_ENABLE_IF(formattable<T>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto do_map(T&& val) -> T& {
+    return val;
+  }
+  template <typename T, FMT_ENABLE_IF(!formattable<T>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto do_map(T&&) -> unformattable_const {
+    return {};
+  }
+#endif
 
-  template <typename T>
-  FMT_CONSTEXPR auto map(const named_arg<char_type, T>& val)
-      -> decltype(std::declval<arg_mapper>().map(val.value)) {
-    return map(val.value);
+  template <typename T, typename U = remove_cvref_t<T>,
+            FMT_ENABLE_IF(!is_string<U>::value && !is_char<U>::value &&
+                          !std::is_array<U>::value &&
+                          (has_formatter<U, Context>::value ||
+                           has_fallback_formatter<U, char_type>::value))>
+  FMT_CONSTEXPR FMT_INLINE auto map(T&& val)
+      -> decltype(this->do_map(std::forward<T>(val))) {
+    return do_map(std::forward<T>(val));
   }
 
-  unformattable map(...) { return {}; }
+  template <typename T, FMT_ENABLE_IF(is_named_arg<T>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T& named_arg)
+      -> decltype(std::declval<arg_mapper>().map(named_arg.value)) {
+    return map(named_arg.value);
+  }
+
+  auto map(...) -> unformattable { return {}; }
 };
 
 // A type constant after applying arg_mapper<Context>.
@@ -1252,7 +1504,28 @@ enum { packed_arg_bits = 4 };
 enum { max_packed_args = 62 / packed_arg_bits };
 enum : unsigned long long { is_unpacked_bit = 1ULL << 63 };
 enum : unsigned long long { has_named_args_bit = 1ULL << 62 };
-}  // namespace detail
+
+FMT_END_DETAIL_NAMESPACE
+
+// An output iterator that appends to a buffer.
+// It is used to reduce symbol sizes for the common case.
+class appender : public std::back_insert_iterator<detail::buffer<char>> {
+  using base = std::back_insert_iterator<detail::buffer<char>>;
+
+  template <typename T>
+  friend auto get_buffer(appender out) -> detail::buffer<char>& {
+    return detail::get_container(out);
+  }
+
+ public:
+  using std::back_insert_iterator<detail::buffer<char>>::back_insert_iterator;
+  appender(base it) FMT_NOEXCEPT : base(it) {}
+  using _Unchecked_type = appender;  // Mark iterator as checked.
+
+  auto operator++() FMT_NOEXCEPT -> appender& { return *this; }
+
+  auto operator++(int) FMT_NOEXCEPT -> appender { return *this; }
+};
 
 // A formatting argument. It is a trivially copyable/constructible type to
 // allow storage in basic_memory_buffer.
@@ -1262,8 +1535,8 @@ template <typename Context> class basic_format_arg {
   detail::type type_;
 
   template <typename ContextType, typename T>
-  friend FMT_CONSTEXPR basic_format_arg<ContextType> detail::make_arg(
-      const T& value);
+  friend FMT_CONSTEXPR auto detail::make_arg(const T& value)
+      -> basic_format_arg<ContextType>;
 
   template <typename Visitor, typename Ctx>
   friend FMT_CONSTEXPR auto visit_format_arg(Visitor&& vis,
@@ -1301,10 +1574,12 @@ template <typename Context> class basic_format_arg {
     return type_ != detail::type::none_type;
   }
 
-  detail::type type() const { return type_; }
+  auto type() const -> detail::type { return type_; }
 
-  bool is_integral() const { return detail::is_integral_type(type_); }
-  bool is_arithmetic() const { return detail::is_arithmetic_type(type_); }
+  auto is_integral() const -> bool { return detail::is_integral_type(type_); }
+  auto is_arithmetic() const -> bool {
+    return detail::is_arithmetic_type(type_);
+  }
 };
 
 /**
@@ -1315,9 +1590,8 @@ template <typename Context> class basic_format_arg {
   \endrst
  */
 template <typename Visitor, typename Context>
-FMT_CONSTEXPR_DECL FMT_INLINE auto visit_format_arg(
+FMT_CONSTEXPR FMT_INLINE auto visit_format_arg(
     Visitor&& vis, const basic_format_arg<Context>& arg) -> decltype(vis(0)) {
-  using char_type = typename Context::char_type;
   switch (arg.type_) {
   case detail::type::none_type:
     break;
@@ -1329,16 +1603,10 @@ FMT_CONSTEXPR_DECL FMT_INLINE auto visit_format_arg(
     return vis(arg.value_.long_long_value);
   case detail::type::ulong_long_type:
     return vis(arg.value_.ulong_long_value);
-#if FMT_USE_INT128
   case detail::type::int128_type:
-    return vis(arg.value_.int128_value);
+    return vis(detail::convert_for_visit(arg.value_.int128_value));
   case detail::type::uint128_type:
-    return vis(arg.value_.uint128_value);
-#else
-  case detail::type::int128_type:
-  case detail::type::uint128_type:
-    break;
-#endif
+    return vis(detail::convert_for_visit(arg.value_.uint128_value));
   case detail::type::bool_type:
     return vis(arg.value_.bool_value);
   case detail::type::char_type:
@@ -1352,8 +1620,8 @@ FMT_CONSTEXPR_DECL FMT_INLINE auto visit_format_arg(
   case detail::type::cstring_type:
     return vis(arg.value_.string.data);
   case detail::type::string_type:
-    return vis(basic_string_view<char_type>(arg.value_.string.data,
-                                            arg.value_.string.size));
+    using sv = basic_string_view<typename Context::char_type>;
+    return vis(sv(arg.value_.string.data, arg.value_.string.size));
   case detail::type::pointer_type:
     return vis(arg.value_.pointer);
   case detail::type::custom_type:
@@ -1362,14 +1630,22 @@ FMT_CONSTEXPR_DECL FMT_INLINE auto visit_format_arg(
   return vis(monostate());
 }
 
-template <typename T> struct formattable : std::false_type {};
+FMT_BEGIN_DETAIL_NAMESPACE
 
-namespace detail {
+template <typename Char, typename InputIt>
+auto copy_str(InputIt begin, InputIt end, appender out) -> appender {
+  get_container(out).append(begin, end);
+  return out;
+}
 
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
 // A workaround for gcc 4.8 to make void_t work in a SFINAE context.
 template <typename... Ts> struct void_t_impl { using type = void; };
 template <typename... Ts>
 using void_t = typename detail::void_t_impl<Ts...>::type;
+#else
+template <typename...> using void_t = void;
+#endif
 
 template <typename It, typename T, typename Enable = void>
 struct is_output_iterator : std::false_type {};
@@ -1392,9 +1668,8 @@ struct is_contiguous_back_insert_iterator : std::false_type {};
 template <typename Container>
 struct is_contiguous_back_insert_iterator<std::back_insert_iterator<Container>>
     : is_contiguous<Container> {};
-template <typename Char>
-struct is_contiguous_back_insert_iterator<buffer_appender<Char>>
-    : std::true_type {};
+template <>
+struct is_contiguous_back_insert_iterator<appender> : std::true_type {};
 
 // A type-erased reference to an std::locale to avoid heavy <locale> include.
 class locale_ref {
@@ -1402,97 +1677,72 @@ class locale_ref {
   const void* locale_;  // A type-erased pointer to std::locale.
 
  public:
-  locale_ref() : locale_(nullptr) {}
+  constexpr locale_ref() : locale_(nullptr) {}
   template <typename Locale> explicit locale_ref(const Locale& loc);
 
   explicit operator bool() const FMT_NOEXCEPT { return locale_ != nullptr; }
 
-  template <typename Locale> Locale get() const;
+  template <typename Locale> auto get() const -> Locale;
 };
 
-template <typename> constexpr unsigned long long encode_types() { return 0; }
+template <typename> constexpr auto encode_types() -> unsigned long long {
+  return 0;
+}
 
 template <typename Context, typename Arg, typename... Args>
-constexpr unsigned long long encode_types() {
+constexpr auto encode_types() -> unsigned long long {
   return static_cast<unsigned>(mapped_type_constant<Arg, Context>::value) |
          (encode_types<Context, Args...>() << packed_arg_bits);
 }
 
 template <typename Context, typename T>
-FMT_CONSTEXPR basic_format_arg<Context> make_arg(const T& value) {
+FMT_CONSTEXPR auto make_arg(const T& value) -> basic_format_arg<Context> {
   basic_format_arg<Context> arg;
   arg.type_ = mapped_type_constant<T, Context>::value;
   arg.value_ = arg_mapper<Context>().map(value);
   return arg;
 }
 
-template <typename T> int check(unformattable) {
-  static_assert(
-      formattable<T>(),
-      "Cannot format an argument. To make type T formattable provide a "
-      "formatter<T> specialization: https://fmt.dev/latest/api.html#udt");
-  return 0;
-}
-template <typename T, typename U> inline const U& check(const U& val) {
-  return val;
-}
-
 // The type template parameter is there to avoid an ODR violation when using
 // a fallback formatter in one translation unit and an implicit conversion in
 // another (not recommended).
 template <bool IS_PACKED, typename Context, type, typename T,
           FMT_ENABLE_IF(IS_PACKED)>
-inline value<Context> make_arg(const T& val) {
-  return check<T>(arg_mapper<Context>().map(val));
+FMT_CONSTEXPR FMT_INLINE auto make_arg(T&& val) -> value<Context> {
+  const auto& arg = arg_mapper<Context>().map(std::forward<T>(val));
+
+  constexpr bool formattable_char =
+      !std::is_same<decltype(arg), const unformattable_char&>::value;
+  static_assert(formattable_char, "Mixing character types is disallowed.");
+
+  constexpr bool formattable_const =
+      !std::is_same<decltype(arg), const unformattable_const&>::value;
+  static_assert(formattable_const, "Cannot format a const argument.");
+
+  // Formatting of arbitrary pointers is disallowed. If you want to output
+  // a pointer cast it to "void *" or "const void *". In particular, this
+  // forbids formatting of "[const] volatile char *" which is printed as bool
+  // by iostreams.
+  constexpr bool formattable_pointer =
+      !std::is_same<decltype(arg), const unformattable_pointer&>::value;
+  static_assert(formattable_pointer,
+                "Formatting of non-void pointers is disallowed.");
+
+  constexpr bool formattable =
+      !std::is_same<decltype(arg), const unformattable&>::value;
+  static_assert(
+      formattable,
+      "Cannot format an argument. To make type T formattable provide a "
+      "formatter<T> specialization: https://fmt.dev/latest/api.html#udt");
+  return {arg};
 }
 
 template <bool IS_PACKED, typename Context, type, typename T,
           FMT_ENABLE_IF(!IS_PACKED)>
-inline basic_format_arg<Context> make_arg(const T& value) {
+inline auto make_arg(const T& value) -> basic_format_arg<Context> {
   return make_arg<Context>(value);
 }
-
-template <typename T> struct is_reference_wrapper : std::false_type {};
-template <typename T>
-struct is_reference_wrapper<std::reference_wrapper<T>> : std::true_type {};
-
-template <typename T> const T& unwrap(const T& v) { return v; }
-template <typename T> const T& unwrap(const std::reference_wrapper<T>& v) {
-  return static_cast<const T&>(v);
-}
-
-class dynamic_arg_list {
-  // Workaround for clang's -Wweak-vtables. Unlike for regular classes, for
-  // templates it doesn't complain about inability to deduce single translation
-  // unit for placing vtable. So storage_node_base is made a fake template.
-  template <typename = void> struct node {
-    virtual ~node() = default;
-    std::unique_ptr<node<>> next;
-  };
-
-  template <typename T> struct typed_node : node<> {
-    T value;
-
-    template <typename Arg>
-    FMT_CONSTEXPR typed_node(const Arg& arg) : value(arg) {}
-
-    template <typename Char>
-    FMT_CONSTEXPR typed_node(const basic_string_view<Char>& arg)
-        : value(arg.data(), arg.size()) {}
-  };
-
-  std::unique_ptr<node<>> head_;
-
- public:
-  template <typename T, typename Arg> const T& push(const Arg& arg) {
-    auto new_node = std::unique_ptr<typed_node<T>>(new typed_node<T>(arg));
-    auto& value = new_node->value;
-    new_node->next = std::move(head_);
-    head_ = std::move(new_node);
-    return value;
-  }
-};
-}  // namespace detail
+FMT_END_DETAIL_NAMESPACE
 
 // Formatting context.
 template <typename OutputIt, typename Char> class basic_format_context {
@@ -1511,46 +1761,59 @@ template <typename OutputIt, typename Char> class basic_format_context {
   using parse_context_type = basic_format_parse_context<Char>;
   template <typename T> using formatter_type = formatter<T, char_type>;
 
+  basic_format_context(basic_format_context&&) = default;
   basic_format_context(const basic_format_context&) = delete;
   void operator=(const basic_format_context&) = delete;
   /**
    Constructs a ``basic_format_context`` object. References to the arguments are
    stored in the object so make sure they have appropriate lifetimes.
    */
-  basic_format_context(OutputIt out,
-                       basic_format_args<basic_format_context> ctx_args,
-                       detail::locale_ref loc = detail::locale_ref())
+  constexpr basic_format_context(
+      OutputIt out, basic_format_args<basic_format_context> ctx_args,
+      detail::locale_ref loc = detail::locale_ref())
       : out_(out), args_(ctx_args), loc_(loc) {}
 
-  format_arg arg(int id) const { return args_.get(id); }
-  format_arg arg(basic_string_view<char_type> name) { return args_.get(name); }
-  int arg_id(basic_string_view<char_type> name) { return args_.get_id(name); }
-  const basic_format_args<basic_format_context>& args() const { return args_; }
+  constexpr auto arg(int id) const -> format_arg { return args_.get(id); }
+  FMT_CONSTEXPR auto arg(basic_string_view<char_type> name) -> format_arg {
+    return args_.get(name);
+  }
+  FMT_CONSTEXPR auto arg_id(basic_string_view<char_type> name) -> int {
+    return args_.get_id(name);
+  }
+  auto args() const -> const basic_format_args<basic_format_context>& {
+    return args_;
+  }
 
-  detail::error_handler error_handler() { return {}; }
+  FMT_CONSTEXPR auto error_handler() -> detail::error_handler { return {}; }
   void on_error(const char* message) { error_handler().on_error(message); }
 
   // Returns an iterator to the beginning of the output range.
-  iterator out() { return out_; }
+  FMT_CONSTEXPR auto out() -> iterator { return out_; }
 
   // Advances the begin iterator to ``it``.
   void advance_to(iterator it) {
     if (!detail::is_back_insert_iterator<iterator>()) out_ = it;
   }
 
-  detail::locale_ref locale() { return loc_; }
+  FMT_CONSTEXPR auto locale() -> detail::locale_ref { return loc_; }
 };
 
 template <typename Char>
 using buffer_context =
     basic_format_context<detail::buffer_appender<Char>, Char>;
 using format_context = buffer_context<char>;
-using wformat_context = buffer_context<wchar_t>;
 
 // Workaround an alias issue: https://stackoverflow.com/q/62767544/471164.
 #define FMT_BUFFER_CONTEXT(Char) \
   basic_format_context<detail::buffer_appender<Char>, Char>
 
+template <typename T, typename Char = char>
+using is_formattable = bool_constant<
+    !std::is_base_of<detail::unformattable,
+                     decltype(detail::arg_mapper<buffer_context<Char>>().map(
+                         std::declval<T>()))>::value &&
+    !detail::has_fallback_formatter<T, Char>::value>;
+
 /**
   \rst
   An array of references to arguments. It can be implicitly converted into
@@ -1587,14 +1850,16 @@ class format_arg_store
            : 0);
 
  public:
-  format_arg_store(const Args&... args)
+  template <typename... T>
+  FMT_CONSTEXPR FMT_INLINE format_arg_store(T&&... args)
       :
 #if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
         basic_format_args<Context>(*this),
 #endif
         data_{detail::make_arg<
             is_packed, Context,
-            detail::mapped_type_constant<Args, Context>::value>(args)...} {
+            detail::mapped_type_constant<remove_cvref_t<T>, Context>::value>(
+            std::forward<T>(args))...} {
     detail::init_named_args(data_.named_args(), 0, 0, args...);
   }
 };
@@ -1608,36 +1873,16 @@ class format_arg_store
   \endrst
  */
 template <typename Context = format_context, typename... Args>
-inline format_arg_store<Context, Args...> make_format_args(
-    const Args&... args) {
-  return {args...};
+constexpr auto make_format_args(Args&&... args)
+    -> format_arg_store<Context, remove_cvref_t<Args>...> {
+  return {std::forward<Args>(args)...};
 }
 
 /**
   \rst
-  Constructs a `~fmt::format_arg_store` object that contains references
-  to arguments and can be implicitly converted to `~fmt::format_args`.
-  If ``format_str`` is a compile-time string then `make_args_checked` checks
-  its validity at compile time.
-  \endrst
- */
-template <typename... Args, typename S, typename Char = char_t<S>>
-inline auto make_args_checked(const S& format_str,
-                              const remove_reference_t<Args>&... args)
-    -> format_arg_store<buffer_context<Char>, remove_reference_t<Args>...> {
-  static_assert(
-      detail::count<(
-              std::is_base_of<detail::view, remove_reference_t<Args>>::value &&
-              std::is_reference<Args>::value)...>() == 0,
-      "passing views as lvalues is disallowed");
-  detail::check_format_string<Args...>(format_str);
-  return {args...};
-}
-
-/**
-  \rst
-  Returns a named argument to be used in a formatting function. It should only
-  be used in a call to a formatting function.
+  Returns a named argument to be used in a formatting function.
+  It should only be used in a call to a formatting function or
+  `dynamic_format_arg_store::push_back`.
 
   **Example**::
 
@@ -1645,184 +1890,11 @@ inline auto make_args_checked(const S& format_str,
   \endrst
  */
 template <typename Char, typename T>
-inline detail::named_arg<Char, T> arg(const Char* name, const T& arg) {
+inline auto arg(const Char* name, const T& arg) -> detail::named_arg<Char, T> {
   static_assert(!detail::is_named_arg<T>(), "nested named arguments");
   return {name, arg};
 }
 
-/**
-  \rst
-  A dynamic version of `fmt::format_arg_store`.
-  It's equipped with a storage to potentially temporary objects which lifetimes
-  could be shorter than the format arguments object.
-
-  It can be implicitly converted into `~fmt::basic_format_args` for passing
-  into type-erased formatting functions such as `~fmt::vformat`.
-  \endrst
- */
-template <typename Context>
-class dynamic_format_arg_store
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
-    // Workaround a GCC template argument substitution bug.
-    : public basic_format_args<Context>
-#endif
-{
- private:
-  using char_type = typename Context::char_type;
-
-  template <typename T> struct need_copy {
-    static constexpr detail::type mapped_type =
-        detail::mapped_type_constant<T, Context>::value;
-
-    enum {
-      value = !(detail::is_reference_wrapper<T>::value ||
-                std::is_same<T, basic_string_view<char_type>>::value ||
-                std::is_same<T, detail::std_string_view<char_type>>::value ||
-                (mapped_type != detail::type::cstring_type &&
-                 mapped_type != detail::type::string_type &&
-                 mapped_type != detail::type::custom_type))
-    };
-  };
-
-  template <typename T>
-  using stored_type = conditional_t<detail::is_string<T>::value,
-                                    std::basic_string<char_type>, T>;
-
-  // Storage of basic_format_arg must be contiguous.
-  std::vector<basic_format_arg<Context>> data_;
-  std::vector<detail::named_arg_info<char_type>> named_info_;
-
-  // Storage of arguments not fitting into basic_format_arg must grow
-  // without relocation because items in data_ refer to it.
-  detail::dynamic_arg_list dynamic_args_;
-
-  friend class basic_format_args<Context>;
-
-  unsigned long long get_types() const {
-    return detail::is_unpacked_bit | data_.size() |
-           (named_info_.empty()
-                ? 0ULL
-                : static_cast<unsigned long long>(detail::has_named_args_bit));
-  }
-
-  const basic_format_arg<Context>* data() const {
-    return named_info_.empty() ? data_.data() : data_.data() + 1;
-  }
-
-  template <typename T> void emplace_arg(const T& arg) {
-    data_.emplace_back(detail::make_arg<Context>(arg));
-  }
-
-  template <typename T>
-  void emplace_arg(const detail::named_arg<char_type, T>& arg) {
-    if (named_info_.empty()) {
-      constexpr const detail::named_arg_info<char_type>* zero_ptr{nullptr};
-      data_.insert(data_.begin(), {zero_ptr, 0});
-    }
-    data_.emplace_back(detail::make_arg<Context>(detail::unwrap(arg.value)));
-    auto pop_one = [](std::vector<basic_format_arg<Context>>* data) {
-      data->pop_back();
-    };
-    std::unique_ptr<std::vector<basic_format_arg<Context>>, decltype(pop_one)>
-        guard{&data_, pop_one};
-    named_info_.push_back({arg.name, static_cast<int>(data_.size() - 2u)});
-    data_[0].value_.named_args = {named_info_.data(), named_info_.size()};
-    guard.release();
-  }
-
- public:
-  /**
-    \rst
-    Adds an argument into the dynamic store for later passing to a formatting
-    function.
-
-    Note that custom types and string types (but not string views) are copied
-    into the store dynamically allocating memory if necessary.
-
-    **Example**::
-
-      fmt::dynamic_format_arg_store<fmt::format_context> store;
-      store.push_back(42);
-      store.push_back("abc");
-      store.push_back(1.5f);
-      std::string result = fmt::vformat("{} and {} and {}", store);
-    \endrst
-  */
-  template <typename T> void push_back(const T& arg) {
-    if (detail::const_check(need_copy<T>::value))
-      emplace_arg(dynamic_args_.push<stored_type<T>>(arg));
-    else
-      emplace_arg(detail::unwrap(arg));
-  }
-
-  /**
-    \rst
-    Adds a reference to the argument into the dynamic store for later passing to
-    a formatting function. Supports named arguments wrapped in
-    ``std::reference_wrapper`` via ``std::ref()``/``std::cref()``.
-
-    **Example**::
-
-      fmt::dynamic_format_arg_store<fmt::format_context> store;
-      char str[] = "1234567890";
-      store.push_back(std::cref(str));
-      int a1_val{42};
-      auto a1 = fmt::arg("a1_", a1_val);
-      store.push_back(std::cref(a1));
-
-      // Changing str affects the output but only for string and custom types.
-      str[0] = 'X';
-
-      std::string result = fmt::vformat("{} and {a1_}");
-      assert(result == "X234567890 and 42");
-    \endrst
-  */
-  template <typename T> void push_back(std::reference_wrapper<T> arg) {
-    static_assert(
-        detail::is_named_arg<typename std::remove_cv<T>::type>::value ||
-            need_copy<T>::value,
-        "objects of built-in types and string views are always copied");
-    emplace_arg(arg.get());
-  }
-
-  /**
-    Adds named argument into the dynamic store for later passing to a formatting
-    function. ``std::reference_wrapper`` is supported to avoid copying of the
-    argument.
-  */
-  template <typename T>
-  void push_back(const detail::named_arg<char_type, T>& arg) {
-    const char_type* arg_name =
-        dynamic_args_.push<std::basic_string<char_type>>(arg.name).c_str();
-    if (detail::const_check(need_copy<T>::value)) {
-      emplace_arg(
-          fmt::arg(arg_name, dynamic_args_.push<stored_type<T>>(arg.value)));
-    } else {
-      emplace_arg(fmt::arg(arg_name, arg.value));
-    }
-  }
-
-  /** Erase all elements from the store */
-  void clear() {
-    data_.clear();
-    named_info_.clear();
-    dynamic_args_ = detail::dynamic_arg_list();
-  }
-
-  /**
-    \rst
-    Reserves space to store at least *new_cap* arguments including
-    *new_cap_named* named arguments.
-    \endrst
-  */
-  void reserve(size_t new_cap, size_t new_cap_named) {
-    FMT_ASSERT(new_cap >= new_cap_named,
-               "Set of arguments includes set of named arguments");
-    data_.reserve(new_cap);
-    named_info_.reserve(new_cap_named);
-  }
-};
-
 /**
   \rst
   A view of a collection of formatting arguments. To avoid lifetime issues it
@@ -1854,25 +1926,27 @@ template <typename Context> class basic_format_args {
     const format_arg* args_;
   };
 
-  bool is_packed() const { return (desc_ & detail::is_unpacked_bit) == 0; }
-  bool has_named_args() const {
+  constexpr auto is_packed() const -> bool {
+    return (desc_ & detail::is_unpacked_bit) == 0;
+  }
+  auto has_named_args() const -> bool {
     return (desc_ & detail::has_named_args_bit) != 0;
   }
 
-  detail::type type(int index) const {
+  FMT_CONSTEXPR auto type(int index) const -> detail::type {
     int shift = index * detail::packed_arg_bits;
     unsigned int mask = (1 << detail::packed_arg_bits) - 1;
     return static_cast<detail::type>((desc_ >> shift) & mask);
   }
 
-  basic_format_args(unsigned long long desc,
-                    const detail::value<Context>* values)
+  constexpr FMT_INLINE basic_format_args(unsigned long long desc,
+                                         const detail::value<Context>* values)
       : desc_(desc), values_(values) {}
-  basic_format_args(unsigned long long desc, const format_arg* args)
+  constexpr basic_format_args(unsigned long long desc, const format_arg* args)
       : desc_(desc), args_(args) {}
 
  public:
-  basic_format_args() : desc_(0) {}
+  constexpr basic_format_args() : desc_(0), args_(nullptr) {}
 
   /**
    \rst
@@ -1880,8 +1954,10 @@ template <typename Context> class basic_format_args {
    \endrst
    */
   template <typename... Args>
-  FMT_INLINE basic_format_args(const format_arg_store<Context, Args...>& store)
-      : basic_format_args(store.desc, store.data_.args()) {}
+  constexpr FMT_INLINE basic_format_args(
+      const format_arg_store<Context, Args...>& store)
+      : basic_format_args(format_arg_store<Context, Args...>::desc,
+                          store.data_.args()) {}
 
   /**
    \rst
@@ -1889,7 +1965,8 @@ template <typename Context> class basic_format_args {
    `~fmt::dynamic_format_arg_store`.
    \endrst
    */
-  FMT_INLINE basic_format_args(const dynamic_format_arg_store<Context>& store)
+  constexpr FMT_INLINE basic_format_args(
+      const dynamic_format_arg_store<Context>& store)
       : basic_format_args(store.get_types(), store.data()) {}
 
   /**
@@ -1897,12 +1974,12 @@ template <typename Context> class basic_format_args {
    Constructs a `basic_format_args` object from a dynamic set of arguments.
    \endrst
    */
-  basic_format_args(const format_arg* args, int count)
+  constexpr basic_format_args(const format_arg* args, int count)
       : basic_format_args(detail::is_unpacked_bit | detail::to_unsigned(count),
                           args) {}
 
   /** Returns the argument with the specified id. */
-  format_arg get(int id) const {
+  FMT_CONSTEXPR auto get(int id) const -> format_arg {
     format_arg arg;
     if (!is_packed()) {
       if (id < max_size()) arg = args_[id];
@@ -1915,12 +1992,14 @@ template <typename Context> class basic_format_args {
     return arg;
   }
 
-  template <typename Char> format_arg get(basic_string_view<Char> name) const {
+  template <typename Char>
+  auto get(basic_string_view<Char> name) const -> format_arg {
     int id = get_id(name);
     return id >= 0 ? get(id) : format_arg();
   }
 
-  template <typename Char> int get_id(basic_string_view<Char> name) const {
+  template <typename Char>
+  auto get_id(basic_string_view<Char> name) const -> int {
     if (!has_named_args()) return -1;
     const auto& named_args =
         (is_packed() ? values_[-1] : args_[-1].value_).named_args;
@@ -1930,87 +2009,1149 @@ template <typename Context> class basic_format_args {
     return -1;
   }
 
-  int max_size() const {
+  auto max_size() const -> int {
     unsigned long long max_packed = detail::max_packed_args;
     return static_cast<int>(is_packed() ? max_packed
                                         : desc_ & ~detail::is_unpacked_bit);
   }
 };
 
-#ifdef FMT_ARM_ABI_COMPATIBILITY
 /** An alias to ``basic_format_args<format_context>``. */
-// Separate types would result in shorter symbols but break ABI compatibility
+// A separate type would result in shorter symbols but break ABI compatibility
 // between clang and gcc on ARM (#1919).
 using format_args = basic_format_args<format_context>;
-using wformat_args = basic_format_args<wformat_context>;
-#else
-// DEPRECATED! These are kept for ABI compatibility.
-// It is a separate type rather than an alias to make symbols readable.
-struct format_args : basic_format_args<format_context> {
-  template <typename... Args>
-  FMT_INLINE format_args(const Args&... args) : basic_format_args(args...) {}
+
+// We cannot use enum classes as bit fields because of a gcc bug
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414.
+namespace align {
+enum type { none, left, right, center, numeric };
+}
+using align_t = align::type;
+namespace sign {
+enum type { none, minus, plus, space };
+}
+using sign_t = sign::type;
+
+FMT_BEGIN_DETAIL_NAMESPACE
+
+// Workaround an array initialization issue in gcc 4.8.
+template <typename Char> struct fill_t {
+ private:
+  enum { max_size = 4 };
+  Char data_[max_size] = {Char(' '), Char(0), Char(0), Char(0)};
+  unsigned char size_ = 1;
+
+ public:
+  FMT_CONSTEXPR void operator=(basic_string_view<Char> s) {
+    auto size = s.size();
+    if (size > max_size) return throw_format_error("invalid fill");
+    for (size_t i = 0; i < size; ++i) data_[i] = s[i];
+    size_ = static_cast<unsigned char>(size);
+  }
+
+  constexpr auto size() const -> size_t { return size_; }
+  constexpr auto data() const -> const Char* { return data_; }
+
+  FMT_CONSTEXPR auto operator[](size_t index) -> Char& { return data_[index]; }
+  FMT_CONSTEXPR auto operator[](size_t index) const -> const Char& {
+    return data_[index];
+  }
 };
-struct wformat_args : basic_format_args<wformat_context> {
-  using basic_format_args::basic_format_args;
+FMT_END_DETAIL_NAMESPACE
+
+enum class presentation_type : unsigned char {
+  none,
+  // Integer types should go first,
+  dec,             // 'd'
+  oct,             // 'o'
+  hex_lower,       // 'x'
+  hex_upper,       // 'X'
+  bin_lower,       // 'b'
+  bin_upper,       // 'B'
+  hexfloat_lower,  // 'a'
+  hexfloat_upper,  // 'A'
+  exp_lower,       // 'e'
+  exp_upper,       // 'E'
+  fixed_lower,     // 'f'
+  fixed_upper,     // 'F'
+  general_lower,   // 'g'
+  general_upper,   // 'G'
+  chr,             // 'c'
+  string,          // 's'
+  pointer          // 'p'
 };
+
+// Format specifiers for built-in and string types.
+template <typename Char> struct basic_format_specs {
+  int width;
+  int precision;
+  presentation_type type;
+  align_t align : 4;
+  sign_t sign : 3;
+  bool alt : 1;  // Alternate form ('#').
+  bool localized : 1;
+  detail::fill_t<Char> fill;
+
+  constexpr basic_format_specs()
+      : width(0),
+        precision(-1),
+        type(presentation_type::none),
+        align(align::none),
+        sign(sign::none),
+        alt(false),
+        localized(false) {}
+};
+
+using format_specs = basic_format_specs<char>;
+
+FMT_BEGIN_DETAIL_NAMESPACE
+
+enum class arg_id_kind { none, index, name };
+
+// An argument reference.
+template <typename Char> struct arg_ref {
+  FMT_CONSTEXPR arg_ref() : kind(arg_id_kind::none), val() {}
+
+  FMT_CONSTEXPR explicit arg_ref(int index)
+      : kind(arg_id_kind::index), val(index) {}
+  FMT_CONSTEXPR explicit arg_ref(basic_string_view<Char> name)
+      : kind(arg_id_kind::name), val(name) {}
+
+  FMT_CONSTEXPR auto operator=(int idx) -> arg_ref& {
+    kind = arg_id_kind::index;
+    val.index = idx;
+    return *this;
+  }
+
+  arg_id_kind kind;
+  union value {
+    FMT_CONSTEXPR value(int id = 0) : index{id} {}
+    FMT_CONSTEXPR value(basic_string_view<Char> n) : name(n) {}
+
+    int index;
+    basic_string_view<Char> name;
+  } val;
+};
+
+// Format specifiers with width and precision resolved at formatting rather
+// than parsing time to allow re-using the same parsed specifiers with
+// different sets of arguments (precompilation of format strings).
+template <typename Char>
+struct dynamic_format_specs : basic_format_specs<Char> {
+  arg_ref<Char> width_ref;
+  arg_ref<Char> precision_ref;
+};
+
+struct auto_id {};
+
+// A format specifier handler that sets fields in basic_format_specs.
+template <typename Char> class specs_setter {
+ protected:
+  basic_format_specs<Char>& specs_;
+
+ public:
+  explicit FMT_CONSTEXPR specs_setter(basic_format_specs<Char>& specs)
+      : specs_(specs) {}
+
+  FMT_CONSTEXPR specs_setter(const specs_setter& other)
+      : specs_(other.specs_) {}
+
+  FMT_CONSTEXPR void on_align(align_t align) { specs_.align = align; }
+  FMT_CONSTEXPR void on_fill(basic_string_view<Char> fill) {
+    specs_.fill = fill;
+  }
+  FMT_CONSTEXPR void on_sign(sign_t s) { specs_.sign = s; }
+  FMT_CONSTEXPR void on_hash() { specs_.alt = true; }
+  FMT_CONSTEXPR void on_localized() { specs_.localized = true; }
+
+  FMT_CONSTEXPR void on_zero() {
+    if (specs_.align == align::none) specs_.align = align::numeric;
+    specs_.fill[0] = Char('0');
+  }
+
+  FMT_CONSTEXPR void on_width(int width) { specs_.width = width; }
+  FMT_CONSTEXPR void on_precision(int precision) {
+    specs_.precision = precision;
+  }
+  FMT_CONSTEXPR void end_precision() {}
+
+  FMT_CONSTEXPR void on_type(presentation_type type) { specs_.type = type; }
+};
+
+// Format spec handler that saves references to arguments representing dynamic
+// width and precision to be resolved at formatting time.
+template <typename ParseContext>
+class dynamic_specs_handler
+    : public specs_setter<typename ParseContext::char_type> {
+ public:
+  using char_type = typename ParseContext::char_type;
+
+  FMT_CONSTEXPR dynamic_specs_handler(dynamic_format_specs<char_type>& specs,
+                                      ParseContext& ctx)
+      : specs_setter<char_type>(specs), specs_(specs), context_(ctx) {}
+
+  FMT_CONSTEXPR dynamic_specs_handler(const dynamic_specs_handler& other)
+      : specs_setter<char_type>(other),
+        specs_(other.specs_),
+        context_(other.context_) {}
+
+  template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
+    specs_.width_ref = make_arg_ref(arg_id);
+  }
+
+  template <typename Id> FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) {
+    specs_.precision_ref = make_arg_ref(arg_id);
+  }
+
+  FMT_CONSTEXPR void on_error(const char* message) {
+    context_.on_error(message);
+  }
+
+ private:
+  dynamic_format_specs<char_type>& specs_;
+  ParseContext& context_;
+
+  using arg_ref_type = arg_ref<char_type>;
+
+  FMT_CONSTEXPR auto make_arg_ref(int arg_id) -> arg_ref_type {
+    context_.check_arg_id(arg_id);
+    return arg_ref_type(arg_id);
+  }
+
+  FMT_CONSTEXPR auto make_arg_ref(auto_id) -> arg_ref_type {
+    return arg_ref_type(context_.next_arg_id());
+  }
+
+  FMT_CONSTEXPR auto make_arg_ref(basic_string_view<char_type> arg_id)
+      -> arg_ref_type {
+    context_.check_arg_id(arg_id);
+    basic_string_view<char_type> format_str(
+        context_.begin(), to_unsigned(context_.end() - context_.begin()));
+    return arg_ref_type(arg_id);
+  }
+};
+
+template <typename Char> constexpr bool is_ascii_letter(Char c) {
+  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
+// Converts a character to ASCII. Returns a number > 127 on conversion failure.
+template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value)>
+constexpr auto to_ascii(Char value) -> Char {
+  return value;
+}
+template <typename Char, FMT_ENABLE_IF(std::is_enum<Char>::value)>
+constexpr auto to_ascii(Char value) ->
+    typename std::underlying_type<Char>::type {
+  return value;
+}
+
+template <typename Char>
+FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
+  if (const_check(sizeof(Char) != 1)) return 1;
+  auto lengths =
+      "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4";
+  int len = lengths[static_cast<unsigned char>(*begin) >> 3];
+
+  // Compute the pointer to the next character early so that the next
+  // iteration can start working on the next character. Neither Clang
+  // nor GCC figure out this reordering on their own.
+  return len + !len;
+}
+
+// Return the result via the out param to workaround gcc bug 77539.
+template <bool IS_CONSTEXPR, typename T, typename Ptr = const T*>
+FMT_CONSTEXPR auto find(Ptr first, Ptr last, T value, Ptr& out) -> bool {
+  for (out = first; out != last; ++out) {
+    if (*out == value) return true;
+  }
+  return false;
+}
+
+template <>
+inline auto find<false, char>(const char* first, const char* last, char value,
+                              const char*& out) -> bool {
+  out = static_cast<const char*>(
+      std::memchr(first, value, to_unsigned(last - first)));
+  return out != nullptr;
+}
+
+// Parses the range [begin, end) as an unsigned integer. This function assumes
+// that the range is non-empty and the first character is a digit.
+template <typename Char>
+FMT_CONSTEXPR auto parse_nonnegative_int(const Char*& begin, const Char* end,
+                                         int error_value) noexcept -> int {
+  FMT_ASSERT(begin != end && '0' <= *begin && *begin <= '9', "");
+  unsigned value = 0, prev = 0;
+  auto p = begin;
+  do {
+    prev = value;
+    value = value * 10 + unsigned(*p - '0');
+    ++p;
+  } while (p != end && '0' <= *p && *p <= '9');
+  auto num_digits = p - begin;
+  begin = p;
+  if (num_digits <= std::numeric_limits<int>::digits10)
+    return static_cast<int>(value);
+  // Check for overflow.
+  const unsigned max = to_unsigned((std::numeric_limits<int>::max)());
+  return num_digits == std::numeric_limits<int>::digits10 + 1 &&
+                 prev * 10ull + unsigned(p[-1] - '0') <= max
+             ? static_cast<int>(value)
+             : error_value;
+}
+
+// Parses fill and alignment.
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto parse_align(const Char* begin, const Char* end,
+                               Handler&& handler) -> const Char* {
+  FMT_ASSERT(begin != end, "");
+  auto align = align::none;
+  auto p = begin + code_point_length(begin);
+  if (p >= end) p = begin;
+  for (;;) {
+    switch (to_ascii(*p)) {
+    case '<':
+      align = align::left;
+      break;
+    case '>':
+      align = align::right;
+      break;
+    case '^':
+      align = align::center;
+      break;
+    default:
+      break;
+    }
+    if (align != align::none) {
+      if (p != begin) {
+        auto c = *begin;
+        if (c == '{')
+          return handler.on_error("invalid fill character '{'"), begin;
+        handler.on_fill(basic_string_view<Char>(begin, to_unsigned(p - begin)));
+        begin = p + 1;
+      } else
+        ++begin;
+      handler.on_align(align);
+      break;
+    } else if (p == begin) {
+      break;
+    }
+    p = begin;
+  }
+  return begin;
+}
+
+template <typename Char> FMT_CONSTEXPR bool is_name_start(Char c) {
+  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || '_' == c;
+}
+
+template <typename Char, typename IDHandler>
+FMT_CONSTEXPR auto do_parse_arg_id(const Char* begin, const Char* end,
+                                   IDHandler&& handler) -> const Char* {
+  FMT_ASSERT(begin != end, "");
+  Char c = *begin;
+  if (c >= '0' && c <= '9') {
+    int index = 0;
+    if (c != '0')
+      index =
+          parse_nonnegative_int(begin, end, (std::numeric_limits<int>::max)());
+    else
+      ++begin;
+    if (begin == end || (*begin != '}' && *begin != ':'))
+      handler.on_error("invalid format string");
+    else
+      handler(index);
+    return begin;
+  }
+  if (!is_name_start(c)) {
+    handler.on_error("invalid format string");
+    return begin;
+  }
+  auto it = begin;
+  do {
+    ++it;
+  } while (it != end && (is_name_start(c = *it) || ('0' <= c && c <= '9')));
+  handler(basic_string_view<Char>(begin, to_unsigned(it - begin)));
+  return it;
+}
+
+template <typename Char, typename IDHandler>
+FMT_CONSTEXPR FMT_INLINE auto parse_arg_id(const Char* begin, const Char* end,
+                                           IDHandler&& handler) -> const Char* {
+  Char c = *begin;
+  if (c != '}' && c != ':') return do_parse_arg_id(begin, end, handler);
+  handler();
+  return begin;
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto parse_width(const Char* begin, const Char* end,
+                               Handler&& handler) -> const Char* {
+  using detail::auto_id;
+  struct width_adapter {
+    Handler& handler;
+
+    FMT_CONSTEXPR void operator()() { handler.on_dynamic_width(auto_id()); }
+    FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_width(id); }
+    FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
+      handler.on_dynamic_width(id);
+    }
+    FMT_CONSTEXPR void on_error(const char* message) {
+      if (message) handler.on_error(message);
+    }
+  };
+
+  FMT_ASSERT(begin != end, "");
+  if ('0' <= *begin && *begin <= '9') {
+    int width = parse_nonnegative_int(begin, end, -1);
+    if (width != -1)
+      handler.on_width(width);
+    else
+      handler.on_error("number is too big");
+  } else if (*begin == '{') {
+    ++begin;
+    if (begin != end) begin = parse_arg_id(begin, end, width_adapter{handler});
+    if (begin == end || *begin != '}')
+      return handler.on_error("invalid format string"), begin;
+    ++begin;
+  }
+  return begin;
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto parse_precision(const Char* begin, const Char* end,
+                                   Handler&& handler) -> const Char* {
+  using detail::auto_id;
+  struct precision_adapter {
+    Handler& handler;
+
+    FMT_CONSTEXPR void operator()() { handler.on_dynamic_precision(auto_id()); }
+    FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_precision(id); }
+    FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
+      handler.on_dynamic_precision(id);
+    }
+    FMT_CONSTEXPR void on_error(const char* message) {
+      if (message) handler.on_error(message);
+    }
+  };
+
+  ++begin;
+  auto c = begin != end ? *begin : Char();
+  if ('0' <= c && c <= '9') {
+    auto precision = parse_nonnegative_int(begin, end, -1);
+    if (precision != -1)
+      handler.on_precision(precision);
+    else
+      handler.on_error("number is too big");
+  } else if (c == '{') {
+    ++begin;
+    if (begin != end)
+      begin = parse_arg_id(begin, end, precision_adapter{handler});
+    if (begin == end || *begin++ != '}')
+      return handler.on_error("invalid format string"), begin;
+  } else {
+    return handler.on_error("missing precision specifier"), begin;
+  }
+  handler.end_precision();
+  return begin;
+}
+
+template <typename Char>
+FMT_CONSTEXPR auto parse_presentation_type(Char type) -> presentation_type {
+  switch (to_ascii(type)) {
+  case 'd':
+    return presentation_type::dec;
+  case 'o':
+    return presentation_type::oct;
+  case 'x':
+    return presentation_type::hex_lower;
+  case 'X':
+    return presentation_type::hex_upper;
+  case 'b':
+    return presentation_type::bin_lower;
+  case 'B':
+    return presentation_type::bin_upper;
+  case 'a':
+    return presentation_type::hexfloat_lower;
+  case 'A':
+    return presentation_type::hexfloat_upper;
+  case 'e':
+    return presentation_type::exp_lower;
+  case 'E':
+    return presentation_type::exp_upper;
+  case 'f':
+    return presentation_type::fixed_lower;
+  case 'F':
+    return presentation_type::fixed_upper;
+  case 'g':
+    return presentation_type::general_lower;
+  case 'G':
+    return presentation_type::general_upper;
+  case 'c':
+    return presentation_type::chr;
+  case 's':
+    return presentation_type::string;
+  case 'p':
+    return presentation_type::pointer;
+  default:
+    return presentation_type::none;
+  }
+}
+
+// Parses standard format specifiers and sends notifications about parsed
+// components to handler.
+template <typename Char, typename SpecHandler>
+FMT_CONSTEXPR FMT_INLINE auto parse_format_specs(const Char* begin,
+                                                 const Char* end,
+                                                 SpecHandler&& handler)
+    -> const Char* {
+  if (1 < end - begin && begin[1] == '}' && is_ascii_letter(*begin) &&
+      *begin != 'L') {
+    presentation_type type = parse_presentation_type(*begin++);
+    if (type == presentation_type::none)
+      handler.on_error("invalid type specifier");
+    handler.on_type(type);
+    return begin;
+  }
+
+  if (begin == end) return begin;
+
+  begin = parse_align(begin, end, handler);
+  if (begin == end) return begin;
+
+  // Parse sign.
+  switch (to_ascii(*begin)) {
+  case '+':
+    handler.on_sign(sign::plus);
+    ++begin;
+    break;
+  case '-':
+    handler.on_sign(sign::minus);
+    ++begin;
+    break;
+  case ' ':
+    handler.on_sign(sign::space);
+    ++begin;
+    break;
+  default:
+    break;
+  }
+  if (begin == end) return begin;
+
+  if (*begin == '#') {
+    handler.on_hash();
+    if (++begin == end) return begin;
+  }
+
+  // Parse zero flag.
+  if (*begin == '0') {
+    handler.on_zero();
+    if (++begin == end) return begin;
+  }
+
+  begin = parse_width(begin, end, handler);
+  if (begin == end) return begin;
+
+  // Parse precision.
+  if (*begin == '.') {
+    begin = parse_precision(begin, end, handler);
+    if (begin == end) return begin;
+  }
+
+  if (*begin == 'L') {
+    handler.on_localized();
+    ++begin;
+  }
+
+  // Parse type.
+  if (begin != end && *begin != '}') {
+    presentation_type type = parse_presentation_type(*begin++);
+    if (type == presentation_type::none)
+      handler.on_error("invalid type specifier");
+    handler.on_type(type);
+  }
+  return begin;
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto parse_replacement_field(const Char* begin, const Char* end,
+                                           Handler&& handler) -> const Char* {
+  struct id_adapter {
+    Handler& handler;
+    int arg_id;
+
+    FMT_CONSTEXPR void operator()() { arg_id = handler.on_arg_id(); }
+    FMT_CONSTEXPR void operator()(int id) { arg_id = handler.on_arg_id(id); }
+    FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
+      arg_id = handler.on_arg_id(id);
+    }
+    FMT_CONSTEXPR void on_error(const char* message) {
+      if (message) handler.on_error(message);
+    }
+  };
+
+  ++begin;
+  if (begin == end) return handler.on_error("invalid format string"), end;
+  if (*begin == '}') {
+    handler.on_replacement_field(handler.on_arg_id(), begin);
+  } else if (*begin == '{') {
+    handler.on_text(begin, begin + 1);
+  } else {
+    auto adapter = id_adapter{handler, 0};
+    begin = parse_arg_id(begin, end, adapter);
+    Char c = begin != end ? *begin : Char();
+    if (c == '}') {
+      handler.on_replacement_field(adapter.arg_id, begin);
+    } else if (c == ':') {
+      begin = handler.on_format_specs(adapter.arg_id, begin + 1, end);
+      if (begin == end || *begin != '}')
+        return handler.on_error("unknown format specifier"), end;
+    } else {
+      return handler.on_error("missing '}' in format string"), end;
+    }
+  }
+  return begin + 1;
+}
+
+template <bool IS_CONSTEXPR, typename Char, typename Handler>
+FMT_CONSTEXPR FMT_INLINE void parse_format_string(
+    basic_string_view<Char> format_str, Handler&& handler) {
+  // Workaround a name-lookup bug in MSVC's modules implementation.
+  using detail::find;
+
+  auto begin = format_str.data();
+  auto end = begin + format_str.size();
+  if (end - begin < 32) {
+    // Use a simple loop instead of memchr for small strings.
+    const Char* p = begin;
+    while (p != end) {
+      auto c = *p++;
+      if (c == '{') {
+        handler.on_text(begin, p - 1);
+        begin = p = parse_replacement_field(p - 1, end, handler);
+      } else if (c == '}') {
+        if (p == end || *p != '}')
+          return handler.on_error("unmatched '}' in format string");
+        handler.on_text(begin, p);
+        begin = ++p;
+      }
+    }
+    handler.on_text(begin, end);
+    return;
+  }
+  struct writer {
+    FMT_CONSTEXPR void operator()(const Char* pbegin, const Char* pend) {
+      if (pbegin == pend) return;
+      for (;;) {
+        const Char* p = nullptr;
+        if (!find<IS_CONSTEXPR>(pbegin, pend, Char('}'), p))
+          return handler_.on_text(pbegin, pend);
+        ++p;
+        if (p == pend || *p != '}')
+          return handler_.on_error("unmatched '}' in format string");
+        handler_.on_text(pbegin, p);
+        pbegin = p + 1;
+      }
+    }
+    Handler& handler_;
+  } write{handler};
+  while (begin != end) {
+    // Doing two passes with memchr (one for '{' and another for '}') is up to
+    // 2.5x faster than the naive one-pass implementation on big format strings.
+    const Char* p = begin;
+    if (*begin != '{' && !find<IS_CONSTEXPR>(begin + 1, end, Char('{'), p))
+      return write(begin, end);
+    write(begin, p);
+    begin = parse_replacement_field(p, end, handler);
+  }
+}
+
+template <typename T, typename ParseContext>
+FMT_CONSTEXPR auto parse_format_specs(ParseContext& ctx)
+    -> decltype(ctx.begin()) {
+  using char_type = typename ParseContext::char_type;
+  using context = buffer_context<char_type>;
+  using mapped_type = conditional_t<
+      mapped_type_constant<T, context>::value != type::custom_type,
+      decltype(arg_mapper<context>().map(std::declval<const T&>())), T>;
+  auto f = conditional_t<has_formatter<mapped_type, context>::value,
+                         formatter<mapped_type, char_type>,
+                         fallback_formatter<T, char_type>>();
+  return f.parse(ctx);
+}
+
+// A parse context with extra argument id checks. It is only used at compile
+// time because adding checks at runtime would introduce substantial overhead
+// and would be redundant since argument ids are checked when arguments are
+// retrieved anyway.
+template <typename Char, typename ErrorHandler = error_handler>
+class compile_parse_context
+    : public basic_format_parse_context<Char, ErrorHandler> {
+ private:
+  int num_args_;
+  using base = basic_format_parse_context<Char, ErrorHandler>;
+
+ public:
+  explicit FMT_CONSTEXPR compile_parse_context(
+      basic_string_view<Char> format_str,
+      int num_args = (std::numeric_limits<int>::max)(), ErrorHandler eh = {})
+      : base(format_str, eh), num_args_(num_args) {}
+
+  FMT_CONSTEXPR auto next_arg_id() -> int {
+    int id = base::next_arg_id();
+    if (id >= num_args_) this->on_error("argument not found");
+    return id;
+  }
+
+  FMT_CONSTEXPR void check_arg_id(int id) {
+    base::check_arg_id(id);
+    if (id >= num_args_) this->on_error("argument not found");
+  }
+  using base::check_arg_id;
+};
+
+template <typename ErrorHandler>
+FMT_CONSTEXPR void check_int_type_spec(presentation_type type,
+                                       ErrorHandler&& eh) {
+  if (type > presentation_type::bin_upper && type != presentation_type::chr)
+    eh.on_error("invalid type specifier");
+}
+
+// Checks char specs and returns true if the type spec is char (and not int).
+template <typename Char, typename ErrorHandler = error_handler>
+FMT_CONSTEXPR auto check_char_specs(const basic_format_specs<Char>& specs,
+                                    ErrorHandler&& eh = {}) -> bool {
+  if (specs.type != presentation_type::none &&
+      specs.type != presentation_type::chr) {
+    check_int_type_spec(specs.type, eh);
+    return false;
+  }
+  if (specs.align == align::numeric || specs.sign != sign::none || specs.alt)
+    eh.on_error("invalid format specifier for char");
+  return true;
+}
+
+// A floating-point presentation format.
+enum class float_format : unsigned char {
+  general,  // General: exponent notation or fixed point based on magnitude.
+  exp,      // Exponent notation with the default precision of 6, e.g. 1.2e-3.
+  fixed,    // Fixed point with the default precision of 6, e.g. 0.0012.
+  hex
+};
+
+struct float_specs {
+  int precision;
+  float_format format : 8;
+  sign_t sign : 8;
+  bool upper : 1;
+  bool locale : 1;
+  bool binary32 : 1;
+  bool fallback : 1;
+  bool showpoint : 1;
+};
+
+template <typename ErrorHandler = error_handler, typename Char>
+FMT_CONSTEXPR auto parse_float_type_spec(const basic_format_specs<Char>& specs,
+                                         ErrorHandler&& eh = {})
+    -> float_specs {
+  auto result = float_specs();
+  result.showpoint = specs.alt;
+  result.locale = specs.localized;
+  switch (specs.type) {
+  case presentation_type::none:
+    result.format = float_format::general;
+    break;
+  case presentation_type::general_upper:
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case presentation_type::general_lower:
+    result.format = float_format::general;
+    break;
+  case presentation_type::exp_upper:
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case presentation_type::exp_lower:
+    result.format = float_format::exp;
+    result.showpoint |= specs.precision != 0;
+    break;
+  case presentation_type::fixed_upper:
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case presentation_type::fixed_lower:
+    result.format = float_format::fixed;
+    result.showpoint |= specs.precision != 0;
+    break;
+  case presentation_type::hexfloat_upper:
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case presentation_type::hexfloat_lower:
+    result.format = float_format::hex;
+    break;
+  default:
+    eh.on_error("invalid type specifier");
+    break;
+  }
+  return result;
+}
+
+template <typename ErrorHandler = error_handler>
+FMT_CONSTEXPR auto check_cstring_type_spec(presentation_type type,
+                                           ErrorHandler&& eh = {}) -> bool {
+  if (type == presentation_type::none || type == presentation_type::string)
+    return true;
+  if (type != presentation_type::pointer) eh.on_error("invalid type specifier");
+  return false;
+}
+
+template <typename ErrorHandler = error_handler>
+FMT_CONSTEXPR void check_string_type_spec(presentation_type type,
+                                          ErrorHandler&& eh = {}) {
+  if (type != presentation_type::none && type != presentation_type::string)
+    eh.on_error("invalid type specifier");
+}
+
+template <typename ErrorHandler>
+FMT_CONSTEXPR void check_pointer_type_spec(presentation_type type,
+                                           ErrorHandler&& eh) {
+  if (type != presentation_type::none && type != presentation_type::pointer)
+    eh.on_error("invalid type specifier");
+}
+
+// A parse_format_specs handler that checks if specifiers are consistent with
+// the argument type.
+template <typename Handler> class specs_checker : public Handler {
+ private:
+  detail::type arg_type_;
+
+  FMT_CONSTEXPR void require_numeric_argument() {
+    if (!is_arithmetic_type(arg_type_))
+      this->on_error("format specifier requires numeric argument");
+  }
+
+ public:
+  FMT_CONSTEXPR specs_checker(const Handler& handler, detail::type arg_type)
+      : Handler(handler), arg_type_(arg_type) {}
+
+  FMT_CONSTEXPR void on_align(align_t align) {
+    if (align == align::numeric) require_numeric_argument();
+    Handler::on_align(align);
+  }
+
+  FMT_CONSTEXPR void on_sign(sign_t s) {
+    require_numeric_argument();
+    if (is_integral_type(arg_type_) && arg_type_ != type::int_type &&
+        arg_type_ != type::long_long_type && arg_type_ != type::char_type) {
+      this->on_error("format specifier requires signed argument");
+    }
+    Handler::on_sign(s);
+  }
+
+  FMT_CONSTEXPR void on_hash() {
+    require_numeric_argument();
+    Handler::on_hash();
+  }
+
+  FMT_CONSTEXPR void on_localized() {
+    require_numeric_argument();
+    Handler::on_localized();
+  }
+
+  FMT_CONSTEXPR void on_zero() {
+    require_numeric_argument();
+    Handler::on_zero();
+  }
+
+  FMT_CONSTEXPR void end_precision() {
+    if (is_integral_type(arg_type_) || arg_type_ == type::pointer_type)
+      this->on_error("precision not allowed for this argument type");
+  }
+};
+
+constexpr int invalid_arg_index = -1;
+
+#if FMT_USE_NONTYPE_TEMPLATE_PARAMETERS
+template <int N, typename T, typename... Args, typename Char>
+constexpr auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
+  if constexpr (detail::is_statically_named_arg<T>()) {
+    if (name == T::name) return N;
+  }
+  if constexpr (sizeof...(Args) > 0)
+    return get_arg_index_by_name<N + 1, Args...>(name);
+  (void)name;  // Workaround an MSVC bug about "unused" parameter.
+  return invalid_arg_index;
+}
 #endif
 
-namespace detail {
+template <typename... Args, typename Char>
+FMT_CONSTEXPR auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
+#if FMT_USE_NONTYPE_TEMPLATE_PARAMETERS
+  if constexpr (sizeof...(Args) > 0)
+    return get_arg_index_by_name<0, Args...>(name);
+#endif
+  (void)name;
+  return invalid_arg_index;
+}
 
-template <typename Char, FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
-std::basic_string<Char> vformat(
-    basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args);
+template <typename Char, typename ErrorHandler, typename... Args>
+class format_string_checker {
+ private:
+  using parse_context_type = compile_parse_context<Char, ErrorHandler>;
+  enum { num_args = sizeof...(Args) };
 
-FMT_API std::string vformat(string_view format_str, format_args args);
+  // Format specifier parsing function.
+  using parse_func = const Char* (*)(parse_context_type&);
+
+  parse_context_type context_;
+  parse_func parse_funcs_[num_args > 0 ? num_args : 1];
+
+ public:
+  explicit FMT_CONSTEXPR format_string_checker(
+      basic_string_view<Char> format_str, ErrorHandler eh)
+      : context_(format_str, num_args, eh),
+        parse_funcs_{&parse_format_specs<Args, parse_context_type>...} {}
+
+  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
+
+  FMT_CONSTEXPR auto on_arg_id() -> int { return context_.next_arg_id(); }
+  FMT_CONSTEXPR auto on_arg_id(int id) -> int {
+    return context_.check_arg_id(id), id;
+  }
+  FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
+#if FMT_USE_NONTYPE_TEMPLATE_PARAMETERS
+    auto index = get_arg_index_by_name<Args...>(id);
+    if (index == invalid_arg_index) on_error("named argument is not found");
+    return context_.check_arg_id(index), index;
+#else
+    (void)id;
+    on_error("compile-time checks for named arguments require C++20 support");
+    return 0;
+#endif
+  }
+
+  FMT_CONSTEXPR void on_replacement_field(int, const Char*) {}
+
+  FMT_CONSTEXPR auto on_format_specs(int id, const Char* begin, const Char*)
+      -> const Char* {
+    context_.advance_to(context_.begin() + (begin - &*context_.begin()));
+    // id >= 0 check is a workaround for gcc 10 bug (#2065).
+    return id >= 0 && id < num_args ? parse_funcs_[id](context_) : begin;
+  }
+
+  FMT_CONSTEXPR void on_error(const char* message) {
+    context_.on_error(message);
+  }
+};
+
+template <typename... Args, typename S,
+          enable_if_t<(is_compile_string<S>::value), int>>
+void check_format_string(S format_str) {
+  FMT_CONSTEXPR auto s = to_string_view(format_str);
+  using checker = format_string_checker<typename S::char_type, error_handler,
+                                        remove_cvref_t<Args>...>;
+  FMT_CONSTEXPR bool invalid_format =
+      (parse_format_string<true>(s, checker(s, {})), true);
+  ignore_unused(invalid_format);
+}
 
 template <typename Char>
 void vformat_to(
-    buffer<Char>& buf, basic_string_view<Char> format_str,
+    buffer<Char>& buf, basic_string_view<Char> fmt,
     basic_format_args<FMT_BUFFER_CONTEXT(type_identity_t<Char>)> args,
-    detail::locale_ref loc = {});
-
-template <typename Char, typename Args,
-          FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
-inline void vprint_mojibake(std::FILE*, basic_string_view<Char>, const Args&) {}
+    locale_ref loc = {});
 
 FMT_API void vprint_mojibake(std::FILE*, string_view, format_args);
 #ifndef _WIN32
 inline void vprint_mojibake(std::FILE*, string_view, format_args) {}
 #endif
-}  // namespace detail
+FMT_END_DETAIL_NAMESPACE
+
+// A formatter specialization for the core types corresponding to detail::type
+// constants.
+template <typename T, typename Char>
+struct formatter<T, Char,
+                 enable_if_t<detail::type_constant<T, Char>::value !=
+                             detail::type::custom_type>> {
+ private:
+  detail::dynamic_format_specs<Char> specs_;
+
+ public:
+  // Parses format specifiers stopping either at the end of the range or at the
+  // terminating '}'.
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    auto begin = ctx.begin(), end = ctx.end();
+    if (begin == end) return begin;
+    using handler_type = detail::dynamic_specs_handler<ParseContext>;
+    auto type = detail::type_constant<T, Char>::value;
+    auto checker =
+        detail::specs_checker<handler_type>(handler_type(specs_, ctx), type);
+    auto it = detail::parse_format_specs(begin, end, checker);
+    auto eh = ctx.error_handler();
+    switch (type) {
+    case detail::type::none_type:
+      FMT_ASSERT(false, "invalid argument type");
+      break;
+    case detail::type::bool_type:
+      if (specs_.type == presentation_type::none ||
+          specs_.type == presentation_type::string) {
+        break;
+      }
+      FMT_FALLTHROUGH;
+    case detail::type::int_type:
+    case detail::type::uint_type:
+    case detail::type::long_long_type:
+    case detail::type::ulong_long_type:
+    case detail::type::int128_type:
+    case detail::type::uint128_type:
+      detail::check_int_type_spec(specs_.type, eh);
+      break;
+    case detail::type::char_type:
+      detail::check_char_specs(specs_, eh);
+      break;
+    case detail::type::float_type:
+      if (detail::const_check(FMT_USE_FLOAT))
+        detail::parse_float_type_spec(specs_, eh);
+      else
+        FMT_ASSERT(false, "float support disabled");
+      break;
+    case detail::type::double_type:
+      if (detail::const_check(FMT_USE_DOUBLE))
+        detail::parse_float_type_spec(specs_, eh);
+      else
+        FMT_ASSERT(false, "double support disabled");
+      break;
+    case detail::type::long_double_type:
+      if (detail::const_check(FMT_USE_LONG_DOUBLE))
+        detail::parse_float_type_spec(specs_, eh);
+      else
+        FMT_ASSERT(false, "long double support disabled");
+      break;
+    case detail::type::cstring_type:
+      detail::check_cstring_type_spec(specs_.type, eh);
+      break;
+    case detail::type::string_type:
+      detail::check_string_type_spec(specs_.type, eh);
+      break;
+    case detail::type::pointer_type:
+      detail::check_pointer_type_spec(specs_.type, eh);
+      break;
+    case detail::type::custom_type:
+      // Custom format specifiers are checked in parse functions of
+      // formatter specializations.
+      break;
+    }
+    return it;
+  }
+
+  template <typename FormatContext>
+  FMT_CONSTEXPR auto format(const T& val, FormatContext& ctx) const
+      -> decltype(ctx.out());
+};
+
+template <typename Char> struct basic_runtime { basic_string_view<Char> str; };
+
+/** A compile-time format string. */
+template <typename Char, typename... Args> class basic_format_string {
+ private:
+  basic_string_view<Char> str_;
+
+ public:
+  template <typename S,
+            FMT_ENABLE_IF(
+                std::is_convertible<const S&, basic_string_view<Char>>::value)>
+  FMT_CONSTEVAL FMT_INLINE basic_format_string(const S& s) : str_(s) {
+    static_assert(
+        detail::count<
+            (std::is_base_of<detail::view, remove_reference_t<Args>>::value &&
+             std::is_reference<Args>::value)...>() == 0,
+        "passing views as lvalues is disallowed");
+#ifdef FMT_HAS_CONSTEVAL
+    if constexpr (detail::count_named_args<Args...>() ==
+                  detail::count_statically_named_args<Args...>()) {
+      using checker = detail::format_string_checker<Char, detail::error_handler,
+                                                    remove_cvref_t<Args>...>;
+      detail::parse_format_string<true>(str_, checker(s, {}));
+    }
+#else
+    detail::check_format_string<Args...>(s);
+#endif
+  }
+  basic_format_string(basic_runtime<Char> r) : str_(r.str) {}
+
+  FMT_INLINE operator basic_string_view<Char>() const { return str_; }
+};
+
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+// Workaround broken conversion on older gcc.
+template <typename... Args> using format_string = string_view;
+template <typename S> auto runtime(const S& s) -> basic_string_view<char_t<S>> {
+  return s;
+}
+#else
+template <typename... Args>
+using format_string = basic_format_string<char, type_identity_t<Args>...>;
+/**
+  \rst
+  Creates a runtime format string.
+
+  **Example**::
+
+    // Check format string at runtime instead of compile-time.
+    fmt::print(fmt::runtime("{:d}"), "I am not a number");
+  \endrst
+ */
+template <typename S> auto runtime(const S& s) -> basic_runtime<char_t<S>> {
+  return {{s}};
+}
+#endif
+
+FMT_API auto vformat(string_view fmt, format_args args) -> std::string;
+
+/**
+  \rst
+  Formats ``args`` according to specifications in ``fmt`` and returns the result
+  as a string.
+
+  **Example**::
+
+    #include <fmt/core.h>
+    std::string message = fmt::format("The answer is {}.", 42);
+  \endrst
+*/
+template <typename... T>
+FMT_NODISCARD FMT_INLINE auto format(format_string<T...> fmt, T&&... args)
+    -> std::string {
+  return vformat(fmt, fmt::make_format_args(args...));
+}
 
 /** Formats a string and writes the output to ``out``. */
-// GCC 8 and earlier cannot handle std::back_insert_iterator<Container> with
-// vformat_to<ArgFormatter>(...) overload, so SFINAE on iterator type instead.
-template <typename OutputIt, typename S, typename Char = char_t<S>,
-          bool enable = detail::is_output_iterator<OutputIt, Char>::value>
-auto vformat_to(OutputIt out, const S& format_str,
-                basic_format_args<buffer_context<type_identity_t<Char>>> args)
-    -> typename std::enable_if<enable, OutputIt>::type {
-  decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
-  detail::vformat_to(buf, to_string_view(format_str), args);
+template <typename OutputIt,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+auto vformat_to(OutputIt out, string_view fmt, format_args args) -> OutputIt {
+  using detail::get_buffer;
+  auto&& buf = get_buffer<char>(out);
+  detail::vformat_to(buf, fmt, args, {});
   return detail::get_iterator(buf);
 }
 
 /**
  \rst
- Formats arguments, writes the result to the output iterator ``out`` and returns
- the iterator past the end of the output range.
+ Formats ``args`` according to specifications in ``fmt``, writes the result to
+ the output iterator ``out`` and returns the iterator past the end of the output
+ range. `format_to` does not append a terminating null character.
 
  **Example**::
 
-   std::vector<char> out;
+   auto out = std::vector<char>();
    fmt::format_to(std::back_inserter(out), "{}", 42);
  \endrst
  */
-// We cannot use FMT_ENABLE_IF because of a bug in gcc 8.3.
-template <typename OutputIt, typename S, typename... Args,
-          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value>
-inline auto format_to(OutputIt out, const S& format_str, Args&&... args) ->
-    typename std::enable_if<enable, OutputIt>::type {
-  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
-  return vformat_to(out, to_string_view(format_str), vargs);
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+FMT_INLINE auto format_to(OutputIt out, format_string<T...> fmt, T&&... args)
+    -> OutputIt {
+  return vformat_to(out, fmt, fmt::make_format_args(args...));
 }
 
 template <typename OutputIt> struct format_to_n_result {
@@ -2020,111 +3161,82 @@ template <typename OutputIt> struct format_to_n_result {
   size_t size;
 };
 
-template <typename OutputIt, typename Char, typename... Args,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
-inline format_to_n_result<OutputIt> vformat_to_n(
-    OutputIt out, size_t n, basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  detail::iterator_buffer<OutputIt, Char, detail::fixed_buffer_traits> buf(out,
-                                                                           n);
-  detail::vformat_to(buf, format_str, args);
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+auto vformat_to_n(OutputIt out, size_t n, string_view fmt, format_args args)
+    -> format_to_n_result<OutputIt> {
+  using traits = detail::fixed_buffer_traits;
+  auto buf = detail::iterator_buffer<OutputIt, char, traits>(out, n);
+  detail::vformat_to(buf, fmt, args, {});
   return {buf.out(), buf.count()};
 }
 
 /**
- \rst
- Formats arguments, writes up to ``n`` characters of the result to the output
- iterator ``out`` and returns the total output size and the iterator past the
- end of the output range.
- \endrst
+  \rst
+  Formats ``args`` according to specifications in ``fmt``, writes up to ``n``
+  characters of the result to the output iterator ``out`` and returns the total
+  (not truncated) output size and the iterator past the end of the output range.
+  `format_to_n` does not append a terminating null character.
+  \endrst
  */
-template <typename OutputIt, typename S, typename... Args,
-          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value>
-inline auto format_to_n(OutputIt out, size_t n, const S& format_str,
-                        const Args&... args) ->
-    typename std::enable_if<enable, format_to_n_result<OutputIt>>::type {
-  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
-  return vformat_to_n(out, n, to_string_view(format_str), vargs);
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+FMT_INLINE auto format_to_n(OutputIt out, size_t n, format_string<T...> fmt,
+                            T&&... args) -> format_to_n_result<OutputIt> {
+  return vformat_to_n(out, n, fmt, fmt::make_format_args(args...));
 }
 
-/**
-  Returns the number of characters in the output of
-  ``format(format_str, args...)``.
- */
-template <typename... Args>
-inline size_t formatted_size(string_view format_str, Args&&... args) {
-  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
-  detail::counting_buffer<> buf;
-  detail::vformat_to(buf, format_str, vargs);
+/** Returns the number of chars in the output of ``format(fmt, args...)``. */
+template <typename... T>
+FMT_NODISCARD FMT_INLINE auto formatted_size(format_string<T...> fmt,
+                                             T&&... args) -> size_t {
+  auto buf = detail::counting_buffer<>();
+  detail::vformat_to(buf, string_view(fmt), fmt::make_format_args(args...), {});
   return buf.count();
 }
 
-template <typename S, typename Char = char_t<S>>
-FMT_INLINE std::basic_string<Char> vformat(
-    const S& format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  return detail::vformat(to_string_view(format_str), args);
-}
+FMT_API void vprint(string_view fmt, format_args args);
+FMT_API void vprint(std::FILE* f, string_view fmt, format_args args);
 
 /**
   \rst
-  Formats arguments and returns the result as a string.
-
-  **Example**::
-
-    #include <fmt/core.h>
-    std::string message = fmt::format("The answer is {}", 42);
-  \endrst
-*/
-// Pass char_t as a default template parameter instead of using
-// std::basic_string<char_t<S>> to reduce the symbol size.
-template <typename S, typename... Args, typename Char = char_t<S>>
-FMT_INLINE std::basic_string<Char> format(const S& format_str, Args&&... args) {
-  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
-  return detail::vformat(to_string_view(format_str), vargs);
-}
-
-FMT_API void vprint(string_view, format_args);
-FMT_API void vprint(std::FILE*, string_view, format_args);
-
-/**
-  \rst
-  Formats ``args`` according to specifications in ``format_str`` and writes the
-  output to the file ``f``. Strings are assumed to be Unicode-encoded unless the
-  ``FMT_UNICODE`` macro is set to 0.
-
-  **Example**::
-
-    fmt::print(stderr, "Don't {}!", "panic");
-  \endrst
- */
-template <typename S, typename... Args, typename Char = char_t<S>>
-inline void print(std::FILE* f, const S& format_str, Args&&... args) {
-  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
-  return detail::is_unicode<Char>()
-             ? vprint(f, to_string_view(format_str), vargs)
-             : detail::vprint_mojibake(f, to_string_view(format_str), vargs);
-}
-
-/**
-  \rst
-  Formats ``args`` according to specifications in ``format_str`` and writes
-  the output to ``stdout``. Strings are assumed to be Unicode-encoded unless
-  the ``FMT_UNICODE`` macro is set to 0.
+  Formats ``args`` according to specifications in ``fmt`` and writes the output
+  to ``stdout``.
 
   **Example**::
 
     fmt::print("Elapsed time: {0:.2f} seconds", 1.23);
   \endrst
  */
-template <typename S, typename... Args, typename Char = char_t<S>>
-inline void print(const S& format_str, Args&&... args) {
-  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
-  return detail::is_unicode<Char>()
-             ? vprint(to_string_view(format_str), vargs)
-             : detail::vprint_mojibake(stdout, to_string_view(format_str),
-                                       vargs);
+template <typename... T>
+FMT_INLINE void print(format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  return detail::is_utf8() ? vprint(fmt, vargs)
+                           : detail::vprint_mojibake(stdout, fmt, vargs);
 }
+
+/**
+  \rst
+  Formats ``args`` according to specifications in ``fmt`` and writes the
+  output to the file ``f``.
+
+  **Example**::
+
+    fmt::print(stderr, "Don't {}!", "panic");
+  \endrst
+ */
+template <typename... T>
+FMT_INLINE void print(std::FILE* f, format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  return detail::is_utf8() ? vprint(f, fmt, vargs)
+                           : detail::vprint_mojibake(f, fmt, vargs);
+}
+
+FMT_MODULE_EXPORT_END
+FMT_GCC_PRAGMA("GCC pop_options")
 FMT_END_NAMESPACE
 
+#ifdef FMT_HEADER_ONLY
+#  include "format.h"
+#endif
 #endif  // FMT_CORE_H_
diff --git a/src/fmt/format-inl.h b/src/fmt/format-inl.h
index 8f2fe7354a..3c0739a1ed 100644
--- a/src/fmt/format-inl.h
+++ b/src/fmt/format-inl.h
@@ -8,8 +8,9 @@
 #ifndef FMT_FORMAT_INL_H_
 #define FMT_FORMAT_INL_H_
 
-#include <cassert>
+#include <algorithm>
 #include <cctype>
+#include <cerrno>  // errno
 #include <climits>
 #include <cmath>
 #include <cstdarg>
@@ -27,11 +28,6 @@
 
 #include "format.h"
 
-// Dummy implementations of strerror_r and strerror_s called if corresponding
-// system functions are not available.
-inline fmt::detail::null<> strerror_r(int, char*, ...) { return {}; }
-inline fmt::detail::null<> strerror_s(char*, size_t, ...) { return {}; }
-
 FMT_BEGIN_NAMESPACE
 namespace detail {
 
@@ -44,6 +40,10 @@ FMT_FUNC void assert_fail(const char* file, int line, const char* message) {
   std::terminate();
 }
 
+FMT_FUNC void throw_format_error(const char* message) {
+  FMT_THROW(format_error(message));
+}
+
 #ifndef _MSC_VER
 #  define FMT_SNPRINTF snprintf
 #else  // _MSC_VER
@@ -57,76 +57,6 @@ inline int fmt_snprintf(char* buffer, size_t size, const char* format, ...) {
 #  define FMT_SNPRINTF fmt_snprintf
 #endif  // _MSC_VER
 
-// A portable thread-safe version of strerror.
-// Sets buffer to point to a string describing the error code.
-// This can be either a pointer to a string stored in buffer,
-// or a pointer to some static immutable string.
-// Returns one of the following values:
-//   0      - success
-//   ERANGE - buffer is not large enough to store the error message
-//   other  - failure
-// Buffer should be at least of size 1.
-inline int safe_strerror(int error_code, char*& buffer,
-                         size_t buffer_size) FMT_NOEXCEPT {
-  FMT_ASSERT(buffer != nullptr && buffer_size != 0, "invalid buffer");
-
-  class dispatcher {
-   private:
-    int error_code_;
-    char*& buffer_;
-    size_t buffer_size_;
-
-    // A noop assignment operator to avoid bogus warnings.
-    void operator=(const dispatcher&) {}
-
-    // Handle the result of XSI-compliant version of strerror_r.
-    int handle(int result) {
-      // glibc versions before 2.13 return result in errno.
-      return result == -1 ? errno : result;
-    }
-
-    // Handle the result of GNU-specific version of strerror_r.
-    FMT_MAYBE_UNUSED
-    int handle(char* message) {
-      // If the buffer is full then the message is probably truncated.
-      if (message == buffer_ && strlen(buffer_) == buffer_size_ - 1)
-        return ERANGE;
-      buffer_ = message;
-      return 0;
-    }
-
-    // Handle the case when strerror_r is not available.
-    FMT_MAYBE_UNUSED
-    int handle(detail::null<>) {
-      return fallback(strerror_s(buffer_, buffer_size_, error_code_));
-    }
-
-    // Fallback to strerror_s when strerror_r is not available.
-    FMT_MAYBE_UNUSED
-    int fallback(int result) {
-      // If the buffer is full then the message is probably truncated.
-      return result == 0 && strlen(buffer_) == buffer_size_ - 1 ? ERANGE
-                                                                : result;
-    }
-
-#if !FMT_MSC_VER
-    // Fallback to strerror if strerror_r and strerror_s are not available.
-    int fallback(detail::null<>) {
-      errno = 0;
-      buffer_ = strerror(error_code_);
-      return errno;
-    }
-#endif
-
-   public:
-    dispatcher(int err_code, char*& buf, size_t buf_size)
-        : error_code_(err_code), buffer_(buf), buffer_size_(buf_size) {}
-
-    int run() { return handle(strerror_r(error_code_, buffer_, buffer_size_)); }
-  };
-  return dispatcher(error_code, buffer, buffer_size).run();
-}
-
 FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
                                 string_view message) FMT_NOEXCEPT {
   // Report error code making sure that the output fits into
@@ -145,18 +75,18 @@ FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
   error_code_size += detail::to_unsigned(detail::count_digits(abs_value));
   auto it = buffer_appender<char>(out);
   if (message.size() <= inline_buffer_size - error_code_size)
-    format_to(it, "{}{}", message, SEP);
-  format_to(it, "{}{}", ERROR_STR, error_code);
-  assert(out.size() <= inline_buffer_size);
+    format_to(it, FMT_STRING("{}{}"), message, SEP);
+  format_to(it, FMT_STRING("{}{}"), ERROR_STR, error_code);
+  FMT_ASSERT(out.size() <= inline_buffer_size, "");
 }
 
 FMT_FUNC void report_error(format_func func, int error_code,
-                           string_view message) FMT_NOEXCEPT {
+                           const char* message) FMT_NOEXCEPT {
   memory_buffer full_message;
   func(full_message, error_code, message);
   // Don't use fwrite_fully because the latter may throw.
-  (void)std::fwrite(full_message.data(), full_message.size(), 1, stderr);
-  std::fputc('\n', stderr);
+  if (std::fwrite(full_message.data(), full_message.size(), 1, stderr) > 0)
+    std::fputc('\n', stderr);
 }
 
 // A wrapper around fwrite that throws on error.
@@ -165,11 +95,8 @@ inline void fwrite_fully(const void* ptr, size_t size, size_t count,
   size_t written = std::fwrite(ptr, size, count, stream);
   if (written < count) FMT_THROW(system_error(errno, "cannot write to file"));
 }
-}  // namespace detail
-
-#if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
-namespace detail {
 
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
 template <typename Locale>
 locale_ref::locale_ref(const Locale& loc) : locale_(&loc) {
   static_assert(std::is_same<Locale, std::locale>::value, "");
@@ -180,41 +107,36 @@ template <typename Locale> Locale locale_ref::get() const {
   return locale_ ? *static_cast<const std::locale*>(locale_) : std::locale();
 }
 
-template <typename Char> FMT_FUNC std::string grouping_impl(locale_ref loc) {
-  return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>()).grouping();
-}
-template <typename Char> FMT_FUNC Char thousands_sep_impl(locale_ref loc) {
-  return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
-      .thousands_sep();
+template <typename Char>
+FMT_FUNC auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result<Char> {
+  auto& facet = std::use_facet<std::numpunct<Char>>(loc.get<std::locale>());
+  auto grouping = facet.grouping();
+  auto thousands_sep = grouping.empty() ? Char() : facet.thousands_sep();
+  return {std::move(grouping), thousands_sep};
 }
 template <typename Char> FMT_FUNC Char decimal_point_impl(locale_ref loc) {
   return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
       .decimal_point();
 }
-}  // namespace detail
 #else
 template <typename Char>
-FMT_FUNC std::string detail::grouping_impl(locale_ref) {
-  return "\03";
+FMT_FUNC auto thousands_sep_impl(locale_ref) -> thousands_sep_result<Char> {
+  return {"\03", FMT_STATIC_THOUSANDS_SEPARATOR};
 }
-template <typename Char> FMT_FUNC Char detail::thousands_sep_impl(locale_ref) {
-  return FMT_STATIC_THOUSANDS_SEPARATOR;
-}
-template <typename Char> FMT_FUNC Char detail::decimal_point_impl(locale_ref) {
+template <typename Char> FMT_FUNC Char decimal_point_impl(locale_ref) {
   return '.';
 }
 #endif
+}  // namespace detail
 
+#if !FMT_MSC_VER
 FMT_API FMT_FUNC format_error::~format_error() FMT_NOEXCEPT = default;
-FMT_API FMT_FUNC system_error::~system_error() FMT_NOEXCEPT = default;
+#endif
 
-FMT_FUNC void system_error::init(int err_code, string_view format_str,
-                                 format_args args) {
-  error_code_ = err_code;
-  memory_buffer buffer;
-  format_system_error(buffer, err_code, vformat(format_str, args));
-  std::runtime_error& base = *this;
-  base = std::runtime_error(to_string(buffer));
+FMT_FUNC std::system_error vsystem_error(int error_code, string_view format_str,
+                                         format_args args) {
+  auto ec = std::error_code(error_code, std::generic_category());
+  return std::system_error(ec, vformat(format_str, args));
 }
 
 namespace detail {
@@ -227,946 +149,153 @@ template <> FMT_FUNC int count_digits<4>(detail::fallback_uintptr n) {
   return i >= 0 ? i * char_digits + count_digits<4, unsigned>(n.value[i]) : 1;
 }
 
-template <typename T>
-const typename basic_data<T>::digit_pair basic_data<T>::digits[] = {
-    {'0', '0'}, {'0', '1'}, {'0', '2'}, {'0', '3'}, {'0', '4'}, {'0', '5'},
-    {'0', '6'}, {'0', '7'}, {'0', '8'}, {'0', '9'}, {'1', '0'}, {'1', '1'},
-    {'1', '2'}, {'1', '3'}, {'1', '4'}, {'1', '5'}, {'1', '6'}, {'1', '7'},
-    {'1', '8'}, {'1', '9'}, {'2', '0'}, {'2', '1'}, {'2', '2'}, {'2', '3'},
-    {'2', '4'}, {'2', '5'}, {'2', '6'}, {'2', '7'}, {'2', '8'}, {'2', '9'},
-    {'3', '0'}, {'3', '1'}, {'3', '2'}, {'3', '3'}, {'3', '4'}, {'3', '5'},
-    {'3', '6'}, {'3', '7'}, {'3', '8'}, {'3', '9'}, {'4', '0'}, {'4', '1'},
-    {'4', '2'}, {'4', '3'}, {'4', '4'}, {'4', '5'}, {'4', '6'}, {'4', '7'},
-    {'4', '8'}, {'4', '9'}, {'5', '0'}, {'5', '1'}, {'5', '2'}, {'5', '3'},
-    {'5', '4'}, {'5', '5'}, {'5', '6'}, {'5', '7'}, {'5', '8'}, {'5', '9'},
-    {'6', '0'}, {'6', '1'}, {'6', '2'}, {'6', '3'}, {'6', '4'}, {'6', '5'},
-    {'6', '6'}, {'6', '7'}, {'6', '8'}, {'6', '9'}, {'7', '0'}, {'7', '1'},
-    {'7', '2'}, {'7', '3'}, {'7', '4'}, {'7', '5'}, {'7', '6'}, {'7', '7'},
-    {'7', '8'}, {'7', '9'}, {'8', '0'}, {'8', '1'}, {'8', '2'}, {'8', '3'},
-    {'8', '4'}, {'8', '5'}, {'8', '6'}, {'8', '7'}, {'8', '8'}, {'8', '9'},
-    {'9', '0'}, {'9', '1'}, {'9', '2'}, {'9', '3'}, {'9', '4'}, {'9', '5'},
-    {'9', '6'}, {'9', '7'}, {'9', '8'}, {'9', '9'}};
+// log10(2) = 0x0.4d104d427de7fbcc...
+static constexpr uint64_t log10_2_significand = 0x4d104d427de7fbcc;
 
-template <typename T>
-const char basic_data<T>::hex_digits[] = "0123456789abcdef";
+template <typename T = void> struct basic_impl_data {
+  // Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340.
+  // These are generated by support/compute-powers.py.
+  static constexpr uint64_t pow10_significands[87] = {
+      0xfa8fd5a0081c0288, 0xbaaee17fa23ebf76, 0x8b16fb203055ac76,
+      0xcf42894a5dce35ea, 0x9a6bb0aa55653b2d, 0xe61acf033d1a45df,
+      0xab70fe17c79ac6ca, 0xff77b1fcbebcdc4f, 0xbe5691ef416bd60c,
+      0x8dd01fad907ffc3c, 0xd3515c2831559a83, 0x9d71ac8fada6c9b5,
+      0xea9c227723ee8bcb, 0xaecc49914078536d, 0x823c12795db6ce57,
+      0xc21094364dfb5637, 0x9096ea6f3848984f, 0xd77485cb25823ac7,
+      0xa086cfcd97bf97f4, 0xef340a98172aace5, 0xb23867fb2a35b28e,
+      0x84c8d4dfd2c63f3b, 0xc5dd44271ad3cdba, 0x936b9fcebb25c996,
+      0xdbac6c247d62a584, 0xa3ab66580d5fdaf6, 0xf3e2f893dec3f126,
+      0xb5b5ada8aaff80b8, 0x87625f056c7c4a8b, 0xc9bcff6034c13053,
+      0x964e858c91ba2655, 0xdff9772470297ebd, 0xa6dfbd9fb8e5b88f,
+      0xf8a95fcf88747d94, 0xb94470938fa89bcf, 0x8a08f0f8bf0f156b,
+      0xcdb02555653131b6, 0x993fe2c6d07b7fac, 0xe45c10c42a2b3b06,
+      0xaa242499697392d3, 0xfd87b5f28300ca0e, 0xbce5086492111aeb,
+      0x8cbccc096f5088cc, 0xd1b71758e219652c, 0x9c40000000000000,
+      0xe8d4a51000000000, 0xad78ebc5ac620000, 0x813f3978f8940984,
+      0xc097ce7bc90715b3, 0x8f7e32ce7bea5c70, 0xd5d238a4abe98068,
+      0x9f4f2726179a2245, 0xed63a231d4c4fb27, 0xb0de65388cc8ada8,
+      0x83c7088e1aab65db, 0xc45d1df942711d9a, 0x924d692ca61be758,
+      0xda01ee641a708dea, 0xa26da3999aef774a, 0xf209787bb47d6b85,
+      0xb454e4a179dd1877, 0x865b86925b9bc5c2, 0xc83553c5c8965d3d,
+      0x952ab45cfa97a0b3, 0xde469fbd99a05fe3, 0xa59bc234db398c25,
+      0xf6c69a72a3989f5c, 0xb7dcbf5354e9bece, 0x88fcf317f22241e2,
+      0xcc20ce9bd35c78a5, 0x98165af37b2153df, 0xe2a0b5dc971f303a,
+      0xa8d9d1535ce3b396, 0xfb9b7cd9a4a7443c, 0xbb764c4ca7a44410,
+      0x8bab8eefb6409c1a, 0xd01fef10a657842c, 0x9b10a4e5e9913129,
+      0xe7109bfba19c0c9d, 0xac2820d9623bf429, 0x80444b5e7aa7cf85,
+      0xbf21e44003acdd2d, 0x8e679c2f5e44ff8f, 0xd433179d9c8cb841,
+      0x9e19db92b4e31ba9, 0xeb96bf6ebadf77d9, 0xaf87023b9bf0ee6b,
+  };
 
-#define FMT_POWERS_OF_10(factor)                                             \
-  factor * 10, (factor)*100, (factor)*1000, (factor)*10000, (factor)*100000, \
-      (factor)*1000000, (factor)*10000000, (factor)*100000000,               \
-      (factor)*1000000000
-
-template <typename T>
-const uint64_t basic_data<T>::powers_of_10_64[] = {
-    1, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ULL),
-    10000000000000000000ULL};
-
-template <typename T>
-const uint32_t basic_data<T>::zero_or_powers_of_10_32[] = {0,
-                                                           FMT_POWERS_OF_10(1)};
-template <typename T>
-const uint64_t basic_data<T>::zero_or_powers_of_10_64[] = {
-    0, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ULL),
-    10000000000000000000ULL};
-
-template <typename T>
-const uint32_t basic_data<T>::zero_or_powers_of_10_32_new[] = {
-    0, 0, FMT_POWERS_OF_10(1)};
-
-template <typename T>
-const uint64_t basic_data<T>::zero_or_powers_of_10_64_new[] = {
-    0, 0, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ULL),
-    10000000000000000000ULL};
-
-// Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340.
-// These are generated by support/compute-powers.py.
-template <typename T>
-const uint64_t basic_data<T>::grisu_pow10_significands[] = {
-    0xfa8fd5a0081c0288, 0xbaaee17fa23ebf76, 0x8b16fb203055ac76,
-    0xcf42894a5dce35ea, 0x9a6bb0aa55653b2d, 0xe61acf033d1a45df,
-    0xab70fe17c79ac6ca, 0xff77b1fcbebcdc4f, 0xbe5691ef416bd60c,
-    0x8dd01fad907ffc3c, 0xd3515c2831559a83, 0x9d71ac8fada6c9b5,
-    0xea9c227723ee8bcb, 0xaecc49914078536d, 0x823c12795db6ce57,
-    0xc21094364dfb5637, 0x9096ea6f3848984f, 0xd77485cb25823ac7,
-    0xa086cfcd97bf97f4, 0xef340a98172aace5, 0xb23867fb2a35b28e,
-    0x84c8d4dfd2c63f3b, 0xc5dd44271ad3cdba, 0x936b9fcebb25c996,
-    0xdbac6c247d62a584, 0xa3ab66580d5fdaf6, 0xf3e2f893dec3f126,
-    0xb5b5ada8aaff80b8, 0x87625f056c7c4a8b, 0xc9bcff6034c13053,
-    0x964e858c91ba2655, 0xdff9772470297ebd, 0xa6dfbd9fb8e5b88f,
-    0xf8a95fcf88747d94, 0xb94470938fa89bcf, 0x8a08f0f8bf0f156b,
-    0xcdb02555653131b6, 0x993fe2c6d07b7fac, 0xe45c10c42a2b3b06,
-    0xaa242499697392d3, 0xfd87b5f28300ca0e, 0xbce5086492111aeb,
-    0x8cbccc096f5088cc, 0xd1b71758e219652c, 0x9c40000000000000,
-    0xe8d4a51000000000, 0xad78ebc5ac620000, 0x813f3978f8940984,
-    0xc097ce7bc90715b3, 0x8f7e32ce7bea5c70, 0xd5d238a4abe98068,
-    0x9f4f2726179a2245, 0xed63a231d4c4fb27, 0xb0de65388cc8ada8,
-    0x83c7088e1aab65db, 0xc45d1df942711d9a, 0x924d692ca61be758,
-    0xda01ee641a708dea, 0xa26da3999aef774a, 0xf209787bb47d6b85,
-    0xb454e4a179dd1877, 0x865b86925b9bc5c2, 0xc83553c5c8965d3d,
-    0x952ab45cfa97a0b3, 0xde469fbd99a05fe3, 0xa59bc234db398c25,
-    0xf6c69a72a3989f5c, 0xb7dcbf5354e9bece, 0x88fcf317f22241e2,
-    0xcc20ce9bd35c78a5, 0x98165af37b2153df, 0xe2a0b5dc971f303a,
-    0xa8d9d1535ce3b396, 0xfb9b7cd9a4a7443c, 0xbb764c4ca7a44410,
-    0x8bab8eefb6409c1a, 0xd01fef10a657842c, 0x9b10a4e5e9913129,
-    0xe7109bfba19c0c9d, 0xac2820d9623bf429, 0x80444b5e7aa7cf85,
-    0xbf21e44003acdd2d, 0x8e679c2f5e44ff8f, 0xd433179d9c8cb841,
-    0x9e19db92b4e31ba9, 0xeb96bf6ebadf77d9, 0xaf87023b9bf0ee6b,
-};
-
-// Binary exponents of pow(10, k), for k = -348, -340, ..., 340, corresponding
-// to significands above.
-template <typename T>
-const int16_t basic_data<T>::grisu_pow10_exponents[] = {
-    -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007, -980, -954,
-    -927,  -901,  -874,  -847,  -821,  -794,  -768,  -741,  -715,  -688, -661,
-    -635,  -608,  -582,  -555,  -529,  -502,  -475,  -449,  -422,  -396, -369,
-    -343,  -316,  -289,  -263,  -236,  -210,  -183,  -157,  -130,  -103, -77,
-    -50,   -24,   3,     30,    56,    83,    109,   136,   162,   189,  216,
-    242,   269,   295,   322,   348,   375,   402,   428,   455,   481,  508,
-    534,   561,   588,   614,   641,   667,   694,   720,   747,   774,  800,
-    827,   853,   880,   907,   933,   960,   986,   1013,  1039,  1066};
-
-template <typename T>
-const divtest_table_entry<uint32_t> basic_data<T>::divtest_table_for_pow5_32[] =
-    {{0x00000001, 0xffffffff}, {0xcccccccd, 0x33333333},
-     {0xc28f5c29, 0x0a3d70a3}, {0x26e978d5, 0x020c49ba},
-     {0x3afb7e91, 0x0068db8b}, {0x0bcbe61d, 0x0014f8b5},
-     {0x68c26139, 0x000431bd}, {0xae8d46a5, 0x0000d6bf},
-     {0x22e90e21, 0x00002af3}, {0x3a2e9c6d, 0x00000897},
-     {0x3ed61f49, 0x000001b7}};
-
-template <typename T>
-const divtest_table_entry<uint64_t> basic_data<T>::divtest_table_for_pow5_64[] =
-    {{0x0000000000000001, 0xffffffffffffffff},
-     {0xcccccccccccccccd, 0x3333333333333333},
-     {0x8f5c28f5c28f5c29, 0x0a3d70a3d70a3d70},
-     {0x1cac083126e978d5, 0x020c49ba5e353f7c},
-     {0xd288ce703afb7e91, 0x0068db8bac710cb2},
-     {0x5d4e8fb00bcbe61d, 0x0014f8b588e368f0},
-     {0x790fb65668c26139, 0x000431bde82d7b63},
-     {0xe5032477ae8d46a5, 0x0000d6bf94d5e57a},
-     {0xc767074b22e90e21, 0x00002af31dc46118},
-     {0x8e47ce423a2e9c6d, 0x0000089705f4136b},
-     {0x4fa7f60d3ed61f49, 0x000001b7cdfd9d7b},
-     {0x0fee64690c913975, 0x00000057f5ff85e5},
-     {0x3662e0e1cf503eb1, 0x000000119799812d},
-     {0xa47a2cf9f6433fbd, 0x0000000384b84d09},
-     {0x54186f653140a659, 0x00000000b424dc35},
-     {0x7738164770402145, 0x0000000024075f3d},
-     {0xe4a4d1417cd9a041, 0x000000000734aca5},
-     {0xc75429d9e5c5200d, 0x000000000170ef54},
-     {0xc1773b91fac10669, 0x000000000049c977},
-     {0x26b172506559ce15, 0x00000000000ec1e4},
-     {0xd489e3a9addec2d1, 0x000000000002f394},
-     {0x90e860bb892c8d5d, 0x000000000000971d},
-     {0x502e79bf1b6f4f79, 0x0000000000001e39},
-     {0xdcd618596be30fe5, 0x000000000000060b}};
-
-template <typename T>
-const uint64_t basic_data<T>::dragonbox_pow10_significands_64[] = {
-    0x81ceb32c4b43fcf5, 0xa2425ff75e14fc32, 0xcad2f7f5359a3b3f,
-    0xfd87b5f28300ca0e, 0x9e74d1b791e07e49, 0xc612062576589ddb,
-    0xf79687aed3eec552, 0x9abe14cd44753b53, 0xc16d9a0095928a28,
-    0xf1c90080baf72cb2, 0x971da05074da7bef, 0xbce5086492111aeb,
-    0xec1e4a7db69561a6, 0x9392ee8e921d5d08, 0xb877aa3236a4b44a,
-    0xe69594bec44de15c, 0x901d7cf73ab0acda, 0xb424dc35095cd810,
-    0xe12e13424bb40e14, 0x8cbccc096f5088cc, 0xafebff0bcb24aaff,
-    0xdbe6fecebdedd5bf, 0x89705f4136b4a598, 0xabcc77118461cefd,
-    0xd6bf94d5e57a42bd, 0x8637bd05af6c69b6, 0xa7c5ac471b478424,
-    0xd1b71758e219652c, 0x83126e978d4fdf3c, 0xa3d70a3d70a3d70b,
-    0xcccccccccccccccd, 0x8000000000000000, 0xa000000000000000,
-    0xc800000000000000, 0xfa00000000000000, 0x9c40000000000000,
-    0xc350000000000000, 0xf424000000000000, 0x9896800000000000,
-    0xbebc200000000000, 0xee6b280000000000, 0x9502f90000000000,
-    0xba43b74000000000, 0xe8d4a51000000000, 0x9184e72a00000000,
-    0xb5e620f480000000, 0xe35fa931a0000000, 0x8e1bc9bf04000000,
-    0xb1a2bc2ec5000000, 0xde0b6b3a76400000, 0x8ac7230489e80000,
-    0xad78ebc5ac620000, 0xd8d726b7177a8000, 0x878678326eac9000,
-    0xa968163f0a57b400, 0xd3c21bcecceda100, 0x84595161401484a0,
-    0xa56fa5b99019a5c8, 0xcecb8f27f4200f3a, 0x813f3978f8940984,
-    0xa18f07d736b90be5, 0xc9f2c9cd04674ede, 0xfc6f7c4045812296,
-    0x9dc5ada82b70b59d, 0xc5371912364ce305, 0xf684df56c3e01bc6,
-    0x9a130b963a6c115c, 0xc097ce7bc90715b3, 0xf0bdc21abb48db20,
-    0x96769950b50d88f4, 0xbc143fa4e250eb31, 0xeb194f8e1ae525fd,
-    0x92efd1b8d0cf37be, 0xb7abc627050305ad, 0xe596b7b0c643c719,
-    0x8f7e32ce7bea5c6f, 0xb35dbf821ae4f38b, 0xe0352f62a19e306e};
-
-template <typename T>
-const uint128_wrapper basic_data<T>::dragonbox_pow10_significands_128[] = {
-#if FMT_USE_FULL_CACHE_DRAGONBOX
-    {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
-    {0x9faacf3df73609b1, 0x77b191618c54e9ad},
-    {0xc795830d75038c1d, 0xd59df5b9ef6a2418},
-    {0xf97ae3d0d2446f25, 0x4b0573286b44ad1e},
-    {0x9becce62836ac577, 0x4ee367f9430aec33},
-    {0xc2e801fb244576d5, 0x229c41f793cda740},
-    {0xf3a20279ed56d48a, 0x6b43527578c11110},
-    {0x9845418c345644d6, 0x830a13896b78aaaa},
-    {0xbe5691ef416bd60c, 0x23cc986bc656d554},
-    {0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa9},
-    {0x94b3a202eb1c3f39, 0x7bf7d71432f3d6aa},
-    {0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc54},
-    {0xe858ad248f5c22c9, 0xd1b3400f8f9cff69},
-    {0x91376c36d99995be, 0x23100809b9c21fa2},
-    {0xb58547448ffffb2d, 0xabd40a0c2832a78b},
-    {0xe2e69915b3fff9f9, 0x16c90c8f323f516d},
-    {0x8dd01fad907ffc3b, 0xae3da7d97f6792e4},
-    {0xb1442798f49ffb4a, 0x99cd11cfdf41779d},
-    {0xdd95317f31c7fa1d, 0x40405643d711d584},
-    {0x8a7d3eef7f1cfc52, 0x482835ea666b2573},
-    {0xad1c8eab5ee43b66, 0xda3243650005eed0},
-    {0xd863b256369d4a40, 0x90bed43e40076a83},
-    {0x873e4f75e2224e68, 0x5a7744a6e804a292},
-    {0xa90de3535aaae202, 0x711515d0a205cb37},
-    {0xd3515c2831559a83, 0x0d5a5b44ca873e04},
-    {0x8412d9991ed58091, 0xe858790afe9486c3},
-    {0xa5178fff668ae0b6, 0x626e974dbe39a873},
-    {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
-    {0x80fa687f881c7f8e, 0x7ce66634bc9d0b9a},
-    {0xa139029f6a239f72, 0x1c1fffc1ebc44e81},
-    {0xc987434744ac874e, 0xa327ffb266b56221},
-    {0xfbe9141915d7a922, 0x4bf1ff9f0062baa9},
-    {0x9d71ac8fada6c9b5, 0x6f773fc3603db4aa},
-    {0xc4ce17b399107c22, 0xcb550fb4384d21d4},
-    {0xf6019da07f549b2b, 0x7e2a53a146606a49},
-    {0x99c102844f94e0fb, 0x2eda7444cbfc426e},
-    {0xc0314325637a1939, 0xfa911155fefb5309},
-    {0xf03d93eebc589f88, 0x793555ab7eba27cb},
-    {0x96267c7535b763b5, 0x4bc1558b2f3458df},
-    {0xbbb01b9283253ca2, 0x9eb1aaedfb016f17},
-    {0xea9c227723ee8bcb, 0x465e15a979c1cadd},
-    {0x92a1958a7675175f, 0x0bfacd89ec191eca},
-    {0xb749faed14125d36, 0xcef980ec671f667c},
-    {0xe51c79a85916f484, 0x82b7e12780e7401b},
-    {0x8f31cc0937ae58d2, 0xd1b2ecb8b0908811},
-    {0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa16},
-    {0xdfbdcece67006ac9, 0x67a791e093e1d49b},
-    {0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e1},
-    {0xaecc49914078536d, 0x58fae9f773886e19},
-    {0xda7f5bf590966848, 0xaf39a475506a899f},
-    {0x888f99797a5e012d, 0x6d8406c952429604},
-    {0xaab37fd7d8f58178, 0xc8e5087ba6d33b84},
-    {0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a65},
-    {0x855c3be0a17fcd26, 0x5cf2eea09a550680},
-    {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f},
-    {0xd0601d8efc57b08b, 0xf13b94daf124da27},
-    {0x823c12795db6ce57, 0x76c53d08d6b70859},
-    {0xa2cb1717b52481ed, 0x54768c4b0c64ca6f},
-    {0xcb7ddcdda26da268, 0xa9942f5dcf7dfd0a},
-    {0xfe5d54150b090b02, 0xd3f93b35435d7c4d},
-    {0x9efa548d26e5a6e1, 0xc47bc5014a1a6db0},
-    {0xc6b8e9b0709f109a, 0x359ab6419ca1091c},
-    {0xf867241c8cc6d4c0, 0xc30163d203c94b63},
-    {0x9b407691d7fc44f8, 0x79e0de63425dcf1e},
-    {0xc21094364dfb5636, 0x985915fc12f542e5},
-    {0xf294b943e17a2bc4, 0x3e6f5b7b17b2939e},
-    {0x979cf3ca6cec5b5a, 0xa705992ceecf9c43},
-    {0xbd8430bd08277231, 0x50c6ff782a838354},
-    {0xece53cec4a314ebd, 0xa4f8bf5635246429},
-    {0x940f4613ae5ed136, 0x871b7795e136be9a},
-    {0xb913179899f68584, 0x28e2557b59846e40},
-    {0xe757dd7ec07426e5, 0x331aeada2fe589d0},
-    {0x9096ea6f3848984f, 0x3ff0d2c85def7622},
-    {0xb4bca50b065abe63, 0x0fed077a756b53aa},
-    {0xe1ebce4dc7f16dfb, 0xd3e8495912c62895},
-    {0x8d3360f09cf6e4bd, 0x64712dd7abbbd95d},
-    {0xb080392cc4349dec, 0xbd8d794d96aacfb4},
-    {0xdca04777f541c567, 0xecf0d7a0fc5583a1},
-    {0x89e42caaf9491b60, 0xf41686c49db57245},
-    {0xac5d37d5b79b6239, 0x311c2875c522ced6},
-    {0xd77485cb25823ac7, 0x7d633293366b828c},
-    {0x86a8d39ef77164bc, 0xae5dff9c02033198},
-    {0xa8530886b54dbdeb, 0xd9f57f830283fdfd},
-    {0xd267caa862a12d66, 0xd072df63c324fd7c},
-    {0x8380dea93da4bc60, 0x4247cb9e59f71e6e},
-    {0xa46116538d0deb78, 0x52d9be85f074e609},
-    {0xcd795be870516656, 0x67902e276c921f8c},
-    {0x806bd9714632dff6, 0x00ba1cd8a3db53b7},
-    {0xa086cfcd97bf97f3, 0x80e8a40eccd228a5},
-    {0xc8a883c0fdaf7df0, 0x6122cd128006b2ce},
-    {0xfad2a4b13d1b5d6c, 0x796b805720085f82},
-    {0x9cc3a6eec6311a63, 0xcbe3303674053bb1},
-    {0xc3f490aa77bd60fc, 0xbedbfc4411068a9d},
-    {0xf4f1b4d515acb93b, 0xee92fb5515482d45},
-    {0x991711052d8bf3c5, 0x751bdd152d4d1c4b},
-    {0xbf5cd54678eef0b6, 0xd262d45a78a0635e},
-    {0xef340a98172aace4, 0x86fb897116c87c35},
-    {0x9580869f0e7aac0e, 0xd45d35e6ae3d4da1},
-    {0xbae0a846d2195712, 0x8974836059cca10a},
-    {0xe998d258869facd7, 0x2bd1a438703fc94c},
-    {0x91ff83775423cc06, 0x7b6306a34627ddd0},
-    {0xb67f6455292cbf08, 0x1a3bc84c17b1d543},
-    {0xe41f3d6a7377eeca, 0x20caba5f1d9e4a94},
-    {0x8e938662882af53e, 0x547eb47b7282ee9d},
-    {0xb23867fb2a35b28d, 0xe99e619a4f23aa44},
-    {0xdec681f9f4c31f31, 0x6405fa00e2ec94d5},
-    {0x8b3c113c38f9f37e, 0xde83bc408dd3dd05},
-    {0xae0b158b4738705e, 0x9624ab50b148d446},
-    {0xd98ddaee19068c76, 0x3badd624dd9b0958},
-    {0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d7},
-    {0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4d},
-    {0xd47487cc8470652b, 0x7647c32000696720},
-    {0x84c8d4dfd2c63f3b, 0x29ecd9f40041e074},
-    {0xa5fb0a17c777cf09, 0xf468107100525891},
-    {0xcf79cc9db955c2cc, 0x7182148d4066eeb5},
-    {0x81ac1fe293d599bf, 0xc6f14cd848405531},
-    {0xa21727db38cb002f, 0xb8ada00e5a506a7d},
-    {0xca9cf1d206fdc03b, 0xa6d90811f0e4851d},
-    {0xfd442e4688bd304a, 0x908f4a166d1da664},
-    {0x9e4a9cec15763e2e, 0x9a598e4e043287ff},
-    {0xc5dd44271ad3cdba, 0x40eff1e1853f29fe},
-    {0xf7549530e188c128, 0xd12bee59e68ef47d},
-    {0x9a94dd3e8cf578b9, 0x82bb74f8301958cf},
-    {0xc13a148e3032d6e7, 0xe36a52363c1faf02},
-    {0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac2},
-    {0x96f5600f15a7b7e5, 0x29ab103a5ef8c0ba},
-    {0xbcb2b812db11a5de, 0x7415d448f6b6f0e8},
-    {0xebdf661791d60f56, 0x111b495b3464ad22},
-    {0x936b9fcebb25c995, 0xcab10dd900beec35},
-    {0xb84687c269ef3bfb, 0x3d5d514f40eea743},
-    {0xe65829b3046b0afa, 0x0cb4a5a3112a5113},
-    {0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ac},
-    {0xb3f4e093db73a093, 0x59ed216765690f57},
-    {0xe0f218b8d25088b8, 0x306869c13ec3532d},
-    {0x8c974f7383725573, 0x1e414218c73a13fc},
-    {0xafbd2350644eeacf, 0xe5d1929ef90898fb},
-    {0xdbac6c247d62a583, 0xdf45f746b74abf3a},
-    {0x894bc396ce5da772, 0x6b8bba8c328eb784},
-    {0xab9eb47c81f5114f, 0x066ea92f3f326565},
-    {0xd686619ba27255a2, 0xc80a537b0efefebe},
-    {0x8613fd0145877585, 0xbd06742ce95f5f37},
-    {0xa798fc4196e952e7, 0x2c48113823b73705},
-    {0xd17f3b51fca3a7a0, 0xf75a15862ca504c6},
-    {0x82ef85133de648c4, 0x9a984d73dbe722fc},
-    {0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebbb},
-    {0xcc963fee10b7d1b3, 0x318df905079926a9},
-    {0xffbbcfe994e5c61f, 0xfdf17746497f7053},
-    {0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa634},
-    {0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc1},
-    {0xf9bd690a1b68637b, 0x3dfdce7aa3c673b1},
-    {0x9c1661a651213e2d, 0x06bea10ca65c084f},
-    {0xc31bfa0fe5698db8, 0x486e494fcff30a63},
-    {0xf3e2f893dec3f126, 0x5a89dba3c3efccfb},
-    {0x986ddb5c6b3a76b7, 0xf89629465a75e01d},
-    {0xbe89523386091465, 0xf6bbb397f1135824},
-    {0xee2ba6c0678b597f, 0x746aa07ded582e2d},
-    {0x94db483840b717ef, 0xa8c2a44eb4571cdd},
-    {0xba121a4650e4ddeb, 0x92f34d62616ce414},
-    {0xe896a0d7e51e1566, 0x77b020baf9c81d18},
-    {0x915e2486ef32cd60, 0x0ace1474dc1d122f},
-    {0xb5b5ada8aaff80b8, 0x0d819992132456bb},
-    {0xe3231912d5bf60e6, 0x10e1fff697ed6c6a},
-    {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2},
-    {0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb3},
-    {0xddd0467c64bce4a0, 0xac7cb3f6d05ddbdf},
-    {0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96c},
-    {0xad4ab7112eb3929d, 0x86c16c98d2c953c7},
-    {0xd89d64d57a607744, 0xe871c7bf077ba8b8},
-    {0x87625f056c7c4a8b, 0x11471cd764ad4973},
-    {0xa93af6c6c79b5d2d, 0xd598e40d3dd89bd0},
-    {0xd389b47879823479, 0x4aff1d108d4ec2c4},
-    {0x843610cb4bf160cb, 0xcedf722a585139bb},
-    {0xa54394fe1eedb8fe, 0xc2974eb4ee658829},
-    {0xce947a3da6a9273e, 0x733d226229feea33},
-    {0x811ccc668829b887, 0x0806357d5a3f5260},
-    {0xa163ff802a3426a8, 0xca07c2dcb0cf26f8},
-    {0xc9bcff6034c13052, 0xfc89b393dd02f0b6},
-    {0xfc2c3f3841f17c67, 0xbbac2078d443ace3},
-    {0x9d9ba7832936edc0, 0xd54b944b84aa4c0e},
-    {0xc5029163f384a931, 0x0a9e795e65d4df12},
-    {0xf64335bcf065d37d, 0x4d4617b5ff4a16d6},
-    {0x99ea0196163fa42e, 0x504bced1bf8e4e46},
-    {0xc06481fb9bcf8d39, 0xe45ec2862f71e1d7},
-    {0xf07da27a82c37088, 0x5d767327bb4e5a4d},
-    {0x964e858c91ba2655, 0x3a6a07f8d510f870},
-    {0xbbe226efb628afea, 0x890489f70a55368c},
-    {0xeadab0aba3b2dbe5, 0x2b45ac74ccea842f},
-    {0x92c8ae6b464fc96f, 0x3b0b8bc90012929e},
-    {0xb77ada0617e3bbcb, 0x09ce6ebb40173745},
-    {0xe55990879ddcaabd, 0xcc420a6a101d0516},
-    {0x8f57fa54c2a9eab6, 0x9fa946824a12232e},
-    {0xb32df8e9f3546564, 0x47939822dc96abfa},
-    {0xdff9772470297ebd, 0x59787e2b93bc56f8},
-    {0x8bfbea76c619ef36, 0x57eb4edb3c55b65b},
-    {0xaefae51477a06b03, 0xede622920b6b23f2},
-    {0xdab99e59958885c4, 0xe95fab368e45ecee},
-    {0x88b402f7fd75539b, 0x11dbcb0218ebb415},
-    {0xaae103b5fcd2a881, 0xd652bdc29f26a11a},
-    {0xd59944a37c0752a2, 0x4be76d3346f04960},
-    {0x857fcae62d8493a5, 0x6f70a4400c562ddc},
-    {0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb953},
-    {0xd097ad07a71f26b2, 0x7e2000a41346a7a8},
-    {0x825ecc24c873782f, 0x8ed400668c0c28c9},
-    {0xa2f67f2dfa90563b, 0x728900802f0f32fb},
-    {0xcbb41ef979346bca, 0x4f2b40a03ad2ffba},
-    {0xfea126b7d78186bc, 0xe2f610c84987bfa9},
-    {0x9f24b832e6b0f436, 0x0dd9ca7d2df4d7ca},
-    {0xc6ede63fa05d3143, 0x91503d1c79720dbc},
-    {0xf8a95fcf88747d94, 0x75a44c6397ce912b},
-    {0x9b69dbe1b548ce7c, 0xc986afbe3ee11abb},
-    {0xc24452da229b021b, 0xfbe85badce996169},
-    {0xf2d56790ab41c2a2, 0xfae27299423fb9c4},
-    {0x97c560ba6b0919a5, 0xdccd879fc967d41b},
-    {0xbdb6b8e905cb600f, 0x5400e987bbc1c921},
-    {0xed246723473e3813, 0x290123e9aab23b69},
-    {0x9436c0760c86e30b, 0xf9a0b6720aaf6522},
-    {0xb94470938fa89bce, 0xf808e40e8d5b3e6a},
-    {0xe7958cb87392c2c2, 0xb60b1d1230b20e05},
-    {0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c3},
-    {0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af4},
-    {0xe2280b6c20dd5232, 0x25c6da63c38de1b1},
-    {0x8d590723948a535f, 0x579c487e5a38ad0f},
-    {0xb0af48ec79ace837, 0x2d835a9df0c6d852},
-    {0xdcdb1b2798182244, 0xf8e431456cf88e66},
-    {0x8a08f0f8bf0f156b, 0x1b8e9ecb641b5900},
-    {0xac8b2d36eed2dac5, 0xe272467e3d222f40},
-    {0xd7adf884aa879177, 0x5b0ed81dcc6abb10},
-    {0x86ccbb52ea94baea, 0x98e947129fc2b4ea},
-    {0xa87fea27a539e9a5, 0x3f2398d747b36225},
-    {0xd29fe4b18e88640e, 0x8eec7f0d19a03aae},
-    {0x83a3eeeef9153e89, 0x1953cf68300424ad},
-    {0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd8},
-    {0xcdb02555653131b6, 0x3792f412cb06794e},
-    {0x808e17555f3ebf11, 0xe2bbd88bbee40bd1},
-    {0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec5},
-    {0xc8de047564d20a8b, 0xf245825a5a445276},
-    {0xfb158592be068d2e, 0xeed6e2f0f0d56713},
-    {0x9ced737bb6c4183d, 0x55464dd69685606c},
-    {0xc428d05aa4751e4c, 0xaa97e14c3c26b887},
-    {0xf53304714d9265df, 0xd53dd99f4b3066a9},
-    {0x993fe2c6d07b7fab, 0xe546a8038efe402a},
-    {0xbf8fdb78849a5f96, 0xde98520472bdd034},
-    {0xef73d256a5c0f77c, 0x963e66858f6d4441},
-    {0x95a8637627989aad, 0xdde7001379a44aa9},
-    {0xbb127c53b17ec159, 0x5560c018580d5d53},
-    {0xe9d71b689dde71af, 0xaab8f01e6e10b4a7},
-    {0x9226712162ab070d, 0xcab3961304ca70e9},
-    {0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d23},
-    {0xe45c10c42a2b3b05, 0x8cb89a7db77c506b},
-    {0x8eb98a7a9a5b04e3, 0x77f3608e92adb243},
-    {0xb267ed1940f1c61c, 0x55f038b237591ed4},
-    {0xdf01e85f912e37a3, 0x6b6c46dec52f6689},
-    {0x8b61313bbabce2c6, 0x2323ac4b3b3da016},
-    {0xae397d8aa96c1b77, 0xabec975e0a0d081b},
-    {0xd9c7dced53c72255, 0x96e7bd358c904a22},
-    {0x881cea14545c7575, 0x7e50d64177da2e55},
-    {0xaa242499697392d2, 0xdde50bd1d5d0b9ea},
-    {0xd4ad2dbfc3d07787, 0x955e4ec64b44e865},
-    {0x84ec3c97da624ab4, 0xbd5af13bef0b113f},
-    {0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58f},
-    {0xcfb11ead453994ba, 0x67de18eda5814af3},
-    {0x81ceb32c4b43fcf4, 0x80eacf948770ced8},
-    {0xa2425ff75e14fc31, 0xa1258379a94d028e},
-    {0xcad2f7f5359a3b3e, 0x096ee45813a04331},
-    {0xfd87b5f28300ca0d, 0x8bca9d6e188853fd},
-    {0x9e74d1b791e07e48, 0x775ea264cf55347e},
-    {0xc612062576589dda, 0x95364afe032a819e},
-    {0xf79687aed3eec551, 0x3a83ddbd83f52205},
-    {0x9abe14cd44753b52, 0xc4926a9672793543},
-    {0xc16d9a0095928a27, 0x75b7053c0f178294},
-    {0xf1c90080baf72cb1, 0x5324c68b12dd6339},
-    {0x971da05074da7bee, 0xd3f6fc16ebca5e04},
-    {0xbce5086492111aea, 0x88f4bb1ca6bcf585},
-    {0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6},
-    {0x9392ee8e921d5d07, 0x3aff322e62439fd0},
-    {0xb877aa3236a4b449, 0x09befeb9fad487c3},
-    {0xe69594bec44de15b, 0x4c2ebe687989a9b4},
-    {0x901d7cf73ab0acd9, 0x0f9d37014bf60a11},
-    {0xb424dc35095cd80f, 0x538484c19ef38c95},
-    {0xe12e13424bb40e13, 0x2865a5f206b06fba},
-    {0x8cbccc096f5088cb, 0xf93f87b7442e45d4},
-    {0xafebff0bcb24aafe, 0xf78f69a51539d749},
-    {0xdbe6fecebdedd5be, 0xb573440e5a884d1c},
-    {0x89705f4136b4a597, 0x31680a88f8953031},
-    {0xabcc77118461cefc, 0xfdc20d2b36ba7c3e},
-    {0xd6bf94d5e57a42bc, 0x3d32907604691b4d},
-    {0x8637bd05af6c69b5, 0xa63f9a49c2c1b110},
-    {0xa7c5ac471b478423, 0x0fcf80dc33721d54},
-    {0xd1b71758e219652b, 0xd3c36113404ea4a9},
-    {0x83126e978d4fdf3b, 0x645a1cac083126ea},
-    {0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4},
-    {0xcccccccccccccccc, 0xcccccccccccccccd},
-    {0x8000000000000000, 0x0000000000000000},
-    {0xa000000000000000, 0x0000000000000000},
-    {0xc800000000000000, 0x0000000000000000},
-    {0xfa00000000000000, 0x0000000000000000},
-    {0x9c40000000000000, 0x0000000000000000},
-    {0xc350000000000000, 0x0000000000000000},
-    {0xf424000000000000, 0x0000000000000000},
-    {0x9896800000000000, 0x0000000000000000},
-    {0xbebc200000000000, 0x0000000000000000},
-    {0xee6b280000000000, 0x0000000000000000},
-    {0x9502f90000000000, 0x0000000000000000},
-    {0xba43b74000000000, 0x0000000000000000},
-    {0xe8d4a51000000000, 0x0000000000000000},
-    {0x9184e72a00000000, 0x0000000000000000},
-    {0xb5e620f480000000, 0x0000000000000000},
-    {0xe35fa931a0000000, 0x0000000000000000},
-    {0x8e1bc9bf04000000, 0x0000000000000000},
-    {0xb1a2bc2ec5000000, 0x0000000000000000},
-    {0xde0b6b3a76400000, 0x0000000000000000},
-    {0x8ac7230489e80000, 0x0000000000000000},
-    {0xad78ebc5ac620000, 0x0000000000000000},
-    {0xd8d726b7177a8000, 0x0000000000000000},
-    {0x878678326eac9000, 0x0000000000000000},
-    {0xa968163f0a57b400, 0x0000000000000000},
-    {0xd3c21bcecceda100, 0x0000000000000000},
-    {0x84595161401484a0, 0x0000000000000000},
-    {0xa56fa5b99019a5c8, 0x0000000000000000},
-    {0xcecb8f27f4200f3a, 0x0000000000000000},
-    {0x813f3978f8940984, 0x4000000000000000},
-    {0xa18f07d736b90be5, 0x5000000000000000},
-    {0xc9f2c9cd04674ede, 0xa400000000000000},
-    {0xfc6f7c4045812296, 0x4d00000000000000},
-    {0x9dc5ada82b70b59d, 0xf020000000000000},
-    {0xc5371912364ce305, 0x6c28000000000000},
-    {0xf684df56c3e01bc6, 0xc732000000000000},
-    {0x9a130b963a6c115c, 0x3c7f400000000000},
-    {0xc097ce7bc90715b3, 0x4b9f100000000000},
-    {0xf0bdc21abb48db20, 0x1e86d40000000000},
-    {0x96769950b50d88f4, 0x1314448000000000},
-    {0xbc143fa4e250eb31, 0x17d955a000000000},
-    {0xeb194f8e1ae525fd, 0x5dcfab0800000000},
-    {0x92efd1b8d0cf37be, 0x5aa1cae500000000},
-    {0xb7abc627050305ad, 0xf14a3d9e40000000},
-    {0xe596b7b0c643c719, 0x6d9ccd05d0000000},
-    {0x8f7e32ce7bea5c6f, 0xe4820023a2000000},
-    {0xb35dbf821ae4f38b, 0xdda2802c8a800000},
-    {0xe0352f62a19e306e, 0xd50b2037ad200000},
-    {0x8c213d9da502de45, 0x4526f422cc340000},
-    {0xaf298d050e4395d6, 0x9670b12b7f410000},
-    {0xdaf3f04651d47b4c, 0x3c0cdd765f114000},
-    {0x88d8762bf324cd0f, 0xa5880a69fb6ac800},
-    {0xab0e93b6efee0053, 0x8eea0d047a457a00},
-    {0xd5d238a4abe98068, 0x72a4904598d6d880},
-    {0x85a36366eb71f041, 0x47a6da2b7f864750},
-    {0xa70c3c40a64e6c51, 0x999090b65f67d924},
-    {0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d},
-    {0x82818f1281ed449f, 0xbff8f10e7a8921a4},
-    {0xa321f2d7226895c7, 0xaff72d52192b6a0d},
-    {0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490},
-    {0xfee50b7025c36a08, 0x02f236d04753d5b4},
-    {0x9f4f2726179a2245, 0x01d762422c946590},
-    {0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5},
-    {0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2},
-    {0x9b934c3b330c8577, 0x63cc55f49f88eb2f},
-    {0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb},
-    {0xf316271c7fc3908a, 0x8bef464e3945ef7a},
-    {0x97edd871cfda3a56, 0x97758bf0e3cbb5ac},
-    {0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317},
-    {0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd},
-    {0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a},
-    {0xb975d6b6ee39e436, 0xb3e2fd538e122b44},
-    {0xe7d34c64a9c85d44, 0x60dbbca87196b616},
-    {0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd},
-    {0xb51d13aea4a488dd, 0x6babab6398bdbe41},
-    {0xe264589a4dcdab14, 0xc696963c7eed2dd1},
-    {0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2},
-    {0xb0de65388cc8ada8, 0x3b25a55f43294bcb},
-    {0xdd15fe86affad912, 0x49ef0eb713f39ebe},
-    {0x8a2dbf142dfcc7ab, 0x6e3569326c784337},
-    {0xacb92ed9397bf996, 0x49c2c37f07965404},
-    {0xd7e77a8f87daf7fb, 0xdc33745ec97be906},
-    {0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3},
-    {0xa8acd7c0222311bc, 0xc40832ea0d68ce0c},
-    {0xd2d80db02aabd62b, 0xf50a3fa490c30190},
-    {0x83c7088e1aab65db, 0x792667c6da79e0fa},
-    {0xa4b8cab1a1563f52, 0x577001b891185938},
-    {0xcde6fd5e09abcf26, 0xed4c0226b55e6f86},
-    {0x80b05e5ac60b6178, 0x544f8158315b05b4},
-    {0xa0dc75f1778e39d6, 0x696361ae3db1c721},
-    {0xc913936dd571c84c, 0x03bc3a19cd1e38e9},
-    {0xfb5878494ace3a5f, 0x04ab48a04065c723},
-    {0x9d174b2dcec0e47b, 0x62eb0d64283f9c76},
-    {0xc45d1df942711d9a, 0x3ba5d0bd324f8394},
-    {0xf5746577930d6500, 0xca8f44ec7ee36479},
-    {0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb},
-    {0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e},
-    {0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e},
-    {0x95d04aee3b80ece5, 0xbba1f1d158724a12},
-    {0xbb445da9ca61281f, 0x2a8a6e45ae8edc97},
-    {0xea1575143cf97226, 0xf52d09d71a3293bd},
-    {0x924d692ca61be758, 0x593c2626705f9c56},
-    {0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c},
-    {0xe498f455c38b997a, 0x0b6dfb9c0f956447},
-    {0x8edf98b59a373fec, 0x4724bd4189bd5eac},
-    {0xb2977ee300c50fe7, 0x58edec91ec2cb657},
-    {0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed},
-    {0x8b865b215899f46c, 0xbd79e0d20082ee74},
-    {0xae67f1e9aec07187, 0xecd8590680a3aa11},
-    {0xda01ee641a708de9, 0xe80e6f4820cc9495},
-    {0x884134fe908658b2, 0x3109058d147fdcdd},
-    {0xaa51823e34a7eede, 0xbd4b46f0599fd415},
-    {0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a},
-    {0x850fadc09923329e, 0x03e2cf6bc604ddb0},
-    {0xa6539930bf6bff45, 0x84db8346b786151c},
-    {0xcfe87f7cef46ff16, 0xe612641865679a63},
-    {0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e},
-    {0xa26da3999aef7749, 0xe3be5e330f38f09d},
-    {0xcb090c8001ab551c, 0x5cadf5bfd3072cc5},
-    {0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6},
-    {0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa},
-    {0xc646d63501a1511d, 0xb281e1fd541501b8},
-    {0xf7d88bc24209a565, 0x1f225a7ca91a4226},
-    {0x9ae757596946075f, 0x3375788de9b06958},
-    {0xc1a12d2fc3978937, 0x0052d6b1641c83ae},
-    {0xf209787bb47d6b84, 0xc0678c5dbd23a49a},
-    {0x9745eb4d50ce6332, 0xf840b7ba963646e0},
-    {0xbd176620a501fbff, 0xb650e5a93bc3d898},
-    {0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe},
-    {0x93ba47c980e98cdf, 0xc66f336c36b10137},
-    {0xb8a8d9bbe123f017, 0xb80b0047445d4184},
-    {0xe6d3102ad96cec1d, 0xa60dc059157491e5},
-    {0x9043ea1ac7e41392, 0x87c89837ad68db2f},
-    {0xb454e4a179dd1877, 0x29babe4598c311fb},
-    {0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a},
-    {0x8ce2529e2734bb1d, 0x1899e4a65f58660c},
-    {0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f},
-    {0xdc21a1171d42645d, 0x76707543f4fa1f73},
-    {0x899504ae72497eba, 0x6a06494a791c53a8},
-    {0xabfa45da0edbde69, 0x0487db9d17636892},
-    {0xd6f8d7509292d603, 0x45a9d2845d3c42b6},
-    {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b2},
-    {0xa7f26836f282b732, 0x8e6cac7768d7141e},
-    {0xd1ef0244af2364ff, 0x3207d795430cd926},
-    {0x8335616aed761f1f, 0x7f44e6bd49e807b8},
-    {0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6},
-    {0xcd036837130890a1, 0x36dba887c37a8c0f},
-    {0x802221226be55a64, 0xc2494954da2c9789},
-    {0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c},
-    {0xc83553c5c8965d3d, 0x6f92829494e5acc7},
-    {0xfa42a8b73abbf48c, 0xcb772339ba1f17f9},
-    {0x9c69a97284b578d7, 0xff2a760414536efb},
-    {0xc38413cf25e2d70d, 0xfef5138519684aba},
-    {0xf46518c2ef5b8cd1, 0x7eb258665fc25d69},
-    {0x98bf2f79d5993802, 0xef2f773ffbd97a61},
-    {0xbeeefb584aff8603, 0xaafb550ffacfd8fa},
-    {0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38},
-    {0x952ab45cfa97a0b2, 0xdd945a747bf26183},
-    {0xba756174393d88df, 0x94f971119aeef9e4},
-    {0xe912b9d1478ceb17, 0x7a37cd5601aab85d},
-    {0x91abb422ccb812ee, 0xac62e055c10ab33a},
-    {0xb616a12b7fe617aa, 0x577b986b314d6009},
-    {0xe39c49765fdf9d94, 0xed5a7e85fda0b80b},
-    {0x8e41ade9fbebc27d, 0x14588f13be847307},
-    {0xb1d219647ae6b31c, 0x596eb2d8ae258fc8},
-    {0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb},
-    {0x8aec23d680043bee, 0x25de7bb9480d5854},
-    {0xada72ccc20054ae9, 0xaf561aa79a10ae6a},
-    {0xd910f7ff28069da4, 0x1b2ba1518094da04},
-    {0x87aa9aff79042286, 0x90fb44d2f05d0842},
-    {0xa99541bf57452b28, 0x353a1607ac744a53},
-    {0xd3fa922f2d1675f2, 0x42889b8997915ce8},
-    {0x847c9b5d7c2e09b7, 0x69956135febada11},
-    {0xa59bc234db398c25, 0x43fab9837e699095},
-    {0xcf02b2c21207ef2e, 0x94f967e45e03f4bb},
-    {0x8161afb94b44f57d, 0x1d1be0eebac278f5},
-    {0xa1ba1ba79e1632dc, 0x6462d92a69731732},
-    {0xca28a291859bbf93, 0x7d7b8f7503cfdcfe},
-    {0xfcb2cb35e702af78, 0x5cda735244c3d43e},
-    {0x9defbf01b061adab, 0x3a0888136afa64a7},
-    {0xc56baec21c7a1916, 0x088aaa1845b8fdd0},
-    {0xf6c69a72a3989f5b, 0x8aad549e57273d45},
-    {0x9a3c2087a63f6399, 0x36ac54e2f678864b},
-    {0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd},
-    {0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5},
-    {0x969eb7c47859e743, 0x9f644ae5a4b1b325},
-    {0xbc4665b596706114, 0x873d5d9f0dde1fee},
-    {0xeb57ff22fc0c7959, 0xa90cb506d155a7ea},
-    {0x9316ff75dd87cbd8, 0x09a7f12442d588f2},
-    {0xb7dcbf5354e9bece, 0x0c11ed6d538aeb2f},
-    {0xe5d3ef282a242e81, 0x8f1668c8a86da5fa},
-    {0x8fa475791a569d10, 0xf96e017d694487bc},
-    {0xb38d92d760ec4455, 0x37c981dcc395a9ac},
-    {0xe070f78d3927556a, 0x85bbe253f47b1417},
-    {0x8c469ab843b89562, 0x93956d7478ccec8e},
-    {0xaf58416654a6babb, 0x387ac8d1970027b2},
-    {0xdb2e51bfe9d0696a, 0x06997b05fcc0319e},
-    {0x88fcf317f22241e2, 0x441fece3bdf81f03},
-    {0xab3c2fddeeaad25a, 0xd527e81cad7626c3},
-    {0xd60b3bd56a5586f1, 0x8a71e223d8d3b074},
-    {0x85c7056562757456, 0xf6872d5667844e49},
-    {0xa738c6bebb12d16c, 0xb428f8ac016561db},
-    {0xd106f86e69d785c7, 0xe13336d701beba52},
-    {0x82a45b450226b39c, 0xecc0024661173473},
-    {0xa34d721642b06084, 0x27f002d7f95d0190},
-    {0xcc20ce9bd35c78a5, 0x31ec038df7b441f4},
-    {0xff290242c83396ce, 0x7e67047175a15271},
-    {0x9f79a169bd203e41, 0x0f0062c6e984d386},
-    {0xc75809c42c684dd1, 0x52c07b78a3e60868},
-    {0xf92e0c3537826145, 0xa7709a56ccdf8a82},
-    {0x9bbcc7a142b17ccb, 0x88a66076400bb691},
-    {0xc2abf989935ddbfe, 0x6acff893d00ea435},
-    {0xf356f7ebf83552fe, 0x0583f6b8c4124d43},
-    {0x98165af37b2153de, 0xc3727a337a8b704a},
-    {0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c},
-    {0xeda2ee1c7064130c, 0x1162def06f79df73},
-    {0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8},
-    {0xb9a74a0637ce2ee1, 0x6d953e2bd7173692},
-    {0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437},
-    {0x910ab1d4db9914a0, 0x1d9c9892400a22a2},
-    {0xb54d5e4a127f59c8, 0x2503beb6d00cab4b},
-    {0xe2a0b5dc971f303a, 0x2e44ae64840fd61d},
-    {0x8da471a9de737e24, 0x5ceaecfed289e5d2},
-    {0xb10d8e1456105dad, 0x7425a83e872c5f47},
-    {0xdd50f1996b947518, 0xd12f124e28f77719},
-    {0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f},
-    {0xace73cbfdc0bfb7b, 0x636cc64d1001550b},
-    {0xd8210befd30efa5a, 0x3c47f7e05401aa4e},
-    {0x8714a775e3e95c78, 0x65acfaec34810a71},
-    {0xa8d9d1535ce3b396, 0x7f1839a741a14d0d},
-    {0xd31045a8341ca07c, 0x1ede48111209a050},
-    {0x83ea2b892091e44d, 0x934aed0aab460432},
-    {0xa4e4b66b68b65d60, 0xf81da84d5617853f},
-    {0xce1de40642e3f4b9, 0x36251260ab9d668e},
-    {0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019},
-    {0xa1075a24e4421730, 0xb24cf65b8612f81f},
-    {0xc94930ae1d529cfc, 0xdee033f26797b627},
-    {0xfb9b7cd9a4a7443c, 0x169840ef017da3b1},
-    {0x9d412e0806e88aa5, 0x8e1f289560ee864e},
-    {0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2},
-    {0xf5b5d7ec8acb58a2, 0xae10af696774b1db},
-    {0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29},
-    {0xbff610b0cc6edd3f, 0x17fd090a58d32af3},
-    {0xeff394dcff8a948e, 0xddfc4b4cef07f5b0},
-    {0x95f83d0a1fb69cd9, 0x4abdaf101564f98e},
-    {0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1},
-    {0xea53df5fd18d5513, 0x84c86189216dc5ed},
-    {0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4},
-    {0xb7118682dbb66a77, 0x3fbc8c33221dc2a1},
-    {0xe4d5e82392a40515, 0x0fabaf3feaa5334a},
-    {0x8f05b1163ba6832d, 0x29cb4d87f2a7400e},
-    {0xb2c71d5bca9023f8, 0x743e20e9ef511012},
-    {0xdf78e4b2bd342cf6, 0x914da9246b255416},
-    {0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e},
-    {0xae9672aba3d0c320, 0xa184ac2473b529b1},
-    {0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e},
-    {0x8865899617fb1871, 0x7e2fa67c7a658892},
-    {0xaa7eebfb9df9de8d, 0xddbb901b98feeab7},
-    {0xd51ea6fa85785631, 0x552a74227f3ea565},
-    {0x8533285c936b35de, 0xd53a88958f87275f},
-    {0xa67ff273b8460356, 0x8a892abaf368f137},
-    {0xd01fef10a657842c, 0x2d2b7569b0432d85},
-    {0x8213f56a67f6b29b, 0x9c3b29620e29fc73},
-    {0xa298f2c501f45f42, 0x8349f3ba91b47b8f},
-    {0xcb3f2f7642717713, 0x241c70a936219a73},
-    {0xfe0efb53d30dd4d7, 0xed238cd383aa0110},
-    {0x9ec95d1463e8a506, 0xf4363804324a40aa},
-    {0xc67bb4597ce2ce48, 0xb143c6053edcd0d5},
-    {0xf81aa16fdc1b81da, 0xdd94b7868e94050a},
-    {0x9b10a4e5e9913128, 0xca7cf2b4191c8326},
-    {0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0},
-    {0xf24a01a73cf2dccf, 0xbc633b39673c8cec},
-    {0x976e41088617ca01, 0xd5be0503e085d813},
-    {0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18},
-    {0xec9c459d51852ba2, 0xddf8e7d60ed1219e},
-    {0x93e1ab8252f33b45, 0xcabb90e5c942b503},
-    {0xb8da1662e7b00a17, 0x3d6a751f3b936243},
-    {0xe7109bfba19c0c9d, 0x0cc512670a783ad4},
-    {0x906a617d450187e2, 0x27fb2b80668b24c5},
-    {0xb484f9dc9641e9da, 0xb1f9f660802dedf6},
-    {0xe1a63853bbd26451, 0x5e7873f8a0396973},
-    {0x8d07e33455637eb2, 0xdb0b487b6423e1e8},
-    {0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62},
-    {0xdc5c5301c56b75f7, 0x7641a140cc7810fb},
-    {0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d},
-    {0xac2820d9623bf429, 0x546345fa9fbdcd44},
-    {0xd732290fbacaf133, 0xa97c177947ad4095},
-    {0x867f59a9d4bed6c0, 0x49ed8eabcccc485d},
-    {0xa81f301449ee8c70, 0x5c68f256bfff5a74},
-    {0xd226fc195c6a2f8c, 0x73832eec6fff3111},
-    {0x83585d8fd9c25db7, 0xc831fd53c5ff7eab},
-    {0xa42e74f3d032f525, 0xba3e7ca8b77f5e55},
-    {0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb},
-    {0x80444b5e7aa7cf85, 0x7980d163cf5b81b3},
-    {0xa0555e361951c366, 0xd7e105bcc332621f},
-    {0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7},
-    {0xfa856334878fc150, 0xb14f98f6f0feb951},
-    {0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3},
-    {0xc3b8358109e84f07, 0x0a862f80ec4700c8},
-    {0xf4a642e14c6262c8, 0xcd27bb612758c0fa},
-    {0x98e7e9cccfbd7dbd, 0x8038d51cb897789c},
-    {0xbf21e44003acdd2c, 0xe0470a63e6bd56c3},
-    {0xeeea5d5004981478, 0x1858ccfce06cac74},
-    {0x95527a5202df0ccb, 0x0f37801e0c43ebc8},
-    {0xbaa718e68396cffd, 0xd30560258f54e6ba},
-    {0xe950df20247c83fd, 0x47c6b82ef32a2069},
-    {0x91d28b7416cdd27e, 0x4cdc331d57fa5441},
-    {0xb6472e511c81471d, 0xe0133fe4adf8e952},
-    {0xe3d8f9e563a198e5, 0x58180fddd97723a6},
-    {0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648},
-    {0xb201833b35d63f73, 0x2cd2cc6551e513da},
-    {0xde81e40a034bcf4f, 0xf8077f7ea65e58d1},
-    {0x8b112e86420f6191, 0xfb04afaf27faf782},
-    {0xadd57a27d29339f6, 0x79c5db9af1f9b563},
-    {0xd94ad8b1c7380874, 0x18375281ae7822bc},
-    {0x87cec76f1c830548, 0x8f2293910d0b15b5},
-    {0xa9c2794ae3a3c69a, 0xb2eb3875504ddb22},
-    {0xd433179d9c8cb841, 0x5fa60692a46151eb},
-    {0x849feec281d7f328, 0xdbc7c41ba6bcd333},
-    {0xa5c7ea73224deff3, 0x12b9b522906c0800},
-    {0xcf39e50feae16bef, 0xd768226b34870a00},
-    {0x81842f29f2cce375, 0xe6a1158300d46640},
-    {0xa1e53af46f801c53, 0x60495ae3c1097fd0},
-    {0xca5e89b18b602368, 0x385bb19cb14bdfc4},
-    {0xfcf62c1dee382c42, 0x46729e03dd9ed7b5},
-    {0x9e19db92b4e31ba9, 0x6c07a2c26a8346d1},
-    {0xc5a05277621be293, 0xc7098b7305241885},
-    {0xf70867153aa2db38, 0xb8cbee4fc66d1ea7}
-#else
-    {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
-    {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
-    {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f},
-    {0x86a8d39ef77164bc, 0xae5dff9c02033198},
-    {0xd98ddaee19068c76, 0x3badd624dd9b0958},
-    {0xafbd2350644eeacf, 0xe5d1929ef90898fb},
-    {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2},
-    {0xe55990879ddcaabd, 0xcc420a6a101d0516},
-    {0xb94470938fa89bce, 0xf808e40e8d5b3e6a},
-    {0x95a8637627989aad, 0xdde7001379a44aa9},
-    {0xf1c90080baf72cb1, 0x5324c68b12dd6339},
-    {0xc350000000000000, 0x0000000000000000},
-    {0x9dc5ada82b70b59d, 0xf020000000000000},
-    {0xfee50b7025c36a08, 0x02f236d04753d5b4},
-    {0xcde6fd5e09abcf26, 0xed4c0226b55e6f86},
-    {0xa6539930bf6bff45, 0x84db8346b786151c},
-    {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b2},
-    {0xd910f7ff28069da4, 0x1b2ba1518094da04},
-    {0xaf58416654a6babb, 0x387ac8d1970027b2},
-    {0x8da471a9de737e24, 0x5ceaecfed289e5d2},
-    {0xe4d5e82392a40515, 0x0fabaf3feaa5334a},
-    {0xb8da1662e7b00a17, 0x3d6a751f3b936243},
-    {0x95527a5202df0ccb, 0x0f37801e0c43ebc8}
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wnarrowing"
 #endif
-};
-
-#if !FMT_USE_FULL_CACHE_DRAGONBOX
-template <typename T>
-const uint64_t basic_data<T>::powers_of_5_64[] = {
-    0x0000000000000001, 0x0000000000000005, 0x0000000000000019,
-    0x000000000000007d, 0x0000000000000271, 0x0000000000000c35,
-    0x0000000000003d09, 0x000000000001312d, 0x000000000005f5e1,
-    0x00000000001dcd65, 0x00000000009502f9, 0x0000000002e90edd,
-    0x000000000e8d4a51, 0x0000000048c27395, 0x000000016bcc41e9,
-    0x000000071afd498d, 0x0000002386f26fc1, 0x000000b1a2bc2ec5,
-    0x000003782dace9d9, 0x00001158e460913d, 0x000056bc75e2d631,
-    0x0001b1ae4d6e2ef5, 0x000878678326eac9, 0x002a5a058fc295ed,
-    0x00d3c21bcecceda1, 0x0422ca8b0a00a425, 0x14adf4b7320334b9};
-
-template <typename T>
-const uint32_t basic_data<T>::dragonbox_pow10_recovery_errors[] = {
-    0x50001400, 0x54044100, 0x54014555, 0x55954415, 0x54115555, 0x00000001,
-    0x50000000, 0x00104000, 0x54010004, 0x05004001, 0x55555544, 0x41545555,
-    0x54040551, 0x15445545, 0x51555514, 0x10000015, 0x00101100, 0x01100015,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x04450514, 0x45414110,
-    0x55555145, 0x50544050, 0x15040155, 0x11054140, 0x50111514, 0x11451454,
-    0x00400541, 0x00000000, 0x55555450, 0x10056551, 0x10054011, 0x55551014,
-    0x69514555, 0x05151109, 0x00155555};
+  // Binary exponents of pow(10, k), for k = -348, -340, ..., 340, corresponding
+  // to significands above.
+  static constexpr int16_t pow10_exponents[87] = {
+      -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007, -980, -954,
+      -927,  -901,  -874,  -847,  -821,  -794,  -768,  -741,  -715,  -688, -661,
+      -635,  -608,  -582,  -555,  -529,  -502,  -475,  -449,  -422,  -396, -369,
+      -343,  -316,  -289,  -263,  -236,  -210,  -183,  -157,  -130,  -103, -77,
+      -50,   -24,   3,     30,    56,    83,    109,   136,   162,   189,  216,
+      242,   269,   295,   322,   348,   375,   402,   428,   455,   481,  508,
+      534,   561,   588,   614,   641,   667,   694,   720,   747,   774,  800,
+      827,   853,   880,   907,   933,   960,   986,   1013,  1039,  1066};
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+#  pragma GCC diagnostic pop
 #endif
 
+  static constexpr uint64_t power_of_10_64[20] = {
+      1, FMT_POWERS_OF_10(1ULL), FMT_POWERS_OF_10(1000000000ULL),
+      10000000000000000000ULL};
+};
+
+// This is a struct rather than an alias to avoid shadowing warnings in gcc.
+struct impl_data : basic_impl_data<> {};
+
+#if __cplusplus < 201703L
 template <typename T>
-const char basic_data<T>::foreground_color[] = "\x1b[38;2;";
-template <typename T>
-const char basic_data<T>::background_color[] = "\x1b[48;2;";
-template <typename T> const char basic_data<T>::reset_color[] = "\x1b[0m";
-template <typename T> const wchar_t basic_data<T>::wreset_color[] = L"\x1b[0m";
-template <typename T> const char basic_data<T>::signs[] = {0, '-', '+', ' '};
-template <typename T>
-const char basic_data<T>::left_padding_shifts[] = {31, 31, 0, 1, 0};
-template <typename T>
-const char basic_data<T>::right_padding_shifts[] = {0, 31, 0, 1, 0};
+constexpr uint64_t basic_impl_data<T>::pow10_significands[];
+template <typename T> constexpr int16_t basic_impl_data<T>::pow10_exponents[];
+template <typename T> constexpr uint64_t basic_impl_data<T>::power_of_10_64[];
+#endif
 
 template <typename T> struct bits {
   static FMT_CONSTEXPR_DECL const int value =
       static_cast<int>(sizeof(T) * std::numeric_limits<unsigned char>::digits);
 };
 
-class fp;
-template <int SHIFT = 0> fp normalize(fp value);
-
-// Lower (upper) boundary is a value half way between a floating-point value
-// and its predecessor (successor). Boundaries have the same exponent as the
-// value so only significands are stored.
-struct boundaries {
-  uint64_t lower;
-  uint64_t upper;
-};
-
-// A handmade floating-point number f * pow(2, e).
-class fp {
- private:
-  using significand_type = uint64_t;
-
-  template <typename Float>
-  using is_supported_float = bool_constant<sizeof(Float) == sizeof(uint64_t) ||
-                                           sizeof(Float) == sizeof(uint32_t)>;
-
- public:
-  significand_type f;
-  int e;
-
-  // All sizes are in bits.
+// Returns the number of significand bits in Float excluding the implicit bit.
+template <typename Float> constexpr int num_significand_bits() {
   // Subtract 1 to account for an implicit most significant bit in the
   // normalized form.
-  static FMT_CONSTEXPR_DECL const int double_significand_size =
-      std::numeric_limits<double>::digits - 1;
-  static FMT_CONSTEXPR_DECL const uint64_t implicit_bit =
-      1ULL << double_significand_size;
-  static FMT_CONSTEXPR_DECL const int significand_size =
-      bits<significand_type>::value;
+  return std::numeric_limits<Float>::digits - 1;
+}
 
-  fp() : f(0), e(0) {}
-  fp(uint64_t f_val, int e_val) : f(f_val), e(e_val) {}
+// A floating-point number f * pow(2, e).
+struct fp {
+  uint64_t f;
+  int e;
 
-  // Constructs fp from an IEEE754 double. It is a template to prevent compile
-  // errors on platforms where double is not IEEE754.
-  template <typename Double> explicit fp(Double d) { assign(d); }
+  static constexpr const int num_significand_bits = bits<decltype(f)>::value;
+
+  constexpr fp() : f(0), e(0) {}
+  constexpr fp(uint64_t f_val, int e_val) : f(f_val), e(e_val) {}
+
+  // Constructs fp from an IEEE754 floating-point number. It is a template to
+  // prevent compile errors on systems where n is not IEEE754.
+  template <typename Float> explicit FMT_CONSTEXPR fp(Float n) { assign(n); }
+
+  template <typename Float>
+  using is_supported = bool_constant<sizeof(Float) == sizeof(uint64_t) ||
+                                     sizeof(Float) == sizeof(uint32_t)>;
 
   // Assigns d to this and return true iff predecessor is closer than successor.
-  template <typename Float, FMT_ENABLE_IF(is_supported_float<Float>::value)>
-  bool assign(Float d) {
+  template <typename Float, FMT_ENABLE_IF(is_supported<Float>::value)>
+  FMT_CONSTEXPR bool assign(Float n) {
     // Assume float is in the format [sign][exponent][significand].
-    using limits = std::numeric_limits<Float>;
-    const int float_significand_size = limits::digits - 1;
-    const int exponent_size =
-        bits<Float>::value - float_significand_size - 1;  // -1 for sign
-    const uint64_t float_implicit_bit = 1ULL << float_significand_size;
-    const uint64_t significand_mask = float_implicit_bit - 1;
-    const uint64_t exponent_mask = (~0ULL >> 1) & ~significand_mask;
-    const int exponent_bias = (1 << exponent_size) - limits::max_exponent - 1;
+    const int num_float_significand_bits =
+        detail::num_significand_bits<Float>();
+    const uint64_t implicit_bit = 1ULL << num_float_significand_bits;
+    const uint64_t significand_mask = implicit_bit - 1;
     constexpr bool is_double = sizeof(Float) == sizeof(uint64_t);
-    auto u = bit_cast<conditional_t<is_double, uint64_t, uint32_t>>(d);
+    auto u = bit_cast<conditional_t<is_double, uint64_t, uint32_t>>(n);
     f = u & significand_mask;
+    const uint64_t exponent_mask = (~0ULL >> 1) & ~significand_mask;
     int biased_e =
-        static_cast<int>((u & exponent_mask) >> float_significand_size);
-    // Predecessor is closer if d is a normalized power of 2 (f == 0) other than
-    // the smallest normalized number (biased_e > 1).
+        static_cast<int>((u & exponent_mask) >> num_float_significand_bits);
+    // The predecessor is closer if n is a normalized power of 2 (f == 0) other
+    // than the smallest normalized number (biased_e > 1).
     bool is_predecessor_closer = f == 0 && biased_e > 1;
     if (biased_e != 0)
-      f += float_implicit_bit;
+      f += implicit_bit;
     else
       biased_e = 1;  // Subnormals use biased exponent 1 (min exponent).
-    e = biased_e - exponent_bias - float_significand_size;
+    const int exponent_bias = std::numeric_limits<Float>::max_exponent - 1;
+    e = biased_e - exponent_bias - num_float_significand_bits;
     return is_predecessor_closer;
   }
 
-  template <typename Float, FMT_ENABLE_IF(!is_supported_float<Float>::value)>
+  template <typename Float, FMT_ENABLE_IF(!is_supported<Float>::value)>
   bool assign(Float) {
-    *this = fp();
+    FMT_ASSERT(false, "");
     return false;
   }
 };
 
 // Normalizes the value converted from double and multiplied by (1 << SHIFT).
-template <int SHIFT> fp normalize(fp value) {
+template <int SHIFT = 0> FMT_CONSTEXPR fp normalize(fp value) {
   // Handle subnormals.
-  const auto shifted_implicit_bit = fp::implicit_bit << SHIFT;
+  const uint64_t implicit_bit = 1ULL << num_significand_bits<double>();
+  const auto shifted_implicit_bit = implicit_bit << SHIFT;
   while ((value.f & shifted_implicit_bit) == 0) {
     value.f <<= 1;
     --value.e;
   }
   // Subtract 1 to account for hidden bit.
   const auto offset =
-      fp::significand_size - fp::double_significand_size - SHIFT - 1;
+      fp::num_significand_bits - num_significand_bits<double>() - SHIFT - 1;
   value.f <<= offset;
   value.e -= offset;
   return value;
@@ -1175,7 +304,7 @@ template <int SHIFT> fp normalize(fp value) {
 inline bool operator==(fp x, fp y) { return x.f == y.f && x.e == y.e; }
 
 // Computes lhs * rhs / pow(2, 64) rounded to nearest with half-up tie breaking.
-inline uint64_t multiply(uint64_t lhs, uint64_t rhs) {
+FMT_CONSTEXPR inline uint64_t multiply(uint64_t lhs, uint64_t rhs) {
 #if FMT_USE_INT128
   auto product = static_cast<__uint128_t>(lhs) * rhs;
   auto f = static_cast<uint64_t>(product >> 64);
@@ -1192,15 +321,18 @@ inline uint64_t multiply(uint64_t lhs, uint64_t rhs) {
 #endif
 }
 
-inline fp operator*(fp x, fp y) { return {multiply(x.f, y.f), x.e + y.e + 64}; }
+FMT_CONSTEXPR inline fp operator*(fp x, fp y) {
+  return {multiply(x.f, y.f), x.e + y.e + 64};
+}
 
 // Returns a cached power of 10 `c_k = c_k.f * pow(2, c_k.e)` such that its
 // (binary) exponent satisfies `min_exponent <= c_k.e <= min_exponent + 28`.
-inline fp get_cached_power(int min_exponent, int& pow10_exponent) {
+FMT_CONSTEXPR inline fp get_cached_power(int min_exponent,
+                                         int& pow10_exponent) {
   const int shift = 32;
-  const auto significand = static_cast<int64_t>(data::log10_2_significand);
+  const auto significand = static_cast<int64_t>(log10_2_significand);
   int index = static_cast<int>(
-      ((min_exponent + fp::significand_size - 1) * (significand >> shift) +
+      ((min_exponent + fp::num_significand_bits - 1) * (significand >> shift) +
        ((int64_t(1) << shift) - 1))  // ceil
       >> 32                          // arithmetic shift
   );
@@ -1210,8 +342,8 @@ inline fp get_cached_power(int min_exponent, int& pow10_exponent) {
   const int dec_exp_step = 8;
   index = (index - first_dec_exp - 1) / dec_exp_step + 1;
   pow10_exponent = first_dec_exp + index * dec_exp_step;
-  return {data::grisu_pow10_significands[index],
-          data::grisu_pow10_exponents[index]};
+  return {impl_data::pow10_significands[index],
+          impl_data::pow10_exponents[index]};
 }
 
 // A simple accumulator to hold the sums of terms in bigint::square if uint128_t
@@ -1220,15 +352,17 @@ struct accumulator {
   uint64_t lower;
   uint64_t upper;
 
-  accumulator() : lower(0), upper(0) {}
-  explicit operator uint32_t() const { return static_cast<uint32_t>(lower); }
+  constexpr accumulator() : lower(0), upper(0) {}
+  constexpr explicit operator uint32_t() const {
+    return static_cast<uint32_t>(lower);
+  }
 
-  void operator+=(uint64_t n) {
+  FMT_CONSTEXPR void operator+=(uint64_t n) {
     lower += n;
     if (lower < n) ++upper;
   }
-  void operator>>=(int shift) {
-    assert(shift == 32);
+  FMT_CONSTEXPR void operator>>=(int shift) {
+    FMT_ASSERT(shift == 32, "");
     (void)shift;
     lower = (upper << 32) | (lower >> 32);
     upper >>= 32;
@@ -1245,27 +379,31 @@ class bigint {
   basic_memory_buffer<bigit, bigits_capacity> bigits_;
   int exp_;
 
-  bigit operator[](int index) const { return bigits_[to_unsigned(index)]; }
-  bigit& operator[](int index) { return bigits_[to_unsigned(index)]; }
+  FMT_CONSTEXPR20 bigit operator[](int index) const {
+    return bigits_[to_unsigned(index)];
+  }
+  FMT_CONSTEXPR20 bigit& operator[](int index) {
+    return bigits_[to_unsigned(index)];
+  }
 
   static FMT_CONSTEXPR_DECL const int bigit_bits = bits<bigit>::value;
 
   friend struct formatter<bigint>;
 
-  void subtract_bigits(int index, bigit other, bigit& borrow) {
+  FMT_CONSTEXPR20 void subtract_bigits(int index, bigit other, bigit& borrow) {
     auto result = static_cast<double_bigit>((*this)[index]) - other - borrow;
     (*this)[index] = static_cast<bigit>(result);
     borrow = static_cast<bigit>(result >> (bigit_bits * 2 - 1));
   }
 
-  void remove_leading_zeros() {
+  FMT_CONSTEXPR20 void remove_leading_zeros() {
     int num_bigits = static_cast<int>(bigits_.size()) - 1;
     while (num_bigits > 0 && (*this)[num_bigits] == 0) --num_bigits;
     bigits_.resize(to_unsigned(num_bigits + 1));
   }
 
   // Computes *this -= other assuming aligned bigints and *this >= other.
-  void subtract_aligned(const bigint& other) {
+  FMT_CONSTEXPR20 void subtract_aligned(const bigint& other) {
     FMT_ASSERT(other.exp_ >= exp_, "unaligned bigints");
     FMT_ASSERT(compare(*this, other) >= 0, "");
     bigit borrow = 0;
@@ -1276,7 +414,7 @@ class bigint {
     remove_leading_zeros();
   }
 
-  void multiply(uint32_t value) {
+  FMT_CONSTEXPR20 void multiply(uint32_t value) {
     const double_bigit wide_value = value;
     bigit carry = 0;
     for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
@@ -1287,7 +425,7 @@ class bigint {
     if (carry != 0) bigits_.push_back(carry);
   }
 
-  void multiply(uint64_t value) {
+  FMT_CONSTEXPR20 void multiply(uint64_t value) {
     const bigit mask = ~bigit(0);
     const double_bigit lower = value & mask;
     const double_bigit upper = value >> bigit_bits;
@@ -1305,14 +443,16 @@ class bigint {
   }
 
  public:
-  bigint() : exp_(0) {}
+  FMT_CONSTEXPR20 bigint() : exp_(0) {}
   explicit bigint(uint64_t n) { assign(n); }
-  ~bigint() { assert(bigits_.capacity() <= bigits_capacity); }
+  FMT_CONSTEXPR20 ~bigint() {
+    FMT_ASSERT(bigits_.capacity() <= bigits_capacity, "");
+  }
 
   bigint(const bigint&) = delete;
   void operator=(const bigint&) = delete;
 
-  void assign(const bigint& other) {
+  FMT_CONSTEXPR20 void assign(const bigint& other) {
     auto size = other.bigits_.size();
     bigits_.resize(size);
     auto data = other.bigits_.data();
@@ -1320,7 +460,7 @@ class bigint {
     exp_ = other.exp_;
   }
 
-  void assign(uint64_t n) {
+  FMT_CONSTEXPR20 void assign(uint64_t n) {
     size_t num_bigits = 0;
     do {
       bigits_[num_bigits++] = n & ~bigit(0);
@@ -1330,10 +470,12 @@ class bigint {
     exp_ = 0;
   }
 
-  int num_bigits() const { return static_cast<int>(bigits_.size()) + exp_; }
+  FMT_CONSTEXPR20 int num_bigits() const {
+    return static_cast<int>(bigits_.size()) + exp_;
+  }
 
-  FMT_NOINLINE bigint& operator<<=(int shift) {
-    assert(shift >= 0);
+  FMT_NOINLINE FMT_CONSTEXPR20 bigint& operator<<=(int shift) {
+    FMT_ASSERT(shift >= 0, "");
     exp_ += shift / bigit_bits;
     shift %= bigit_bits;
     if (shift == 0) return *this;
@@ -1347,13 +489,13 @@ class bigint {
     return *this;
   }
 
-  template <typename Int> bigint& operator*=(Int value) {
+  template <typename Int> FMT_CONSTEXPR20 bigint& operator*=(Int value) {
     FMT_ASSERT(value > 0, "");
     multiply(uint32_or_64_or_128_t<Int>(value));
     return *this;
   }
 
-  friend int compare(const bigint& lhs, const bigint& rhs) {
+  friend FMT_CONSTEXPR20 int compare(const bigint& lhs, const bigint& rhs) {
     int num_lhs_bigits = lhs.num_bigits(), num_rhs_bigits = rhs.num_bigits();
     if (num_lhs_bigits != num_rhs_bigits)
       return num_lhs_bigits > num_rhs_bigits ? 1 : -1;
@@ -1370,8 +512,8 @@ class bigint {
   }
 
   // Returns compare(lhs1 + lhs2, rhs).
-  friend int add_compare(const bigint& lhs1, const bigint& lhs2,
-                         const bigint& rhs) {
+  friend FMT_CONSTEXPR20 int add_compare(const bigint& lhs1, const bigint& lhs2,
+                                         const bigint& rhs) {
     int max_lhs_bigits = (std::max)(lhs1.num_bigits(), lhs2.num_bigits());
     int num_rhs_bigits = rhs.num_bigits();
     if (max_lhs_bigits + 1 < num_rhs_bigits) return -1;
@@ -1394,8 +536,8 @@ class bigint {
   }
 
   // Assigns pow(10, exp) to this bigint.
-  void assign_pow10(int exp) {
-    assert(exp >= 0);
+  FMT_CONSTEXPR20 void assign_pow10(int exp) {
+    FMT_ASSERT(exp >= 0, "");
     if (exp == 0) return assign(1);
     // Find the top bit.
     int bitmask = 1;
@@ -1413,10 +555,10 @@ class bigint {
     *this <<= exp;  // Multiply by pow(2, exp) by shifting.
   }
 
-  void square() {
-    basic_memory_buffer<bigit, bigits_capacity> n(std::move(bigits_));
+  FMT_CONSTEXPR20 void square() {
     int num_bigits = static_cast<int>(bigits_.size());
     int num_result_bigits = 2 * num_bigits;
+    basic_memory_buffer<bigit, bigits_capacity> n(std::move(bigits_));
     bigits_.resize(to_unsigned(num_result_bigits));
     using accumulator_t = conditional_t<FMT_USE_INT128, uint128_t, accumulator>;
     auto sum = accumulator_t();
@@ -1438,14 +580,13 @@ class bigint {
       (*this)[bigit_index] = static_cast<bigit>(sum);
       sum >>= bits<bigit>::value;
     }
-    --num_result_bigits;
     remove_leading_zeros();
     exp_ *= 2;
   }
 
   // If this bigint has a bigger exponent than other, adds trailing zero to make
   // exponents equal. This simplifies some operations such as subtraction.
-  void align(const bigint& other) {
+  FMT_CONSTEXPR20 void align(const bigint& other) {
     int exp_difference = exp_ - other.exp_;
     if (exp_difference <= 0) return;
     int num_bigits = static_cast<int>(bigits_.size());
@@ -1458,7 +599,7 @@ class bigint {
 
   // Divides this bignum by divisor, assigning the remainder to this and
   // returning the quotient.
-  int divmod_assign(const bigint& divisor) {
+  FMT_CONSTEXPR20 int divmod_assign(const bigint& divisor) {
     FMT_ASSERT(this != &divisor, "");
     if (compare(*this, divisor) < 0) return 0;
     FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, "");
@@ -1478,8 +619,9 @@ enum class round_direction { unknown, up, down };
 // some number v and the error, returns whether v should be rounded up, down, or
 // whether the rounding direction can't be determined due to error.
 // error should be less than divisor / 2.
-inline round_direction get_round_direction(uint64_t divisor, uint64_t remainder,
-                                           uint64_t error) {
+FMT_CONSTEXPR inline round_direction get_round_direction(uint64_t divisor,
+                                                         uint64_t remainder,
+                                                         uint64_t error) {
   FMT_ASSERT(remainder < divisor, "");  // divisor - remainder won't overflow.
   FMT_ASSERT(error < divisor, "");      // divisor - error won't overflow.
   FMT_ASSERT(error < divisor - error, "");  // error * 2 won't overflow.
@@ -1502,12 +644,52 @@ enum result {
 };
 }
 
+struct gen_digits_handler {
+  char* buf;
+  int size;
+  int precision;
+  int exp10;
+  bool fixed;
+
+  FMT_CONSTEXPR digits::result on_digit(char digit, uint64_t divisor,
+                                        uint64_t remainder, uint64_t error,
+                                        bool integral) {
+    FMT_ASSERT(remainder < divisor, "");
+    buf[size++] = digit;
+    if (!integral && error >= remainder) return digits::error;
+    if (size < precision) return digits::more;
+    if (!integral) {
+      // Check if error * 2 < divisor with overflow prevention.
+      // The check is not needed for the integral part because error = 1
+      // and divisor > (1 << 32) there.
+      if (error >= divisor || error >= divisor - error) return digits::error;
+    } else {
+      FMT_ASSERT(error == 1 && divisor > 2, "");
+    }
+    auto dir = get_round_direction(divisor, remainder, error);
+    if (dir != round_direction::up)
+      return dir == round_direction::down ? digits::done : digits::error;
+    ++buf[size - 1];
+    for (int i = size - 1; i > 0 && buf[i] > '9'; --i) {
+      buf[i] = '0';
+      ++buf[i - 1];
+    }
+    if (buf[0] > '9') {
+      buf[0] = '1';
+      if (fixed)
+        buf[size++] = '0';
+      else
+        ++exp10;
+    }
+    return digits::done;
+  }
+};
+
 // Generates output using the Grisu digit-gen algorithm.
 // error: the size of the region (lower, upper) outside of which numbers
 // definitely do not round to value (Delta in Grisu3).
-template <typename Handler>
-FMT_ALWAYS_INLINE digits::result grisu_gen_digits(fp value, uint64_t error,
-                                                  int& exp, Handler& handler) {
+FMT_INLINE FMT_CONSTEXPR20 digits::result grisu_gen_digits(
+    fp value, uint64_t error, int& exp, gen_digits_handler& handler) {
   const fp one(1ULL << -value.e, value.e);
   // The integral part of scaled value (p1 in Grisu) = value / one. It cannot be
   // zero because it contains a product of two 64-bit numbers with MSB set (due
@@ -1518,10 +700,28 @@ FMT_ALWAYS_INLINE digits::result grisu_gen_digits(fp value, uint64_t error,
   // The fractional part of scaled value (p2 in Grisu) c = value % one.
   uint64_t fractional = value.f & (one.f - 1);
   exp = count_digits(integral);  // kappa in Grisu.
-  // Divide by 10 to prevent overflow.
-  auto result = handler.on_start(data::powers_of_10_64[exp - 1] << -one.e,
-                                 value.f / 10, error * 10, exp);
-  if (result != digits::more) return result;
+  // Non-fixed formats require at least one digit and no precision adjustment.
+  if (handler.fixed) {
+    // Adjust fixed precision by exponent because it is relative to decimal
+    // point.
+    int precision_offset = exp + handler.exp10;
+    if (precision_offset > 0 &&
+        handler.precision > max_value<int>() - precision_offset) {
+      FMT_THROW(format_error("number is too big"));
+    }
+    handler.precision += precision_offset;
+    // Check if precision is satisfied just by leading zeros, e.g.
+    // format("{:.2f}", 0.001) gives "0.00" without generating any digits.
+    if (handler.precision <= 0) {
+      if (handler.precision < 0) return digits::done;
+      // Divide by 10 to prevent overflow.
+      uint64_t divisor = impl_data::power_of_10_64[exp - 1] << -one.e;
+      auto dir = get_round_direction(divisor, value.f / 10, error * 10);
+      if (dir == round_direction::unknown) return digits::error;
+      handler.buf[handler.size++] = dir == round_direction::up ? '1' : '0';
+      return digits::done;
+    }
+  }
   // Generate digits for the integral part. This can produce up to 10 digits.
   do {
     uint32_t digit = 0;
@@ -1568,9 +768,9 @@ FMT_ALWAYS_INLINE digits::result grisu_gen_digits(fp value, uint64_t error,
     }
     --exp;
     auto remainder = (static_cast<uint64_t>(integral) << -one.e) + fractional;
-    result = handler.on_digit(static_cast<char>('0' + digit),
-                              data::powers_of_10_64[exp] << -one.e, remainder,
-                              error, exp, true);
+    auto result = handler.on_digit(static_cast<char>('0' + digit),
+                                   impl_data::power_of_10_64[exp] << -one.e,
+                                   remainder, error, true);
     if (result != digits::more) return result;
   } while (exp > 0);
   // Generate digits for the fractional part.
@@ -1580,74 +780,63 @@ FMT_ALWAYS_INLINE digits::result grisu_gen_digits(fp value, uint64_t error,
     char digit = static_cast<char>('0' + (fractional >> -one.e));
     fractional &= one.f - 1;
     --exp;
-    result = handler.on_digit(digit, one.f, fractional, error, exp, false);
+    auto result = handler.on_digit(digit, one.f, fractional, error, false);
     if (result != digits::more) return result;
   }
 }
 
-// The fixed precision digit handler.
-struct fixed_handler {
-  char* buf;
-  int size;
-  int precision;
-  int exp10;
-  bool fixed;
+// A 128-bit integer type used internally,
+struct uint128_wrapper {
+  uint128_wrapper() = default;
 
-  digits::result on_start(uint64_t divisor, uint64_t remainder, uint64_t error,
-                          int& exp) {
-    // Non-fixed formats require at least one digit and no precision adjustment.
-    if (!fixed) return digits::more;
-    // Adjust fixed precision by exponent because it is relative to decimal
-    // point.
-    precision += exp + exp10;
-    // Check if precision is satisfied just by leading zeros, e.g.
-    // format("{:.2f}", 0.001) gives "0.00" without generating any digits.
-    if (precision > 0) return digits::more;
-    if (precision < 0) return digits::done;
-    auto dir = get_round_direction(divisor, remainder, error);
-    if (dir == round_direction::unknown) return digits::error;
-    buf[size++] = dir == round_direction::up ? '1' : '0';
-    return digits::done;
-  }
+#if FMT_USE_INT128
+  uint128_t internal_;
 
-  digits::result on_digit(char digit, uint64_t divisor, uint64_t remainder,
-                          uint64_t error, int, bool integral) {
-    FMT_ASSERT(remainder < divisor, "");
-    buf[size++] = digit;
-    if (!integral && error >= remainder) return digits::error;
-    if (size < precision) return digits::more;
-    if (!integral) {
-      // Check if error * 2 < divisor with overflow prevention.
-      // The check is not needed for the integral part because error = 1
-      // and divisor > (1 << 32) there.
-      if (error >= divisor || error >= divisor - error) return digits::error;
-    } else {
-      FMT_ASSERT(error == 1 && divisor > 2, "");
-    }
-    auto dir = get_round_direction(divisor, remainder, error);
-    if (dir != round_direction::up)
-      return dir == round_direction::down ? digits::done : digits::error;
-    ++buf[size - 1];
-    for (int i = size - 1; i > 0 && buf[i] > '9'; --i) {
-      buf[i] = '0';
-      ++buf[i - 1];
-    }
-    if (buf[0] > '9') {
-      buf[0] = '1';
-      if (fixed)
-        buf[size++] = '0';
-      else
-        ++exp10;
-    }
-    return digits::done;
+  constexpr uint128_wrapper(uint64_t high, uint64_t low) FMT_NOEXCEPT
+      : internal_{static_cast<uint128_t>(low) |
+                  (static_cast<uint128_t>(high) << 64)} {}
+
+  constexpr uint128_wrapper(uint128_t u) : internal_{u} {}
+
+  constexpr uint64_t high() const FMT_NOEXCEPT {
+    return uint64_t(internal_ >> 64);
   }
+  constexpr uint64_t low() const FMT_NOEXCEPT { return uint64_t(internal_); }
+
+  uint128_wrapper& operator+=(uint64_t n) FMT_NOEXCEPT {
+    internal_ += n;
+    return *this;
+  }
+#else
+  uint64_t high_;
+  uint64_t low_;
+
+  constexpr uint128_wrapper(uint64_t high, uint64_t low) FMT_NOEXCEPT
+      : high_{high},
+        low_{low} {}
+
+  constexpr uint64_t high() const FMT_NOEXCEPT { return high_; }
+  constexpr uint64_t low() const FMT_NOEXCEPT { return low_; }
+
+  uint128_wrapper& operator+=(uint64_t n) FMT_NOEXCEPT {
+#  if defined(_MSC_VER) && defined(_M_X64)
+    unsigned char carry = _addcarry_u64(0, low_, n, &low_);
+    _addcarry_u64(carry, high_, 0, &high_);
+    return *this;
+#  else
+    uint64_t sum = low_ + n;
+    high_ += (sum < low_ ? 1 : 0);
+    low_ = sum;
+    return *this;
+#  endif
+  }
+#endif
 };
 
 // Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox.
 namespace dragonbox {
 // Computes 128-bit result of multiplication of two 64-bit unsigned integers.
-FMT_SAFEBUFFERS inline uint128_wrapper umul128(uint64_t x,
-                                               uint64_t y) FMT_NOEXCEPT {
+inline uint128_wrapper umul128(uint64_t x, uint64_t y) FMT_NOEXCEPT {
 #if FMT_USE_INT128
   return static_cast<uint128_t>(x) * static_cast<uint128_t>(y);
 #elif defined(_MSC_VER) && defined(_M_X64)
@@ -1675,8 +864,7 @@ FMT_SAFEBUFFERS inline uint128_wrapper umul128(uint64_t x,
 }
 
 // Computes upper 64 bits of multiplication of two 64-bit unsigned integers.
-FMT_SAFEBUFFERS inline uint64_t umul128_upper64(uint64_t x,
-                                                uint64_t y) FMT_NOEXCEPT {
+inline uint64_t umul128_upper64(uint64_t x, uint64_t y) FMT_NOEXCEPT {
 #if FMT_USE_INT128
   auto p = static_cast<uint128_t>(x) * static_cast<uint128_t>(y);
   return static_cast<uint64_t>(p >> 64);
@@ -1689,8 +877,7 @@ FMT_SAFEBUFFERS inline uint64_t umul128_upper64(uint64_t x,
 
 // Computes upper 64 bits of multiplication of a 64-bit unsigned integer and a
 // 128-bit unsigned integer.
-FMT_SAFEBUFFERS inline uint64_t umul192_upper64(uint64_t x, uint128_wrapper y)
-    FMT_NOEXCEPT {
+inline uint64_t umul192_upper64(uint64_t x, uint128_wrapper y) FMT_NOEXCEPT {
   uint128_wrapper g0 = umul128(x, y.high());
   g0 += umul128_upper64(x, y.low());
   return g0.high();
@@ -1704,8 +891,7 @@ inline uint32_t umul96_upper32(uint32_t x, uint64_t y) FMT_NOEXCEPT {
 
 // Computes middle 64 bits of multiplication of a 64-bit unsigned integer and a
 // 128-bit unsigned integer.
-FMT_SAFEBUFFERS inline uint64_t umul192_middle64(uint64_t x, uint128_wrapper y)
-    FMT_NOEXCEPT {
+inline uint64_t umul192_middle64(uint64_t x, uint128_wrapper y) FMT_NOEXCEPT {
   uint64_t g01 = x * y.high();
   uint64_t g10 = umul128_upper64(x, y.low());
   return g01 + g10;
@@ -1722,8 +908,7 @@ inline uint64_t umul96_lower64(uint32_t x, uint64_t y) FMT_NOEXCEPT {
 inline int floor_log10_pow2(int e) FMT_NOEXCEPT {
   FMT_ASSERT(e <= 1700 && e >= -1700, "too large exponent");
   const int shift = 22;
-  return (e * static_cast<int>(data::log10_2_significand >> (64 - shift))) >>
-         shift;
+  return (e * static_cast<int>(log10_2_significand >> (64 - shift))) >> shift;
 }
 
 // Various fast log computations.
@@ -1741,8 +926,7 @@ inline int floor_log10_pow2_minus_log10_4_over_3(int e) FMT_NOEXCEPT {
   FMT_ASSERT(e <= 1700 && e >= -1700, "too large exponent");
   const uint64_t log10_4_over_3_fractional_digits = 0x1ffbfc2bbc780375;
   const int shift_amount = 22;
-  return (e * static_cast<int>(data::log10_2_significand >>
-                               (64 - shift_amount)) -
+  return (e * static_cast<int>(log10_2_significand >> (64 - shift_amount)) -
           static_cast<int>(log10_4_over_3_fractional_digits >>
                            (64 - shift_amount))) >>
          shift_amount;
@@ -1768,29 +952,76 @@ inline bool divisible_by_power_of_2(uint64_t x, int exp) FMT_NOEXCEPT {
 #endif
 }
 
+// Table entry type for divisibility test.
+template <typename T> struct divtest_table_entry {
+  T mod_inv;
+  T max_quotient;
+};
+
 // Returns true iff x is divisible by pow(5, exp).
 inline bool divisible_by_power_of_5(uint32_t x, int exp) FMT_NOEXCEPT {
   FMT_ASSERT(exp <= 10, "too large exponent");
-  return x * data::divtest_table_for_pow5_32[exp].mod_inv <=
-         data::divtest_table_for_pow5_32[exp].max_quotient;
+  static constexpr const divtest_table_entry<uint32_t> divtest_table[] = {
+      {0x00000001, 0xffffffff}, {0xcccccccd, 0x33333333},
+      {0xc28f5c29, 0x0a3d70a3}, {0x26e978d5, 0x020c49ba},
+      {0x3afb7e91, 0x0068db8b}, {0x0bcbe61d, 0x0014f8b5},
+      {0x68c26139, 0x000431bd}, {0xae8d46a5, 0x0000d6bf},
+      {0x22e90e21, 0x00002af3}, {0x3a2e9c6d, 0x00000897},
+      {0x3ed61f49, 0x000001b7}};
+  return x * divtest_table[exp].mod_inv <= divtest_table[exp].max_quotient;
 }
 inline bool divisible_by_power_of_5(uint64_t x, int exp) FMT_NOEXCEPT {
   FMT_ASSERT(exp <= 23, "too large exponent");
-  return x * data::divtest_table_for_pow5_64[exp].mod_inv <=
-         data::divtest_table_for_pow5_64[exp].max_quotient;
+  static constexpr const divtest_table_entry<uint64_t> divtest_table[] = {
+      {0x0000000000000001, 0xffffffffffffffff},
+      {0xcccccccccccccccd, 0x3333333333333333},
+      {0x8f5c28f5c28f5c29, 0x0a3d70a3d70a3d70},
+      {0x1cac083126e978d5, 0x020c49ba5e353f7c},
+      {0xd288ce703afb7e91, 0x0068db8bac710cb2},
+      {0x5d4e8fb00bcbe61d, 0x0014f8b588e368f0},
+      {0x790fb65668c26139, 0x000431bde82d7b63},
+      {0xe5032477ae8d46a5, 0x0000d6bf94d5e57a},
+      {0xc767074b22e90e21, 0x00002af31dc46118},
+      {0x8e47ce423a2e9c6d, 0x0000089705f4136b},
+      {0x4fa7f60d3ed61f49, 0x000001b7cdfd9d7b},
+      {0x0fee64690c913975, 0x00000057f5ff85e5},
+      {0x3662e0e1cf503eb1, 0x000000119799812d},
+      {0xa47a2cf9f6433fbd, 0x0000000384b84d09},
+      {0x54186f653140a659, 0x00000000b424dc35},
+      {0x7738164770402145, 0x0000000024075f3d},
+      {0xe4a4d1417cd9a041, 0x000000000734aca5},
+      {0xc75429d9e5c5200d, 0x000000000170ef54},
+      {0xc1773b91fac10669, 0x000000000049c977},
+      {0x26b172506559ce15, 0x00000000000ec1e4},
+      {0xd489e3a9addec2d1, 0x000000000002f394},
+      {0x90e860bb892c8d5d, 0x000000000000971d},
+      {0x502e79bf1b6f4f79, 0x0000000000001e39},
+      {0xdcd618596be30fe5, 0x000000000000060b}};
+  return x * divtest_table[exp].mod_inv <= divtest_table[exp].max_quotient;
 }
 
+#define LAMMPS_WORKAROUND_FOR_DPCXX 1
 // Replaces n by floor(n / pow(5, N)) returning true if and only if n is
 // divisible by pow(5, N).
 // Precondition: n <= 2 * pow(5, N + 1).
 template <int N>
 bool check_divisibility_and_divide_by_pow5(uint32_t& n) FMT_NOEXCEPT {
+#if defined(LAMMPS_WORKAROUND_FOR_DPCXX)
+  struct infos1 {
+    uint32_t magic_number;
+    int bits_for_comparison;
+    uint32_t threshold;
+    int shift_amount;
+  };
+  static constexpr infos1 infos[] = {{0xcccd, 16, 0x3333, 18}, {0xa429, 8, 0x0a, 20}};
+#else
   static constexpr struct {
     uint32_t magic_number;
     int bits_for_comparison;
     uint32_t threshold;
     int shift_amount;
   } infos[] = {{0xcccd, 16, 0x3333, 18}, {0xa429, 8, 0x0a, 20}};
+#endif
   constexpr auto info = infos[N - 1];
   n *= info.magic_number;
   const uint32_t comparison_mask = (1u << info.bits_for_comparison) - 1;
@@ -1802,11 +1033,20 @@ bool check_divisibility_and_divide_by_pow5(uint32_t& n) FMT_NOEXCEPT {
 // Computes floor(n / pow(10, N)) for small n and N.
 // Precondition: n <= pow(10, N + 1).
 template <int N> uint32_t small_division_by_pow10(uint32_t n) FMT_NOEXCEPT {
+#if defined(LAMMPS_WORKAROUND_FOR_DPCXX)
+  struct infos2 {
+    uint32_t magic_number;
+    int shift_amount;
+    uint32_t divisor_times_10;
+  };
+  static constexpr infos2 infos[] = {{0xcccd, 19, 100}, {0xa3d8, 22, 1000}};
+#else
   static constexpr struct {
     uint32_t magic_number;
     int shift_amount;
     uint32_t divisor_times_10;
   } infos[] = {{0xcccd, 19, 100}, {0xa3d8, 22, 1000}};
+#endif
   constexpr auto info = infos[N - 1];
   FMT_ASSERT(n <= info.divisor_times_10, "n is too large");
   return n * info.magic_number >> info.shift_amount;
@@ -1831,7 +1071,34 @@ template <> struct cache_accessor<float> {
   static uint64_t get_cached_power(int k) FMT_NOEXCEPT {
     FMT_ASSERT(k >= float_info<float>::min_k && k <= float_info<float>::max_k,
                "k is out of range");
-    return data::dragonbox_pow10_significands_64[k - float_info<float>::min_k];
+    static constexpr const uint64_t pow10_significands[] = {
+        0x81ceb32c4b43fcf5, 0xa2425ff75e14fc32, 0xcad2f7f5359a3b3f,
+        0xfd87b5f28300ca0e, 0x9e74d1b791e07e49, 0xc612062576589ddb,
+        0xf79687aed3eec552, 0x9abe14cd44753b53, 0xc16d9a0095928a28,
+        0xf1c90080baf72cb2, 0x971da05074da7bef, 0xbce5086492111aeb,
+        0xec1e4a7db69561a6, 0x9392ee8e921d5d08, 0xb877aa3236a4b44a,
+        0xe69594bec44de15c, 0x901d7cf73ab0acda, 0xb424dc35095cd810,
+        0xe12e13424bb40e14, 0x8cbccc096f5088cc, 0xafebff0bcb24aaff,
+        0xdbe6fecebdedd5bf, 0x89705f4136b4a598, 0xabcc77118461cefd,
+        0xd6bf94d5e57a42bd, 0x8637bd05af6c69b6, 0xa7c5ac471b478424,
+        0xd1b71758e219652c, 0x83126e978d4fdf3c, 0xa3d70a3d70a3d70b,
+        0xcccccccccccccccd, 0x8000000000000000, 0xa000000000000000,
+        0xc800000000000000, 0xfa00000000000000, 0x9c40000000000000,
+        0xc350000000000000, 0xf424000000000000, 0x9896800000000000,
+        0xbebc200000000000, 0xee6b280000000000, 0x9502f90000000000,
+        0xba43b74000000000, 0xe8d4a51000000000, 0x9184e72a00000000,
+        0xb5e620f480000000, 0xe35fa931a0000000, 0x8e1bc9bf04000000,
+        0xb1a2bc2ec5000000, 0xde0b6b3a76400000, 0x8ac7230489e80000,
+        0xad78ebc5ac620000, 0xd8d726b7177a8000, 0x878678326eac9000,
+        0xa968163f0a57b400, 0xd3c21bcecceda100, 0x84595161401484a0,
+        0xa56fa5b99019a5c8, 0xcecb8f27f4200f3a, 0x813f3978f8940984,
+        0xa18f07d736b90be5, 0xc9f2c9cd04674ede, 0xfc6f7c4045812296,
+        0x9dc5ada82b70b59d, 0xc5371912364ce305, 0xf684df56c3e01bc6,
+        0x9a130b963a6c115c, 0xc097ce7bc90715b3, 0xf0bdc21abb48db20,
+        0x96769950b50d88f4, 0xbc143fa4e250eb31, 0xeb194f8e1ae525fd,
+        0x92efd1b8d0cf37be, 0xb7abc627050305ad, 0xe596b7b0c643c719,
+        0x8f7e32ce7bea5c6f, 0xb35dbf821ae4f38b, 0xe0352f62a19e306e};
+    return pow10_significands[k - float_info<float>::min_k];
   }
 
   static carrier_uint compute_mul(carrier_uint u,
@@ -1885,10 +1152,679 @@ template <> struct cache_accessor<double> {
     FMT_ASSERT(k >= float_info<double>::min_k && k <= float_info<double>::max_k,
                "k is out of range");
 
+    static constexpr const uint128_wrapper pow10_significands[] = {
 #if FMT_USE_FULL_CACHE_DRAGONBOX
-    return data::dragonbox_pow10_significands_128[k -
-                                                  float_info<double>::min_k];
+      {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
+      {0x9faacf3df73609b1, 0x77b191618c54e9ad},
+      {0xc795830d75038c1d, 0xd59df5b9ef6a2418},
+      {0xf97ae3d0d2446f25, 0x4b0573286b44ad1e},
+      {0x9becce62836ac577, 0x4ee367f9430aec33},
+      {0xc2e801fb244576d5, 0x229c41f793cda740},
+      {0xf3a20279ed56d48a, 0x6b43527578c11110},
+      {0x9845418c345644d6, 0x830a13896b78aaaa},
+      {0xbe5691ef416bd60c, 0x23cc986bc656d554},
+      {0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa9},
+      {0x94b3a202eb1c3f39, 0x7bf7d71432f3d6aa},
+      {0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc54},
+      {0xe858ad248f5c22c9, 0xd1b3400f8f9cff69},
+      {0x91376c36d99995be, 0x23100809b9c21fa2},
+      {0xb58547448ffffb2d, 0xabd40a0c2832a78b},
+      {0xe2e69915b3fff9f9, 0x16c90c8f323f516d},
+      {0x8dd01fad907ffc3b, 0xae3da7d97f6792e4},
+      {0xb1442798f49ffb4a, 0x99cd11cfdf41779d},
+      {0xdd95317f31c7fa1d, 0x40405643d711d584},
+      {0x8a7d3eef7f1cfc52, 0x482835ea666b2573},
+      {0xad1c8eab5ee43b66, 0xda3243650005eed0},
+      {0xd863b256369d4a40, 0x90bed43e40076a83},
+      {0x873e4f75e2224e68, 0x5a7744a6e804a292},
+      {0xa90de3535aaae202, 0x711515d0a205cb37},
+      {0xd3515c2831559a83, 0x0d5a5b44ca873e04},
+      {0x8412d9991ed58091, 0xe858790afe9486c3},
+      {0xa5178fff668ae0b6, 0x626e974dbe39a873},
+      {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
+      {0x80fa687f881c7f8e, 0x7ce66634bc9d0b9a},
+      {0xa139029f6a239f72, 0x1c1fffc1ebc44e81},
+      {0xc987434744ac874e, 0xa327ffb266b56221},
+      {0xfbe9141915d7a922, 0x4bf1ff9f0062baa9},
+      {0x9d71ac8fada6c9b5, 0x6f773fc3603db4aa},
+      {0xc4ce17b399107c22, 0xcb550fb4384d21d4},
+      {0xf6019da07f549b2b, 0x7e2a53a146606a49},
+      {0x99c102844f94e0fb, 0x2eda7444cbfc426e},
+      {0xc0314325637a1939, 0xfa911155fefb5309},
+      {0xf03d93eebc589f88, 0x793555ab7eba27cb},
+      {0x96267c7535b763b5, 0x4bc1558b2f3458df},
+      {0xbbb01b9283253ca2, 0x9eb1aaedfb016f17},
+      {0xea9c227723ee8bcb, 0x465e15a979c1cadd},
+      {0x92a1958a7675175f, 0x0bfacd89ec191eca},
+      {0xb749faed14125d36, 0xcef980ec671f667c},
+      {0xe51c79a85916f484, 0x82b7e12780e7401b},
+      {0x8f31cc0937ae58d2, 0xd1b2ecb8b0908811},
+      {0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa16},
+      {0xdfbdcece67006ac9, 0x67a791e093e1d49b},
+      {0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e1},
+      {0xaecc49914078536d, 0x58fae9f773886e19},
+      {0xda7f5bf590966848, 0xaf39a475506a899f},
+      {0x888f99797a5e012d, 0x6d8406c952429604},
+      {0xaab37fd7d8f58178, 0xc8e5087ba6d33b84},
+      {0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a65},
+      {0x855c3be0a17fcd26, 0x5cf2eea09a550680},
+      {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f},
+      {0xd0601d8efc57b08b, 0xf13b94daf124da27},
+      {0x823c12795db6ce57, 0x76c53d08d6b70859},
+      {0xa2cb1717b52481ed, 0x54768c4b0c64ca6f},
+      {0xcb7ddcdda26da268, 0xa9942f5dcf7dfd0a},
+      {0xfe5d54150b090b02, 0xd3f93b35435d7c4d},
+      {0x9efa548d26e5a6e1, 0xc47bc5014a1a6db0},
+      {0xc6b8e9b0709f109a, 0x359ab6419ca1091c},
+      {0xf867241c8cc6d4c0, 0xc30163d203c94b63},
+      {0x9b407691d7fc44f8, 0x79e0de63425dcf1e},
+      {0xc21094364dfb5636, 0x985915fc12f542e5},
+      {0xf294b943e17a2bc4, 0x3e6f5b7b17b2939e},
+      {0x979cf3ca6cec5b5a, 0xa705992ceecf9c43},
+      {0xbd8430bd08277231, 0x50c6ff782a838354},
+      {0xece53cec4a314ebd, 0xa4f8bf5635246429},
+      {0x940f4613ae5ed136, 0x871b7795e136be9a},
+      {0xb913179899f68584, 0x28e2557b59846e40},
+      {0xe757dd7ec07426e5, 0x331aeada2fe589d0},
+      {0x9096ea6f3848984f, 0x3ff0d2c85def7622},
+      {0xb4bca50b065abe63, 0x0fed077a756b53aa},
+      {0xe1ebce4dc7f16dfb, 0xd3e8495912c62895},
+      {0x8d3360f09cf6e4bd, 0x64712dd7abbbd95d},
+      {0xb080392cc4349dec, 0xbd8d794d96aacfb4},
+      {0xdca04777f541c567, 0xecf0d7a0fc5583a1},
+      {0x89e42caaf9491b60, 0xf41686c49db57245},
+      {0xac5d37d5b79b6239, 0x311c2875c522ced6},
+      {0xd77485cb25823ac7, 0x7d633293366b828c},
+      {0x86a8d39ef77164bc, 0xae5dff9c02033198},
+      {0xa8530886b54dbdeb, 0xd9f57f830283fdfd},
+      {0xd267caa862a12d66, 0xd072df63c324fd7c},
+      {0x8380dea93da4bc60, 0x4247cb9e59f71e6e},
+      {0xa46116538d0deb78, 0x52d9be85f074e609},
+      {0xcd795be870516656, 0x67902e276c921f8c},
+      {0x806bd9714632dff6, 0x00ba1cd8a3db53b7},
+      {0xa086cfcd97bf97f3, 0x80e8a40eccd228a5},
+      {0xc8a883c0fdaf7df0, 0x6122cd128006b2ce},
+      {0xfad2a4b13d1b5d6c, 0x796b805720085f82},
+      {0x9cc3a6eec6311a63, 0xcbe3303674053bb1},
+      {0xc3f490aa77bd60fc, 0xbedbfc4411068a9d},
+      {0xf4f1b4d515acb93b, 0xee92fb5515482d45},
+      {0x991711052d8bf3c5, 0x751bdd152d4d1c4b},
+      {0xbf5cd54678eef0b6, 0xd262d45a78a0635e},
+      {0xef340a98172aace4, 0x86fb897116c87c35},
+      {0x9580869f0e7aac0e, 0xd45d35e6ae3d4da1},
+      {0xbae0a846d2195712, 0x8974836059cca10a},
+      {0xe998d258869facd7, 0x2bd1a438703fc94c},
+      {0x91ff83775423cc06, 0x7b6306a34627ddd0},
+      {0xb67f6455292cbf08, 0x1a3bc84c17b1d543},
+      {0xe41f3d6a7377eeca, 0x20caba5f1d9e4a94},
+      {0x8e938662882af53e, 0x547eb47b7282ee9d},
+      {0xb23867fb2a35b28d, 0xe99e619a4f23aa44},
+      {0xdec681f9f4c31f31, 0x6405fa00e2ec94d5},
+      {0x8b3c113c38f9f37e, 0xde83bc408dd3dd05},
+      {0xae0b158b4738705e, 0x9624ab50b148d446},
+      {0xd98ddaee19068c76, 0x3badd624dd9b0958},
+      {0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d7},
+      {0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4d},
+      {0xd47487cc8470652b, 0x7647c32000696720},
+      {0x84c8d4dfd2c63f3b, 0x29ecd9f40041e074},
+      {0xa5fb0a17c777cf09, 0xf468107100525891},
+      {0xcf79cc9db955c2cc, 0x7182148d4066eeb5},
+      {0x81ac1fe293d599bf, 0xc6f14cd848405531},
+      {0xa21727db38cb002f, 0xb8ada00e5a506a7d},
+      {0xca9cf1d206fdc03b, 0xa6d90811f0e4851d},
+      {0xfd442e4688bd304a, 0x908f4a166d1da664},
+      {0x9e4a9cec15763e2e, 0x9a598e4e043287ff},
+      {0xc5dd44271ad3cdba, 0x40eff1e1853f29fe},
+      {0xf7549530e188c128, 0xd12bee59e68ef47d},
+      {0x9a94dd3e8cf578b9, 0x82bb74f8301958cf},
+      {0xc13a148e3032d6e7, 0xe36a52363c1faf02},
+      {0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac2},
+      {0x96f5600f15a7b7e5, 0x29ab103a5ef8c0ba},
+      {0xbcb2b812db11a5de, 0x7415d448f6b6f0e8},
+      {0xebdf661791d60f56, 0x111b495b3464ad22},
+      {0x936b9fcebb25c995, 0xcab10dd900beec35},
+      {0xb84687c269ef3bfb, 0x3d5d514f40eea743},
+      {0xe65829b3046b0afa, 0x0cb4a5a3112a5113},
+      {0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ac},
+      {0xb3f4e093db73a093, 0x59ed216765690f57},
+      {0xe0f218b8d25088b8, 0x306869c13ec3532d},
+      {0x8c974f7383725573, 0x1e414218c73a13fc},
+      {0xafbd2350644eeacf, 0xe5d1929ef90898fb},
+      {0xdbac6c247d62a583, 0xdf45f746b74abf3a},
+      {0x894bc396ce5da772, 0x6b8bba8c328eb784},
+      {0xab9eb47c81f5114f, 0x066ea92f3f326565},
+      {0xd686619ba27255a2, 0xc80a537b0efefebe},
+      {0x8613fd0145877585, 0xbd06742ce95f5f37},
+      {0xa798fc4196e952e7, 0x2c48113823b73705},
+      {0xd17f3b51fca3a7a0, 0xf75a15862ca504c6},
+      {0x82ef85133de648c4, 0x9a984d73dbe722fc},
+      {0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebbb},
+      {0xcc963fee10b7d1b3, 0x318df905079926a9},
+      {0xffbbcfe994e5c61f, 0xfdf17746497f7053},
+      {0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa634},
+      {0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc1},
+      {0xf9bd690a1b68637b, 0x3dfdce7aa3c673b1},
+      {0x9c1661a651213e2d, 0x06bea10ca65c084f},
+      {0xc31bfa0fe5698db8, 0x486e494fcff30a63},
+      {0xf3e2f893dec3f126, 0x5a89dba3c3efccfb},
+      {0x986ddb5c6b3a76b7, 0xf89629465a75e01d},
+      {0xbe89523386091465, 0xf6bbb397f1135824},
+      {0xee2ba6c0678b597f, 0x746aa07ded582e2d},
+      {0x94db483840b717ef, 0xa8c2a44eb4571cdd},
+      {0xba121a4650e4ddeb, 0x92f34d62616ce414},
+      {0xe896a0d7e51e1566, 0x77b020baf9c81d18},
+      {0x915e2486ef32cd60, 0x0ace1474dc1d122f},
+      {0xb5b5ada8aaff80b8, 0x0d819992132456bb},
+      {0xe3231912d5bf60e6, 0x10e1fff697ed6c6a},
+      {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2},
+      {0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb3},
+      {0xddd0467c64bce4a0, 0xac7cb3f6d05ddbdf},
+      {0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96c},
+      {0xad4ab7112eb3929d, 0x86c16c98d2c953c7},
+      {0xd89d64d57a607744, 0xe871c7bf077ba8b8},
+      {0x87625f056c7c4a8b, 0x11471cd764ad4973},
+      {0xa93af6c6c79b5d2d, 0xd598e40d3dd89bd0},
+      {0xd389b47879823479, 0x4aff1d108d4ec2c4},
+      {0x843610cb4bf160cb, 0xcedf722a585139bb},
+      {0xa54394fe1eedb8fe, 0xc2974eb4ee658829},
+      {0xce947a3da6a9273e, 0x733d226229feea33},
+      {0x811ccc668829b887, 0x0806357d5a3f5260},
+      {0xa163ff802a3426a8, 0xca07c2dcb0cf26f8},
+      {0xc9bcff6034c13052, 0xfc89b393dd02f0b6},
+      {0xfc2c3f3841f17c67, 0xbbac2078d443ace3},
+      {0x9d9ba7832936edc0, 0xd54b944b84aa4c0e},
+      {0xc5029163f384a931, 0x0a9e795e65d4df12},
+      {0xf64335bcf065d37d, 0x4d4617b5ff4a16d6},
+      {0x99ea0196163fa42e, 0x504bced1bf8e4e46},
+      {0xc06481fb9bcf8d39, 0xe45ec2862f71e1d7},
+      {0xf07da27a82c37088, 0x5d767327bb4e5a4d},
+      {0x964e858c91ba2655, 0x3a6a07f8d510f870},
+      {0xbbe226efb628afea, 0x890489f70a55368c},
+      {0xeadab0aba3b2dbe5, 0x2b45ac74ccea842f},
+      {0x92c8ae6b464fc96f, 0x3b0b8bc90012929e},
+      {0xb77ada0617e3bbcb, 0x09ce6ebb40173745},
+      {0xe55990879ddcaabd, 0xcc420a6a101d0516},
+      {0x8f57fa54c2a9eab6, 0x9fa946824a12232e},
+      {0xb32df8e9f3546564, 0x47939822dc96abfa},
+      {0xdff9772470297ebd, 0x59787e2b93bc56f8},
+      {0x8bfbea76c619ef36, 0x57eb4edb3c55b65b},
+      {0xaefae51477a06b03, 0xede622920b6b23f2},
+      {0xdab99e59958885c4, 0xe95fab368e45ecee},
+      {0x88b402f7fd75539b, 0x11dbcb0218ebb415},
+      {0xaae103b5fcd2a881, 0xd652bdc29f26a11a},
+      {0xd59944a37c0752a2, 0x4be76d3346f04960},
+      {0x857fcae62d8493a5, 0x6f70a4400c562ddc},
+      {0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb953},
+      {0xd097ad07a71f26b2, 0x7e2000a41346a7a8},
+      {0x825ecc24c873782f, 0x8ed400668c0c28c9},
+      {0xa2f67f2dfa90563b, 0x728900802f0f32fb},
+      {0xcbb41ef979346bca, 0x4f2b40a03ad2ffba},
+      {0xfea126b7d78186bc, 0xe2f610c84987bfa9},
+      {0x9f24b832e6b0f436, 0x0dd9ca7d2df4d7ca},
+      {0xc6ede63fa05d3143, 0x91503d1c79720dbc},
+      {0xf8a95fcf88747d94, 0x75a44c6397ce912b},
+      {0x9b69dbe1b548ce7c, 0xc986afbe3ee11abb},
+      {0xc24452da229b021b, 0xfbe85badce996169},
+      {0xf2d56790ab41c2a2, 0xfae27299423fb9c4},
+      {0x97c560ba6b0919a5, 0xdccd879fc967d41b},
+      {0xbdb6b8e905cb600f, 0x5400e987bbc1c921},
+      {0xed246723473e3813, 0x290123e9aab23b69},
+      {0x9436c0760c86e30b, 0xf9a0b6720aaf6522},
+      {0xb94470938fa89bce, 0xf808e40e8d5b3e6a},
+      {0xe7958cb87392c2c2, 0xb60b1d1230b20e05},
+      {0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c3},
+      {0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af4},
+      {0xe2280b6c20dd5232, 0x25c6da63c38de1b1},
+      {0x8d590723948a535f, 0x579c487e5a38ad0f},
+      {0xb0af48ec79ace837, 0x2d835a9df0c6d852},
+      {0xdcdb1b2798182244, 0xf8e431456cf88e66},
+      {0x8a08f0f8bf0f156b, 0x1b8e9ecb641b5900},
+      {0xac8b2d36eed2dac5, 0xe272467e3d222f40},
+      {0xd7adf884aa879177, 0x5b0ed81dcc6abb10},
+      {0x86ccbb52ea94baea, 0x98e947129fc2b4ea},
+      {0xa87fea27a539e9a5, 0x3f2398d747b36225},
+      {0xd29fe4b18e88640e, 0x8eec7f0d19a03aae},
+      {0x83a3eeeef9153e89, 0x1953cf68300424ad},
+      {0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd8},
+      {0xcdb02555653131b6, 0x3792f412cb06794e},
+      {0x808e17555f3ebf11, 0xe2bbd88bbee40bd1},
+      {0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec5},
+      {0xc8de047564d20a8b, 0xf245825a5a445276},
+      {0xfb158592be068d2e, 0xeed6e2f0f0d56713},
+      {0x9ced737bb6c4183d, 0x55464dd69685606c},
+      {0xc428d05aa4751e4c, 0xaa97e14c3c26b887},
+      {0xf53304714d9265df, 0xd53dd99f4b3066a9},
+      {0x993fe2c6d07b7fab, 0xe546a8038efe402a},
+      {0xbf8fdb78849a5f96, 0xde98520472bdd034},
+      {0xef73d256a5c0f77c, 0x963e66858f6d4441},
+      {0x95a8637627989aad, 0xdde7001379a44aa9},
+      {0xbb127c53b17ec159, 0x5560c018580d5d53},
+      {0xe9d71b689dde71af, 0xaab8f01e6e10b4a7},
+      {0x9226712162ab070d, 0xcab3961304ca70e9},
+      {0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d23},
+      {0xe45c10c42a2b3b05, 0x8cb89a7db77c506b},
+      {0x8eb98a7a9a5b04e3, 0x77f3608e92adb243},
+      {0xb267ed1940f1c61c, 0x55f038b237591ed4},
+      {0xdf01e85f912e37a3, 0x6b6c46dec52f6689},
+      {0x8b61313bbabce2c6, 0x2323ac4b3b3da016},
+      {0xae397d8aa96c1b77, 0xabec975e0a0d081b},
+      {0xd9c7dced53c72255, 0x96e7bd358c904a22},
+      {0x881cea14545c7575, 0x7e50d64177da2e55},
+      {0xaa242499697392d2, 0xdde50bd1d5d0b9ea},
+      {0xd4ad2dbfc3d07787, 0x955e4ec64b44e865},
+      {0x84ec3c97da624ab4, 0xbd5af13bef0b113f},
+      {0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58f},
+      {0xcfb11ead453994ba, 0x67de18eda5814af3},
+      {0x81ceb32c4b43fcf4, 0x80eacf948770ced8},
+      {0xa2425ff75e14fc31, 0xa1258379a94d028e},
+      {0xcad2f7f5359a3b3e, 0x096ee45813a04331},
+      {0xfd87b5f28300ca0d, 0x8bca9d6e188853fd},
+      {0x9e74d1b791e07e48, 0x775ea264cf55347e},
+      {0xc612062576589dda, 0x95364afe032a819e},
+      {0xf79687aed3eec551, 0x3a83ddbd83f52205},
+      {0x9abe14cd44753b52, 0xc4926a9672793543},
+      {0xc16d9a0095928a27, 0x75b7053c0f178294},
+      {0xf1c90080baf72cb1, 0x5324c68b12dd6339},
+      {0x971da05074da7bee, 0xd3f6fc16ebca5e04},
+      {0xbce5086492111aea, 0x88f4bb1ca6bcf585},
+      {0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6},
+      {0x9392ee8e921d5d07, 0x3aff322e62439fd0},
+      {0xb877aa3236a4b449, 0x09befeb9fad487c3},
+      {0xe69594bec44de15b, 0x4c2ebe687989a9b4},
+      {0x901d7cf73ab0acd9, 0x0f9d37014bf60a11},
+      {0xb424dc35095cd80f, 0x538484c19ef38c95},
+      {0xe12e13424bb40e13, 0x2865a5f206b06fba},
+      {0x8cbccc096f5088cb, 0xf93f87b7442e45d4},
+      {0xafebff0bcb24aafe, 0xf78f69a51539d749},
+      {0xdbe6fecebdedd5be, 0xb573440e5a884d1c},
+      {0x89705f4136b4a597, 0x31680a88f8953031},
+      {0xabcc77118461cefc, 0xfdc20d2b36ba7c3e},
+      {0xd6bf94d5e57a42bc, 0x3d32907604691b4d},
+      {0x8637bd05af6c69b5, 0xa63f9a49c2c1b110},
+      {0xa7c5ac471b478423, 0x0fcf80dc33721d54},
+      {0xd1b71758e219652b, 0xd3c36113404ea4a9},
+      {0x83126e978d4fdf3b, 0x645a1cac083126ea},
+      {0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4},
+      {0xcccccccccccccccc, 0xcccccccccccccccd},
+      {0x8000000000000000, 0x0000000000000000},
+      {0xa000000000000000, 0x0000000000000000},
+      {0xc800000000000000, 0x0000000000000000},
+      {0xfa00000000000000, 0x0000000000000000},
+      {0x9c40000000000000, 0x0000000000000000},
+      {0xc350000000000000, 0x0000000000000000},
+      {0xf424000000000000, 0x0000000000000000},
+      {0x9896800000000000, 0x0000000000000000},
+      {0xbebc200000000000, 0x0000000000000000},
+      {0xee6b280000000000, 0x0000000000000000},
+      {0x9502f90000000000, 0x0000000000000000},
+      {0xba43b74000000000, 0x0000000000000000},
+      {0xe8d4a51000000000, 0x0000000000000000},
+      {0x9184e72a00000000, 0x0000000000000000},
+      {0xb5e620f480000000, 0x0000000000000000},
+      {0xe35fa931a0000000, 0x0000000000000000},
+      {0x8e1bc9bf04000000, 0x0000000000000000},
+      {0xb1a2bc2ec5000000, 0x0000000000000000},
+      {0xde0b6b3a76400000, 0x0000000000000000},
+      {0x8ac7230489e80000, 0x0000000000000000},
+      {0xad78ebc5ac620000, 0x0000000000000000},
+      {0xd8d726b7177a8000, 0x0000000000000000},
+      {0x878678326eac9000, 0x0000000000000000},
+      {0xa968163f0a57b400, 0x0000000000000000},
+      {0xd3c21bcecceda100, 0x0000000000000000},
+      {0x84595161401484a0, 0x0000000000000000},
+      {0xa56fa5b99019a5c8, 0x0000000000000000},
+      {0xcecb8f27f4200f3a, 0x0000000000000000},
+      {0x813f3978f8940984, 0x4000000000000000},
+      {0xa18f07d736b90be5, 0x5000000000000000},
+      {0xc9f2c9cd04674ede, 0xa400000000000000},
+      {0xfc6f7c4045812296, 0x4d00000000000000},
+      {0x9dc5ada82b70b59d, 0xf020000000000000},
+      {0xc5371912364ce305, 0x6c28000000000000},
+      {0xf684df56c3e01bc6, 0xc732000000000000},
+      {0x9a130b963a6c115c, 0x3c7f400000000000},
+      {0xc097ce7bc90715b3, 0x4b9f100000000000},
+      {0xf0bdc21abb48db20, 0x1e86d40000000000},
+      {0x96769950b50d88f4, 0x1314448000000000},
+      {0xbc143fa4e250eb31, 0x17d955a000000000},
+      {0xeb194f8e1ae525fd, 0x5dcfab0800000000},
+      {0x92efd1b8d0cf37be, 0x5aa1cae500000000},
+      {0xb7abc627050305ad, 0xf14a3d9e40000000},
+      {0xe596b7b0c643c719, 0x6d9ccd05d0000000},
+      {0x8f7e32ce7bea5c6f, 0xe4820023a2000000},
+      {0xb35dbf821ae4f38b, 0xdda2802c8a800000},
+      {0xe0352f62a19e306e, 0xd50b2037ad200000},
+      {0x8c213d9da502de45, 0x4526f422cc340000},
+      {0xaf298d050e4395d6, 0x9670b12b7f410000},
+      {0xdaf3f04651d47b4c, 0x3c0cdd765f114000},
+      {0x88d8762bf324cd0f, 0xa5880a69fb6ac800},
+      {0xab0e93b6efee0053, 0x8eea0d047a457a00},
+      {0xd5d238a4abe98068, 0x72a4904598d6d880},
+      {0x85a36366eb71f041, 0x47a6da2b7f864750},
+      {0xa70c3c40a64e6c51, 0x999090b65f67d924},
+      {0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d},
+      {0x82818f1281ed449f, 0xbff8f10e7a8921a4},
+      {0xa321f2d7226895c7, 0xaff72d52192b6a0d},
+      {0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490},
+      {0xfee50b7025c36a08, 0x02f236d04753d5b4},
+      {0x9f4f2726179a2245, 0x01d762422c946590},
+      {0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5},
+      {0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2},
+      {0x9b934c3b330c8577, 0x63cc55f49f88eb2f},
+      {0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb},
+      {0xf316271c7fc3908a, 0x8bef464e3945ef7a},
+      {0x97edd871cfda3a56, 0x97758bf0e3cbb5ac},
+      {0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317},
+      {0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd},
+      {0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a},
+      {0xb975d6b6ee39e436, 0xb3e2fd538e122b44},
+      {0xe7d34c64a9c85d44, 0x60dbbca87196b616},
+      {0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd},
+      {0xb51d13aea4a488dd, 0x6babab6398bdbe41},
+      {0xe264589a4dcdab14, 0xc696963c7eed2dd1},
+      {0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2},
+      {0xb0de65388cc8ada8, 0x3b25a55f43294bcb},
+      {0xdd15fe86affad912, 0x49ef0eb713f39ebe},
+      {0x8a2dbf142dfcc7ab, 0x6e3569326c784337},
+      {0xacb92ed9397bf996, 0x49c2c37f07965404},
+      {0xd7e77a8f87daf7fb, 0xdc33745ec97be906},
+      {0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3},
+      {0xa8acd7c0222311bc, 0xc40832ea0d68ce0c},
+      {0xd2d80db02aabd62b, 0xf50a3fa490c30190},
+      {0x83c7088e1aab65db, 0x792667c6da79e0fa},
+      {0xa4b8cab1a1563f52, 0x577001b891185938},
+      {0xcde6fd5e09abcf26, 0xed4c0226b55e6f86},
+      {0x80b05e5ac60b6178, 0x544f8158315b05b4},
+      {0xa0dc75f1778e39d6, 0x696361ae3db1c721},
+      {0xc913936dd571c84c, 0x03bc3a19cd1e38e9},
+      {0xfb5878494ace3a5f, 0x04ab48a04065c723},
+      {0x9d174b2dcec0e47b, 0x62eb0d64283f9c76},
+      {0xc45d1df942711d9a, 0x3ba5d0bd324f8394},
+      {0xf5746577930d6500, 0xca8f44ec7ee36479},
+      {0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb},
+      {0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e},
+      {0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e},
+      {0x95d04aee3b80ece5, 0xbba1f1d158724a12},
+      {0xbb445da9ca61281f, 0x2a8a6e45ae8edc97},
+      {0xea1575143cf97226, 0xf52d09d71a3293bd},
+      {0x924d692ca61be758, 0x593c2626705f9c56},
+      {0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c},
+      {0xe498f455c38b997a, 0x0b6dfb9c0f956447},
+      {0x8edf98b59a373fec, 0x4724bd4189bd5eac},
+      {0xb2977ee300c50fe7, 0x58edec91ec2cb657},
+      {0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed},
+      {0x8b865b215899f46c, 0xbd79e0d20082ee74},
+      {0xae67f1e9aec07187, 0xecd8590680a3aa11},
+      {0xda01ee641a708de9, 0xe80e6f4820cc9495},
+      {0x884134fe908658b2, 0x3109058d147fdcdd},
+      {0xaa51823e34a7eede, 0xbd4b46f0599fd415},
+      {0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a},
+      {0x850fadc09923329e, 0x03e2cf6bc604ddb0},
+      {0xa6539930bf6bff45, 0x84db8346b786151c},
+      {0xcfe87f7cef46ff16, 0xe612641865679a63},
+      {0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e},
+      {0xa26da3999aef7749, 0xe3be5e330f38f09d},
+      {0xcb090c8001ab551c, 0x5cadf5bfd3072cc5},
+      {0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6},
+      {0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa},
+      {0xc646d63501a1511d, 0xb281e1fd541501b8},
+      {0xf7d88bc24209a565, 0x1f225a7ca91a4226},
+      {0x9ae757596946075f, 0x3375788de9b06958},
+      {0xc1a12d2fc3978937, 0x0052d6b1641c83ae},
+      {0xf209787bb47d6b84, 0xc0678c5dbd23a49a},
+      {0x9745eb4d50ce6332, 0xf840b7ba963646e0},
+      {0xbd176620a501fbff, 0xb650e5a93bc3d898},
+      {0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe},
+      {0x93ba47c980e98cdf, 0xc66f336c36b10137},
+      {0xb8a8d9bbe123f017, 0xb80b0047445d4184},
+      {0xe6d3102ad96cec1d, 0xa60dc059157491e5},
+      {0x9043ea1ac7e41392, 0x87c89837ad68db2f},
+      {0xb454e4a179dd1877, 0x29babe4598c311fb},
+      {0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a},
+      {0x8ce2529e2734bb1d, 0x1899e4a65f58660c},
+      {0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f},
+      {0xdc21a1171d42645d, 0x76707543f4fa1f73},
+      {0x899504ae72497eba, 0x6a06494a791c53a8},
+      {0xabfa45da0edbde69, 0x0487db9d17636892},
+      {0xd6f8d7509292d603, 0x45a9d2845d3c42b6},
+      {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b2},
+      {0xa7f26836f282b732, 0x8e6cac7768d7141e},
+      {0xd1ef0244af2364ff, 0x3207d795430cd926},
+      {0x8335616aed761f1f, 0x7f44e6bd49e807b8},
+      {0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6},
+      {0xcd036837130890a1, 0x36dba887c37a8c0f},
+      {0x802221226be55a64, 0xc2494954da2c9789},
+      {0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c},
+      {0xc83553c5c8965d3d, 0x6f92829494e5acc7},
+      {0xfa42a8b73abbf48c, 0xcb772339ba1f17f9},
+      {0x9c69a97284b578d7, 0xff2a760414536efb},
+      {0xc38413cf25e2d70d, 0xfef5138519684aba},
+      {0xf46518c2ef5b8cd1, 0x7eb258665fc25d69},
+      {0x98bf2f79d5993802, 0xef2f773ffbd97a61},
+      {0xbeeefb584aff8603, 0xaafb550ffacfd8fa},
+      {0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38},
+      {0x952ab45cfa97a0b2, 0xdd945a747bf26183},
+      {0xba756174393d88df, 0x94f971119aeef9e4},
+      {0xe912b9d1478ceb17, 0x7a37cd5601aab85d},
+      {0x91abb422ccb812ee, 0xac62e055c10ab33a},
+      {0xb616a12b7fe617aa, 0x577b986b314d6009},
+      {0xe39c49765fdf9d94, 0xed5a7e85fda0b80b},
+      {0x8e41ade9fbebc27d, 0x14588f13be847307},
+      {0xb1d219647ae6b31c, 0x596eb2d8ae258fc8},
+      {0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb},
+      {0x8aec23d680043bee, 0x25de7bb9480d5854},
+      {0xada72ccc20054ae9, 0xaf561aa79a10ae6a},
+      {0xd910f7ff28069da4, 0x1b2ba1518094da04},
+      {0x87aa9aff79042286, 0x90fb44d2f05d0842},
+      {0xa99541bf57452b28, 0x353a1607ac744a53},
+      {0xd3fa922f2d1675f2, 0x42889b8997915ce8},
+      {0x847c9b5d7c2e09b7, 0x69956135febada11},
+      {0xa59bc234db398c25, 0x43fab9837e699095},
+      {0xcf02b2c21207ef2e, 0x94f967e45e03f4bb},
+      {0x8161afb94b44f57d, 0x1d1be0eebac278f5},
+      {0xa1ba1ba79e1632dc, 0x6462d92a69731732},
+      {0xca28a291859bbf93, 0x7d7b8f7503cfdcfe},
+      {0xfcb2cb35e702af78, 0x5cda735244c3d43e},
+      {0x9defbf01b061adab, 0x3a0888136afa64a7},
+      {0xc56baec21c7a1916, 0x088aaa1845b8fdd0},
+      {0xf6c69a72a3989f5b, 0x8aad549e57273d45},
+      {0x9a3c2087a63f6399, 0x36ac54e2f678864b},
+      {0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd},
+      {0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5},
+      {0x969eb7c47859e743, 0x9f644ae5a4b1b325},
+      {0xbc4665b596706114, 0x873d5d9f0dde1fee},
+      {0xeb57ff22fc0c7959, 0xa90cb506d155a7ea},
+      {0x9316ff75dd87cbd8, 0x09a7f12442d588f2},
+      {0xb7dcbf5354e9bece, 0x0c11ed6d538aeb2f},
+      {0xe5d3ef282a242e81, 0x8f1668c8a86da5fa},
+      {0x8fa475791a569d10, 0xf96e017d694487bc},
+      {0xb38d92d760ec4455, 0x37c981dcc395a9ac},
+      {0xe070f78d3927556a, 0x85bbe253f47b1417},
+      {0x8c469ab843b89562, 0x93956d7478ccec8e},
+      {0xaf58416654a6babb, 0x387ac8d1970027b2},
+      {0xdb2e51bfe9d0696a, 0x06997b05fcc0319e},
+      {0x88fcf317f22241e2, 0x441fece3bdf81f03},
+      {0xab3c2fddeeaad25a, 0xd527e81cad7626c3},
+      {0xd60b3bd56a5586f1, 0x8a71e223d8d3b074},
+      {0x85c7056562757456, 0xf6872d5667844e49},
+      {0xa738c6bebb12d16c, 0xb428f8ac016561db},
+      {0xd106f86e69d785c7, 0xe13336d701beba52},
+      {0x82a45b450226b39c, 0xecc0024661173473},
+      {0xa34d721642b06084, 0x27f002d7f95d0190},
+      {0xcc20ce9bd35c78a5, 0x31ec038df7b441f4},
+      {0xff290242c83396ce, 0x7e67047175a15271},
+      {0x9f79a169bd203e41, 0x0f0062c6e984d386},
+      {0xc75809c42c684dd1, 0x52c07b78a3e60868},
+      {0xf92e0c3537826145, 0xa7709a56ccdf8a82},
+      {0x9bbcc7a142b17ccb, 0x88a66076400bb691},
+      {0xc2abf989935ddbfe, 0x6acff893d00ea435},
+      {0xf356f7ebf83552fe, 0x0583f6b8c4124d43},
+      {0x98165af37b2153de, 0xc3727a337a8b704a},
+      {0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c},
+      {0xeda2ee1c7064130c, 0x1162def06f79df73},
+      {0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8},
+      {0xb9a74a0637ce2ee1, 0x6d953e2bd7173692},
+      {0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437},
+      {0x910ab1d4db9914a0, 0x1d9c9892400a22a2},
+      {0xb54d5e4a127f59c8, 0x2503beb6d00cab4b},
+      {0xe2a0b5dc971f303a, 0x2e44ae64840fd61d},
+      {0x8da471a9de737e24, 0x5ceaecfed289e5d2},
+      {0xb10d8e1456105dad, 0x7425a83e872c5f47},
+      {0xdd50f1996b947518, 0xd12f124e28f77719},
+      {0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f},
+      {0xace73cbfdc0bfb7b, 0x636cc64d1001550b},
+      {0xd8210befd30efa5a, 0x3c47f7e05401aa4e},
+      {0x8714a775e3e95c78, 0x65acfaec34810a71},
+      {0xa8d9d1535ce3b396, 0x7f1839a741a14d0d},
+      {0xd31045a8341ca07c, 0x1ede48111209a050},
+      {0x83ea2b892091e44d, 0x934aed0aab460432},
+      {0xa4e4b66b68b65d60, 0xf81da84d5617853f},
+      {0xce1de40642e3f4b9, 0x36251260ab9d668e},
+      {0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019},
+      {0xa1075a24e4421730, 0xb24cf65b8612f81f},
+      {0xc94930ae1d529cfc, 0xdee033f26797b627},
+      {0xfb9b7cd9a4a7443c, 0x169840ef017da3b1},
+      {0x9d412e0806e88aa5, 0x8e1f289560ee864e},
+      {0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2},
+      {0xf5b5d7ec8acb58a2, 0xae10af696774b1db},
+      {0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29},
+      {0xbff610b0cc6edd3f, 0x17fd090a58d32af3},
+      {0xeff394dcff8a948e, 0xddfc4b4cef07f5b0},
+      {0x95f83d0a1fb69cd9, 0x4abdaf101564f98e},
+      {0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1},
+      {0xea53df5fd18d5513, 0x84c86189216dc5ed},
+      {0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4},
+      {0xb7118682dbb66a77, 0x3fbc8c33221dc2a1},
+      {0xe4d5e82392a40515, 0x0fabaf3feaa5334a},
+      {0x8f05b1163ba6832d, 0x29cb4d87f2a7400e},
+      {0xb2c71d5bca9023f8, 0x743e20e9ef511012},
+      {0xdf78e4b2bd342cf6, 0x914da9246b255416},
+      {0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e},
+      {0xae9672aba3d0c320, 0xa184ac2473b529b1},
+      {0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e},
+      {0x8865899617fb1871, 0x7e2fa67c7a658892},
+      {0xaa7eebfb9df9de8d, 0xddbb901b98feeab7},
+      {0xd51ea6fa85785631, 0x552a74227f3ea565},
+      {0x8533285c936b35de, 0xd53a88958f87275f},
+      {0xa67ff273b8460356, 0x8a892abaf368f137},
+      {0xd01fef10a657842c, 0x2d2b7569b0432d85},
+      {0x8213f56a67f6b29b, 0x9c3b29620e29fc73},
+      {0xa298f2c501f45f42, 0x8349f3ba91b47b8f},
+      {0xcb3f2f7642717713, 0x241c70a936219a73},
+      {0xfe0efb53d30dd4d7, 0xed238cd383aa0110},
+      {0x9ec95d1463e8a506, 0xf4363804324a40aa},
+      {0xc67bb4597ce2ce48, 0xb143c6053edcd0d5},
+      {0xf81aa16fdc1b81da, 0xdd94b7868e94050a},
+      {0x9b10a4e5e9913128, 0xca7cf2b4191c8326},
+      {0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0},
+      {0xf24a01a73cf2dccf, 0xbc633b39673c8cec},
+      {0x976e41088617ca01, 0xd5be0503e085d813},
+      {0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18},
+      {0xec9c459d51852ba2, 0xddf8e7d60ed1219e},
+      {0x93e1ab8252f33b45, 0xcabb90e5c942b503},
+      {0xb8da1662e7b00a17, 0x3d6a751f3b936243},
+      {0xe7109bfba19c0c9d, 0x0cc512670a783ad4},
+      {0x906a617d450187e2, 0x27fb2b80668b24c5},
+      {0xb484f9dc9641e9da, 0xb1f9f660802dedf6},
+      {0xe1a63853bbd26451, 0x5e7873f8a0396973},
+      {0x8d07e33455637eb2, 0xdb0b487b6423e1e8},
+      {0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62},
+      {0xdc5c5301c56b75f7, 0x7641a140cc7810fb},
+      {0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d},
+      {0xac2820d9623bf429, 0x546345fa9fbdcd44},
+      {0xd732290fbacaf133, 0xa97c177947ad4095},
+      {0x867f59a9d4bed6c0, 0x49ed8eabcccc485d},
+      {0xa81f301449ee8c70, 0x5c68f256bfff5a74},
+      {0xd226fc195c6a2f8c, 0x73832eec6fff3111},
+      {0x83585d8fd9c25db7, 0xc831fd53c5ff7eab},
+      {0xa42e74f3d032f525, 0xba3e7ca8b77f5e55},
+      {0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb},
+      {0x80444b5e7aa7cf85, 0x7980d163cf5b81b3},
+      {0xa0555e361951c366, 0xd7e105bcc332621f},
+      {0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7},
+      {0xfa856334878fc150, 0xb14f98f6f0feb951},
+      {0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3},
+      {0xc3b8358109e84f07, 0x0a862f80ec4700c8},
+      {0xf4a642e14c6262c8, 0xcd27bb612758c0fa},
+      {0x98e7e9cccfbd7dbd, 0x8038d51cb897789c},
+      {0xbf21e44003acdd2c, 0xe0470a63e6bd56c3},
+      {0xeeea5d5004981478, 0x1858ccfce06cac74},
+      {0x95527a5202df0ccb, 0x0f37801e0c43ebc8},
+      {0xbaa718e68396cffd, 0xd30560258f54e6ba},
+      {0xe950df20247c83fd, 0x47c6b82ef32a2069},
+      {0x91d28b7416cdd27e, 0x4cdc331d57fa5441},
+      {0xb6472e511c81471d, 0xe0133fe4adf8e952},
+      {0xe3d8f9e563a198e5, 0x58180fddd97723a6},
+      {0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648},
+      {0xb201833b35d63f73, 0x2cd2cc6551e513da},
+      {0xde81e40a034bcf4f, 0xf8077f7ea65e58d1},
+      {0x8b112e86420f6191, 0xfb04afaf27faf782},
+      {0xadd57a27d29339f6, 0x79c5db9af1f9b563},
+      {0xd94ad8b1c7380874, 0x18375281ae7822bc},
+      {0x87cec76f1c830548, 0x8f2293910d0b15b5},
+      {0xa9c2794ae3a3c69a, 0xb2eb3875504ddb22},
+      {0xd433179d9c8cb841, 0x5fa60692a46151eb},
+      {0x849feec281d7f328, 0xdbc7c41ba6bcd333},
+      {0xa5c7ea73224deff3, 0x12b9b522906c0800},
+      {0xcf39e50feae16bef, 0xd768226b34870a00},
+      {0x81842f29f2cce375, 0xe6a1158300d46640},
+      {0xa1e53af46f801c53, 0x60495ae3c1097fd0},
+      {0xca5e89b18b602368, 0x385bb19cb14bdfc4},
+      {0xfcf62c1dee382c42, 0x46729e03dd9ed7b5},
+      {0x9e19db92b4e31ba9, 0x6c07a2c26a8346d1},
+      {0xc5a05277621be293, 0xc7098b7305241885},
+      { 0xf70867153aa2db38,
+        0xb8cbee4fc66d1ea7 }
 #else
+      {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
+      {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
+      {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f},
+      {0x86a8d39ef77164bc, 0xae5dff9c02033198},
+      {0xd98ddaee19068c76, 0x3badd624dd9b0958},
+      {0xafbd2350644eeacf, 0xe5d1929ef90898fb},
+      {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2},
+      {0xe55990879ddcaabd, 0xcc420a6a101d0516},
+      {0xb94470938fa89bce, 0xf808e40e8d5b3e6a},
+      {0x95a8637627989aad, 0xdde7001379a44aa9},
+      {0xf1c90080baf72cb1, 0x5324c68b12dd6339},
+      {0xc350000000000000, 0x0000000000000000},
+      {0x9dc5ada82b70b59d, 0xf020000000000000},
+      {0xfee50b7025c36a08, 0x02f236d04753d5b4},
+      {0xcde6fd5e09abcf26, 0xed4c0226b55e6f86},
+      {0xa6539930bf6bff45, 0x84db8346b786151c},
+      {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b2},
+      {0xd910f7ff28069da4, 0x1b2ba1518094da04},
+      {0xaf58416654a6babb, 0x387ac8d1970027b2},
+      {0x8da471a9de737e24, 0x5ceaecfed289e5d2},
+      {0xe4d5e82392a40515, 0x0fabaf3feaa5334a},
+      {0xb8da1662e7b00a17, 0x3d6a751f3b936243},
+      { 0x95527a5202df0ccb,
+        0x0f37801e0c43ebc8 }
+#endif
+    };
+
+#if FMT_USE_FULL_CACHE_DRAGONBOX
+    return pow10_significands[k - float_info<double>::min_k];
+#else
+    static constexpr const uint64_t powers_of_5_64[] = {
+        0x0000000000000001, 0x0000000000000005, 0x0000000000000019,
+        0x000000000000007d, 0x0000000000000271, 0x0000000000000c35,
+        0x0000000000003d09, 0x000000000001312d, 0x000000000005f5e1,
+        0x00000000001dcd65, 0x00000000009502f9, 0x0000000002e90edd,
+        0x000000000e8d4a51, 0x0000000048c27395, 0x000000016bcc41e9,
+        0x000000071afd498d, 0x0000002386f26fc1, 0x000000b1a2bc2ec5,
+        0x000003782dace9d9, 0x00001158e460913d, 0x000056bc75e2d631,
+        0x0001b1ae4d6e2ef5, 0x000878678326eac9, 0x002a5a058fc295ed,
+        0x00d3c21bcecceda1, 0x0422ca8b0a00a425, 0x14adf4b7320334b9};
+
+    static constexpr const uint32_t pow10_recovery_errors[] = {
+        0x50001400, 0x54044100, 0x54014555, 0x55954415, 0x54115555, 0x00000001,
+        0x50000000, 0x00104000, 0x54010004, 0x05004001, 0x55555544, 0x41545555,
+        0x54040551, 0x15445545, 0x51555514, 0x10000015, 0x00101100, 0x01100015,
+        0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x04450514, 0x45414110,
+        0x55555145, 0x50544050, 0x15040155, 0x11054140, 0x50111514, 0x11451454,
+        0x00400541, 0x00000000, 0x55555450, 0x10056551, 0x10054011, 0x55551014,
+        0x69514555, 0x05151109, 0x00155555};
+
     static const int compression_ratio = 27;
 
     // Compute base index.
@@ -1897,8 +1833,7 @@ template <> struct cache_accessor<double> {
     int offset = k - kb;
 
     // Get base cache.
-    uint128_wrapper base_cache =
-        data::dragonbox_pow10_significands_128[cache_index];
+    uint128_wrapper base_cache = pow10_significands[cache_index];
     if (offset == 0) return base_cache;
 
     // Compute the required amount of bit-shift.
@@ -1906,7 +1841,7 @@ template <> struct cache_accessor<double> {
     FMT_ASSERT(alpha > 0 && alpha < 64, "shifting error detected");
 
     // Try to recover the real cache.
-    uint64_t pow5 = data::powers_of_5_64[offset];
+    uint64_t pow5 = powers_of_5_64[offset];
     uint128_wrapper recovered_cache = umul128(base_cache.high(), pow5);
     uint128_wrapper middle_low =
         umul128(base_cache.low() - (kb < 0 ? 1u : 0u), pow5);
@@ -1924,7 +1859,7 @@ template <> struct cache_accessor<double> {
 
     // Get error.
     int error_idx = (k - float_info<double>::min_k) / 16;
-    uint32_t error = (data::dragonbox_pow10_recovery_errors[error_idx] >>
+    uint32_t error = (pow10_recovery_errors[error_idx] >>
                       ((k - float_info<double>::min_k) % 16) * 2) &
                      0x3;
 
@@ -2010,7 +1945,7 @@ bool is_center_integer(typename float_info<T>::carrier_uint two_f, int exponent,
 }
 
 // Remove trailing zeros from n and return the number of zeros removed (float)
-FMT_ALWAYS_INLINE int remove_trailing_zeros(uint32_t& n) FMT_NOEXCEPT {
+FMT_INLINE int remove_trailing_zeros(uint32_t& n) FMT_NOEXCEPT {
 #ifdef FMT_BUILTIN_CTZ
   int t = FMT_BUILTIN_CTZ(n);
 #else
@@ -2038,7 +1973,7 @@ FMT_ALWAYS_INLINE int remove_trailing_zeros(uint32_t& n) FMT_NOEXCEPT {
 }
 
 // Removes trailing zeros and returns the number of zeros removed (double)
-FMT_ALWAYS_INLINE int remove_trailing_zeros(uint64_t& n) FMT_NOEXCEPT {
+FMT_INLINE int remove_trailing_zeros(uint64_t& n) FMT_NOEXCEPT {
 #ifdef FMT_BUILTIN_CTZLL
   int t = FMT_BUILTIN_CTZLL(n);
 #else
@@ -2124,8 +2059,7 @@ FMT_ALWAYS_INLINE int remove_trailing_zeros(uint64_t& n) FMT_NOEXCEPT {
 
 // The main algorithm for shorter interval case
 template <class T>
-FMT_ALWAYS_INLINE FMT_SAFEBUFFERS decimal_fp<T> shorter_interval_case(
-    int exponent) FMT_NOEXCEPT {
+FMT_INLINE decimal_fp<T> shorter_interval_case(int exponent) FMT_NOEXCEPT {
   decimal_fp<T> ret_value;
   // Compute k and beta
   const int minus_k = floor_log10_pow2_minus_log10_4_over_3(exponent);
@@ -2171,8 +2105,7 @@ FMT_ALWAYS_INLINE FMT_SAFEBUFFERS decimal_fp<T> shorter_interval_case(
   return ret_value;
 }
 
-template <typename T>
-FMT_SAFEBUFFERS decimal_fp<T> to_decimal(T x) FMT_NOEXCEPT {
+template <typename T> decimal_fp<T> to_decimal(T x) FMT_NOEXCEPT {
   // Step 1: integer promotion & Schubfach multiplier calculation.
 
   using carrier_uint = typename float_info<T>::carrier_uint;
@@ -2306,24 +2239,21 @@ small_divisor_case_label:
 }
 }  // namespace dragonbox
 
-// Formats value using a variation of the Fixed-Precision Positive
-// Floating-Point Printout ((FPP)^2) algorithm by Steele & White:
-// https://fmt.dev/p372-steele.pdf.
-template <typename Double>
-void fallback_format(Double d, int num_digits, bool binary32, buffer<char>& buf,
-                     int& exp10) {
+// Formats a floating-point number using a variation of the Fixed-Precision
+// Positive Floating-Point Printout ((FPP)^2) algorithm by Steele & White:
+// https://fmt.dev/papers/p372-steele.pdf.
+FMT_CONSTEXPR20 inline void format_dragon(fp value, bool is_predecessor_closer,
+                                          int num_digits, buffer<char>& buf,
+                                          int& exp10) {
   bigint numerator;    // 2 * R in (FPP)^2.
   bigint denominator;  // 2 * S in (FPP)^2.
   // lower and upper are differences between value and corresponding boundaries.
   bigint lower;             // (M^- in (FPP)^2).
   bigint upper_store;       // upper's value if different from lower.
   bigint* upper = nullptr;  // (M^+ in (FPP)^2).
-  fp value;
   // Shift numerator and denominator by an extra bit or two (if lower boundary
   // is closer) to make lower and upper integers. This eliminates multiplication
   // by 2 during later computations.
-  const bool is_predecessor_closer =
-      binary32 ? value.assign(static_cast<float>(d)) : value.assign(d);
   int shift = is_predecessor_closer ? 2 : 1;
   uint64_t significand = value.f << shift;
   if (value.e >= 0) {
@@ -2393,9 +2323,9 @@ void fallback_format(Double d, int num_digits, bool binary32, buffer<char>& buf,
   // Generate the given number of digits.
   exp10 -= num_digits - 1;
   if (num_digits == 0) {
-    buf.try_resize(1);
     denominator *= 10;
-    buf[0] = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
+    auto digit = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
+    buf.push_back(digit);
     return;
   }
   buf.try_resize(to_unsigned(num_digits));
@@ -2426,9 +2356,12 @@ void fallback_format(Double d, int num_digits, bool binary32, buffer<char>& buf,
   buf[num_digits - 1] = static_cast<char>('0' + digit);
 }
 
-template <typename T>
-int format_float(T value, int precision, float_specs specs, buffer<char>& buf) {
-  static_assert(!std::is_same<T, float>::value, "");
+template <typename Float>
+FMT_HEADER_ONLY_CONSTEXPR20 int format_float(Float value, int precision,
+                                             float_specs specs,
+                                             buffer<char>& buf) {
+  // float is passed as double to reduce the number of instantiations.
+  static_assert(!std::is_same<Float, float>::value, "");
   FMT_ASSERT(value >= 0, "value is negative");
 
   const bool fixed = specs.format == float_format::fixed;
@@ -2438,13 +2371,13 @@ int format_float(T value, int precision, float_specs specs, buffer<char>& buf) {
       return 0;
     }
     buf.try_resize(to_unsigned(precision));
-    std::uninitialized_fill_n(buf.data(), precision, '0');
+    fill_n(buf.data(), precision, '0');
     return -precision;
   }
 
-  if (!specs.use_grisu) return snprintf_float(value, precision, specs, buf);
+  if (specs.fallback) return snprintf_float(value, precision, specs, buf);
 
-  if (precision < 0) {
+  if (!is_constant_evaluated() && precision < 0) {
     // Use Dragonbox for the shortest format.
     if (specs.binary32) {
       auto dec = dragonbox::to_decimal(static_cast<float>(value));
@@ -2456,26 +2389,37 @@ int format_float(T value, int precision, float_specs specs, buffer<char>& buf) {
     return dec.exponent;
   }
 
-  // Use Grisu + Dragon4 for the given precision:
-  // https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf.
   int exp = 0;
-  const int min_exp = -60;  // alpha in Grisu.
-  int cached_exp10 = 0;     // K in Grisu.
-  fp normalized = normalize(fp(value));
-  const auto cached_pow = get_cached_power(
-      min_exp - (normalized.e + fp::significand_size), cached_exp10);
-  normalized = normalized * cached_pow;
-  // Limit precision to the maximum possible number of significant digits in an
-  // IEEE754 double because we don't need to generate zeros.
-  const int max_double_digits = 767;
-  if (precision > max_double_digits) precision = max_double_digits;
-  fixed_handler handler{buf.data(), 0, precision, -cached_exp10, fixed};
-  if (grisu_gen_digits(normalized, 1, exp, handler) == digits::error) {
-    exp += handler.size - cached_exp10 - 1;
-    fallback_format(value, handler.precision, specs.binary32, buf, exp);
-  } else {
-    exp += handler.exp10;
-    buf.try_resize(to_unsigned(handler.size));
+  bool use_dragon = true;
+  if (is_fast_float<Float>()) {
+    // Use Grisu + Dragon4 for the given precision:
+    // https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf.
+    const int min_exp = -60;  // alpha in Grisu.
+    int cached_exp10 = 0;     // K in Grisu.
+    fp normalized = normalize(fp(value));
+    const auto cached_pow = get_cached_power(
+        min_exp - (normalized.e + fp::num_significand_bits), cached_exp10);
+    normalized = normalized * cached_pow;
+    gen_digits_handler handler{buf.data(), 0, precision, -cached_exp10, fixed};
+    if (grisu_gen_digits(normalized, 1, exp, handler) != digits::error &&
+        !is_constant_evaluated()) {
+      exp += handler.exp10;
+      buf.try_resize(to_unsigned(handler.size));
+      use_dragon = false;
+    } else {
+      exp += handler.size - cached_exp10 - 1;
+      precision = handler.precision;
+    }
+  }
+  if (use_dragon) {
+    auto f = fp();
+    bool is_predecessor_closer =
+        specs.binary32 ? f.assign(static_cast<float>(value)) : f.assign(value);
+    // Limit precision to the maximum possible number of significant digits in
+    // an IEEE754 double because we don't need to generate zeros.
+    const int max_double_digits = 767;
+    if (precision > max_double_digits) precision = max_double_digits;
+    format_dragon(f, is_predecessor_closer, precision, buf, exp);
   }
   if (!fixed && !specs.showpoint) {
     // Remove trailing zeros.
@@ -2487,7 +2431,7 @@ int format_float(T value, int precision, float_specs specs, buffer<char>& buf) {
     buf.try_resize(num_digits);
   }
   return exp;
-}  // namespace detail
+}
 
 template <typename T>
 int snprintf_float(T value, int precision, float_specs specs,
@@ -2571,11 +2515,11 @@ int snprintf_float(T value, int precision, float_specs specs,
       --exp_pos;
     } while (*exp_pos != 'e');
     char sign = exp_pos[1];
-    assert(sign == '+' || sign == '-');
+    FMT_ASSERT(sign == '+' || sign == '-', "");
     int exp = 0;
     auto p = exp_pos + 2;  // Skip 'e' and sign.
     do {
-      assert(is_digit(*p));
+      FMT_ASSERT(is_digit(*p), "");
       exp = exp * 10 + (*p++ - '0');
     } while (p != end);
     if (sign == '-') exp = -exp;
@@ -2592,71 +2536,11 @@ int snprintf_float(T value, int precision, float_specs specs,
     return exp - fraction_size;
   }
 }
-
-// A public domain branchless UTF-8 decoder by Christopher Wellons:
-// https://github.com/skeeto/branchless-utf8
-/* Decode the next character, c, from buf, reporting errors in e.
- *
- * Since this is a branchless decoder, four bytes will be read from the
- * buffer regardless of the actual length of the next character. This
- * means the buffer _must_ have at least three bytes of zero padding
- * following the end of the data stream.
- *
- * Errors are reported in e, which will be non-zero if the parsed
- * character was somehow invalid: invalid byte sequence, non-canonical
- * encoding, or a surrogate half.
- *
- * The function returns a pointer to the next character. When an error
- * occurs, this pointer will be a guess that depends on the particular
- * error, but it will always advance at least one byte.
- */
-inline const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
-  static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
-  static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
-  static const int shiftc[] = {0, 18, 12, 6, 0};
-  static const int shifte[] = {0, 6, 4, 2, 0};
-
-  int len = code_point_length(buf);
-  const char* next = buf + len;
-
-  // Assume a four-byte character and load four bytes. Unused bits are
-  // shifted out.
-  auto s = reinterpret_cast<const unsigned char*>(buf);
-  *c = uint32_t(s[0] & masks[len]) << 18;
-  *c |= uint32_t(s[1] & 0x3f) << 12;
-  *c |= uint32_t(s[2] & 0x3f) << 6;
-  *c |= uint32_t(s[3] & 0x3f) << 0;
-  *c >>= shiftc[len];
-
-  // Accumulate the various error conditions.
-  *e = (*c < mins[len]) << 6;       // non-canonical encoding
-  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
-  *e |= (*c > 0x10FFFF) << 8;       // out of range?
-  *e |= (s[1] & 0xc0) >> 2;
-  *e |= (s[2] & 0xc0) >> 4;
-  *e |= (s[3]) >> 6;
-  *e ^= 0x2a;  // top two bits of each tail byte correct?
-  *e >>= shifte[len];
-
-  return next;
-}
-
-struct stringifier {
-  template <typename T> FMT_INLINE std::string operator()(T value) const {
-    return to_string(value);
-  }
-  std::string operator()(basic_format_arg<format_context>::handle h) const {
-    memory_buffer buf;
-    format_parse_context parse_ctx({});
-    format_context format_ctx(buffer_appender<char>(buf), {}, {});
-    h.format(parse_ctx, format_ctx);
-    return to_string(buf);
-  }
-};
 }  // namespace detail
 
 template <> struct formatter<detail::bigint> {
-  format_parse_context::iterator parse(format_parse_context& ctx) {
+  FMT_CONSTEXPR format_parse_context::iterator parse(
+      format_parse_context& ctx) {
     return ctx.begin();
   }
 
@@ -2667,24 +2551,22 @@ template <> struct formatter<detail::bigint> {
     for (auto i = n.bigits_.size(); i > 0; --i) {
       auto value = n.bigits_[i - 1u];
       if (first) {
-        out = format_to(out, "{:x}", value);
+        out = format_to(out, FMT_STRING("{:x}"), value);
         first = false;
         continue;
       }
-      out = format_to(out, "{:08x}", value);
+      out = format_to(out, FMT_STRING("{:08x}"), value);
     }
     if (n.exp_ > 0)
-      out = format_to(out, "p{}", n.exp_ * detail::bigint::bigit_bits);
+      out = format_to(out, FMT_STRING("p{}"),
+                      n.exp_ * detail::bigint::bigit_bits);
     return out;
   }
 };
 
 FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
-  auto transcode = [this](const char* p) {
-    auto cp = uint32_t();
-    auto error = 0;
-    p = utf8_decode(p, &cp, &error);
-    if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
+  for_each_codepoint(s, [this](uint32_t cp, string_view) {
+    if (cp == invalid_code_point) FMT_THROW(std::runtime_error("invalid utf8"));
     if (cp <= 0xFFFF) {
       buffer_.push_back(static_cast<wchar_t>(cp));
     } else {
@@ -2692,64 +2574,38 @@ FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
       buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
       buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
     }
-    return p;
-  };
-  auto p = s.data();
-  const size_t block_size = 4;  // utf8_decode always reads blocks of 4 chars.
-  if (s.size() >= block_size) {
-    for (auto end = p + s.size() - block_size + 1; p < end;) p = transcode(p);
-  }
-  if (auto num_chars_left = s.data() + s.size() - p) {
-    char buf[2 * block_size - 1] = {};
-    memcpy(buf, p, to_unsigned(num_chars_left));
-    p = buf;
-    do {
-      p = transcode(p);
-    } while (p - buf < num_chars_left);
-  }
+    return true;
+  });
   buffer_.push_back(0);
 }
 
 FMT_FUNC void format_system_error(detail::buffer<char>& out, int error_code,
-                                  string_view message) FMT_NOEXCEPT {
+                                  const char* message) FMT_NOEXCEPT {
   FMT_TRY {
-    memory_buffer buf;
-    buf.resize(inline_buffer_size);
-    for (;;) {
-      char* system_message = &buf[0];
-      int result =
-          detail::safe_strerror(error_code, system_message, buf.size());
-      if (result == 0) {
-        format_to(detail::buffer_appender<char>(out), "{}: {}", message,
-                  system_message);
-        return;
-      }
-      if (result != ERANGE)
-        break;  // Can't get error message, report error code instead.
-      buf.resize(buf.size() * 2);
-    }
+    auto ec = std::error_code(error_code, std::generic_category());
+    write(std::back_inserter(out), std::system_error(ec, message).what());
+    return;
   }
   FMT_CATCH(...) {}
   format_error_code(out, error_code, message);
 }
 
-FMT_FUNC void detail::error_handler::on_error(const char* message) {
-  FMT_THROW(format_error(message));
-}
-
 FMT_FUNC void report_system_error(int error_code,
-                                  fmt::string_view message) FMT_NOEXCEPT {
+                                  const char* message) FMT_NOEXCEPT {
   report_error(format_system_error, error_code, message);
 }
 
-FMT_FUNC std::string detail::vformat(string_view format_str, format_args args) {
-  if (format_str.size() == 2 && equal2(format_str.data(), "{}")) {
-    auto arg = args.get(0);
-    if (!arg) error_handler().on_error("argument not found");
-    return visit_format_arg(stringifier(), arg);
-  }
-  memory_buffer buffer;
-  detail::vformat_to(buffer, format_str, args);
+// DEPRECATED!
+// This function is defined here and not inline for ABI compatiblity.
+FMT_FUNC void detail::error_handler::on_error(const char* message) {
+  throw_format_error(message);
+}
+
+FMT_FUNC std::string vformat(string_view fmt, format_args args) {
+  // Don't optimize the "{}" case to keep the binary size small and because it
+  // can be better optimized in fmt::format anyway.
+  auto buffer = memory_buffer();
+  detail::vformat_to(buffer, fmt, args);
   return to_string(buffer);
 }
 
@@ -2761,24 +2617,30 @@ extern "C" __declspec(dllimport) int __stdcall WriteConsoleW(  //
 }  // namespace detail
 #endif
 
-FMT_FUNC void vprint(std::FILE* f, string_view format_str, format_args args) {
-  memory_buffer buffer;
-  detail::vformat_to(buffer, format_str,
-                     basic_format_args<buffer_context<char>>(args));
+namespace detail {
+FMT_FUNC void print(std::FILE* f, string_view text) {
 #ifdef _WIN32
   auto fd = _fileno(f);
   if (_isatty(fd)) {
-    detail::utf8_to_utf16 u16(string_view(buffer.data(), buffer.size()));
+    detail::utf8_to_utf16 u16(string_view(text.data(), text.size()));
     auto written = detail::dword();
-    if (!detail::WriteConsoleW(reinterpret_cast<void*>(_get_osfhandle(fd)),
-                               u16.c_str(), static_cast<uint32_t>(u16.size()),
-                               &written, nullptr)) {
-      FMT_THROW(format_error("failed to write to console"));
+    if (detail::WriteConsoleW(reinterpret_cast<void*>(_get_osfhandle(fd)),
+                              u16.c_str(), static_cast<uint32_t>(u16.size()),
+                              &written, nullptr)) {
+      return;
     }
-    return;
+    // Fallback to fwrite on failure. It can happen if the output has been
+    // redirected to NUL.
   }
 #endif
-  detail::fwrite_fully(buffer.data(), 1, buffer.size(), f);
+  detail::fwrite_fully(text.data(), 1, text.size(), f);
+}
+}  // namespace detail
+
+FMT_FUNC void vprint(std::FILE* f, string_view format_str, format_args args) {
+  memory_buffer buffer;
+  detail::vformat_to(buffer, format_str, args);
+  detail::print(f, {buffer.data(), buffer.size()});
 }
 
 #ifdef _WIN32
diff --git a/src/fmt/format.h b/src/fmt/format.h
index 1a037b02b7..ee69651ca5 100644
--- a/src/fmt/format.h
+++ b/src/fmt/format.h
@@ -33,22 +33,24 @@
 #ifndef FMT_FORMAT_H_
 #define FMT_FORMAT_H_
 
-#include <algorithm>
-#include <cerrno>
-#include <cmath>
-#include <cstdint>
-#include <limits>
-#include <memory>
-#include <stdexcept>
+#include <cmath>         // std::signbit
+#include <cstdint>       // uint32_t
+#include <limits>        // std::numeric_limits
+#include <memory>        // std::uninitialized_copy
+#include <stdexcept>     // std::runtime_error
+#include <system_error>  // std::system_error
+#include <utility>       // std::swap
+
+#ifdef __cpp_lib_bit_cast
+#  include <bit>  // std::bitcast
+#endif
 
 #include "core.h"
 
-#ifdef __INTEL_COMPILER
-#  define FMT_ICC_VERSION __INTEL_COMPILER
-#elif defined(__ICL)
-#  define FMT_ICC_VERSION __ICL
+#if FMT_GCC_VERSION
+#  define FMT_GCC_VISIBILITY_HIDDEN __attribute__((visibility("hidden")))
 #else
-#  define FMT_ICC_VERSION 0
+#  define FMT_GCC_VISIBILITY_HIDDEN
 #endif
 
 #ifdef __NVCC__
@@ -69,30 +71,10 @@
 #  define FMT_NOINLINE
 #endif
 
-#if __cplusplus == 201103L || __cplusplus == 201402L
-#  if defined(__INTEL_COMPILER) || defined(__PGI)
-#    define FMT_FALLTHROUGH
-#  elif defined(__clang__)
-#    define FMT_FALLTHROUGH [[clang::fallthrough]]
-#  elif FMT_GCC_VERSION >= 700 && \
-      (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520)
-#    define FMT_FALLTHROUGH [[gnu::fallthrough]]
-#  else
-#    define FMT_FALLTHROUGH
-#  endif
-#elif FMT_HAS_CPP17_ATTRIBUTE(fallthrough) || \
-    (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
-#  define FMT_FALLTHROUGH [[fallthrough]]
+#if FMT_MSC_VER
+#  define FMT_MSC_DEFAULT = default
 #else
-#  define FMT_FALLTHROUGH
-#endif
-
-#ifndef FMT_MAYBE_UNUSED
-#  if FMT_HAS_CPP17_ATTRIBUTE(maybe_unused)
-#    define FMT_MAYBE_UNUSED [[maybe_unused]]
-#  else
-#    define FMT_MAYBE_UNUSED
-#  endif
+#  define FMT_MSC_DEFAULT
 #endif
 
 #ifndef FMT_THROW
@@ -113,10 +95,9 @@ FMT_END_NAMESPACE
 #      define FMT_THROW(x) throw x
 #    endif
 #  else
-#    define FMT_THROW(x)              \
-      do {                            \
-        static_cast<void>(sizeof(x)); \
-        FMT_ASSERT(false, "");        \
+#    define FMT_THROW(x)               \
+      do {                             \
+        FMT_ASSERT(false, (x).what()); \
       } while (false)
 #  endif
 #endif
@@ -129,6 +110,21 @@ FMT_END_NAMESPACE
 #  define FMT_CATCH(x) if (false)
 #endif
 
+#ifndef FMT_MAYBE_UNUSED
+#  if FMT_HAS_CPP17_ATTRIBUTE(maybe_unused)
+#    define FMT_MAYBE_UNUSED [[maybe_unused]]
+#  else
+#    define FMT_MAYBE_UNUSED
+#  endif
+#endif
+
+// Workaround broken [[deprecated]] in the Intel, PGI and NVCC compilers.
+#if FMT_ICC_VERSION || defined(__PGI) || FMT_NVCC
+#  define FMT_DEPRECATED_ALIAS
+#else
+#  define FMT_DEPRECATED_ALIAS FMT_DEPRECATED
+#endif
+
 #ifndef FMT_USE_USER_DEFINED_LITERALS
 // EDG based compilers (Intel, NVIDIA, Elbrus, etc), GCC and MSVC support UDLs.
 #  if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 407 || \
@@ -140,53 +136,34 @@ FMT_END_NAMESPACE
 #  endif
 #endif
 
-#ifndef FMT_USE_UDL_TEMPLATE
-// EDG frontend based compilers (icc, nvcc, PGI, etc) and GCC < 6.4 do not
-// properly support UDL templates and GCC >= 9 warns about them.
-#  if FMT_USE_USER_DEFINED_LITERALS &&                         \
-      (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 501) && \
-      ((FMT_GCC_VERSION >= 604 && __cplusplus >= 201402L) ||   \
-       FMT_CLANG_VERSION >= 304) &&                            \
-      !defined(__PGI) && !defined(__NVCC__)
-#    define FMT_USE_UDL_TEMPLATE 1
-#  else
-#    define FMT_USE_UDL_TEMPLATE 0
-#  endif
-#endif
-
-#ifndef FMT_USE_FLOAT
-#  define FMT_USE_FLOAT 1
-#endif
-
-#ifndef FMT_USE_DOUBLE
-#  define FMT_USE_DOUBLE 1
-#endif
-
-#ifndef FMT_USE_LONG_DOUBLE
-#  define FMT_USE_LONG_DOUBLE 1
-#endif
-
 // Defining FMT_REDUCE_INT_INSTANTIATIONS to 1, will reduce the number of
-// int_writer template instances to just one by only using the largest integer
-// type. This results in a reduction in binary size but will cause a decrease in
-// integer formatting performance.
+// integer formatter template instantiations to just one by only using the
+// largest integer type. This results in a reduction in binary size but will
+// cause a decrease in integer formatting performance.
 #if !defined(FMT_REDUCE_INT_INSTANTIATIONS)
 #  define FMT_REDUCE_INT_INSTANTIATIONS 0
 #endif
 
 // __builtin_clz is broken in clang with Microsoft CodeGen:
-// https://github.com/fmtlib/fmt/issues/519
-#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_clz)) && !FMT_MSC_VER
-#  define FMT_BUILTIN_CLZ(n) __builtin_clz(n)
+// https://github.com/fmtlib/fmt/issues/519.
+#if !FMT_MSC_VER
+#  if FMT_HAS_BUILTIN(__builtin_clz) || FMT_GCC_VERSION || FMT_ICC_VERSION
+#    define FMT_BUILTIN_CLZ(n) __builtin_clz(n)
+#  endif
+#  if FMT_HAS_BUILTIN(__builtin_clzll) || FMT_GCC_VERSION || FMT_ICC_VERSION
+#    define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n)
+#  endif
 #endif
-#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_clzll)) && !FMT_MSC_VER
-#  define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n)
-#endif
-#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_ctz))
-#  define FMT_BUILTIN_CTZ(n) __builtin_ctz(n)
-#endif
-#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_ctzll))
-#  define FMT_BUILTIN_CTZLL(n) __builtin_ctzll(n)
+
+// __builtin_ctz is broken in Intel Compiler Classic on Windows:
+// https://github.com/fmtlib/fmt/issues/2510.
+#ifndef __ICL
+#  if FMT_HAS_BUILTIN(__builtin_ctz) || FMT_GCC_VERSION || FMT_ICC_VERSION
+#    define FMT_BUILTIN_CTZ(n) __builtin_ctz(n)
+#  endif
+#  if FMT_HAS_BUILTIN(__builtin_ctzll) || FMT_GCC_VERSION || FMT_ICC_VERSION
+#    define FMT_BUILTIN_CTZLL(n) __builtin_ctzll(n)
+#  endif
 #endif
 
 #if FMT_MSC_VER
@@ -196,33 +173,32 @@ FMT_END_NAMESPACE
 // Some compilers masquerade as both MSVC and GCC-likes or otherwise support
 // __builtin_clz and __builtin_clzll, so only define FMT_BUILTIN_CLZ using the
 // MSVC intrinsics if the clz and clzll builtins are not available.
-#if FMT_MSC_VER && !defined(FMT_BUILTIN_CLZLL) && \
-    !defined(FMT_BUILTIN_CTZLL) && !defined(_MANAGED)
+#if FMT_MSC_VER && !defined(FMT_BUILTIN_CLZLL) && !defined(FMT_BUILTIN_CTZLL)
 FMT_BEGIN_NAMESPACE
 namespace detail {
 // Avoid Clang with Microsoft CodeGen's -Wunknown-pragmas warning.
-#  ifndef __clang__
+#  if !defined(__clang__)
 #    pragma intrinsic(_BitScanForward)
 #    pragma intrinsic(_BitScanReverse)
-#  endif
-#  if defined(_WIN64) && !defined(__clang__)
-#    pragma intrinsic(_BitScanForward64)
-#    pragma intrinsic(_BitScanReverse64)
+#    if defined(_WIN64)
+#      pragma intrinsic(_BitScanForward64)
+#      pragma intrinsic(_BitScanReverse64)
+#    endif
 #  endif
 
-inline int clz(uint32_t x) {
+inline auto clz(uint32_t x) -> int {
   unsigned long r = 0;
   _BitScanReverse(&r, x);
   FMT_ASSERT(x != 0, "");
   // Static analysis complains about using uninitialized data
   // "r", but the only way that can happen is if "x" is 0,
   // which the callers guarantee to not happen.
-  FMT_SUPPRESS_MSC_WARNING(6102)
+  FMT_MSC_WARNING(suppress : 6102)
   return 31 ^ static_cast<int>(r);
 }
 #  define FMT_BUILTIN_CLZ(n) detail::clz(n)
 
-inline int clzll(uint64_t x) {
+inline auto clzll(uint64_t x) -> int {
   unsigned long r = 0;
 #  ifdef _WIN64
   _BitScanReverse64(&r, x);
@@ -233,24 +209,24 @@ inline int clzll(uint64_t x) {
   _BitScanReverse(&r, static_cast<uint32_t>(x));
 #  endif
   FMT_ASSERT(x != 0, "");
-  FMT_SUPPRESS_MSC_WARNING(6102)  // Suppress a bogus static analysis warning.
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
   return 63 ^ static_cast<int>(r);
 }
 #  define FMT_BUILTIN_CLZLL(n) detail::clzll(n)
 
-inline int ctz(uint32_t x) {
+inline auto ctz(uint32_t x) -> int {
   unsigned long r = 0;
   _BitScanForward(&r, x);
   FMT_ASSERT(x != 0, "");
-  FMT_SUPPRESS_MSC_WARNING(6102)  // Suppress a bogus static analysis warning.
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
   return static_cast<int>(r);
 }
 #  define FMT_BUILTIN_CTZ(n) detail::ctz(n)
 
-inline int ctzll(uint64_t x) {
+inline auto ctzll(uint64_t x) -> int {
   unsigned long r = 0;
   FMT_ASSERT(x != 0, "");
-  FMT_SUPPRESS_MSC_WARNING(6102)  // Suppress a bogus static analysis warning.
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
 #  ifdef _WIN64
   _BitScanForward64(&r, x);
 #  else
@@ -267,31 +243,71 @@ inline int ctzll(uint64_t x) {
 FMT_END_NAMESPACE
 #endif
 
-// Enable the deprecated numeric alignment.
-#ifndef FMT_DEPRECATED_NUMERIC_ALIGN
-#  define FMT_DEPRECATED_NUMERIC_ALIGN 0
+#ifdef FMT_HEADER_ONLY
+#  define FMT_HEADER_ONLY_CONSTEXPR20 FMT_CONSTEXPR20
+#else
+#  define FMT_HEADER_ONLY_CONSTEXPR20
 #endif
 
 FMT_BEGIN_NAMESPACE
 namespace detail {
 
-// An equivalent of `*reinterpret_cast<Dest*>(&source)` that doesn't have
-// undefined behavior (e.g. due to type aliasing).
-// Example: uint64_t d = bit_cast<uint64_t>(2.718);
-template <typename Dest, typename Source>
-inline Dest bit_cast(const Source& source) {
-  static_assert(sizeof(Dest) == sizeof(Source), "size mismatch");
-  Dest dest;
-  std::memcpy(&dest, &source, sizeof(dest));
-  return dest;
+template <typename Streambuf> class formatbuf : public Streambuf {
+ private:
+  using char_type = typename Streambuf::char_type;
+  using streamsize = decltype(std::declval<Streambuf>().sputn(nullptr, 0));
+  using int_type = typename Streambuf::int_type;
+  using traits_type = typename Streambuf::traits_type;
+
+  buffer<char_type>& buffer_;
+
+ public:
+  explicit formatbuf(buffer<char_type>& buf) : buffer_(buf) {}
+
+ protected:
+  // The put area is always empty. This makes the implementation simpler and has
+  // the advantage that the streambuf and the buffer are always in sync and
+  // sputc never writes into uninitialized memory. A disadvantage is that each
+  // call to sputc always results in a (virtual) call to overflow. There is no
+  // disadvantage here for sputn since this always results in a call to xsputn.
+
+  auto overflow(int_type ch) -> int_type override {
+    if (!traits_type::eq_int_type(ch, traits_type::eof()))
+      buffer_.push_back(static_cast<char_type>(ch));
+    return ch;
+  }
+
+  auto xsputn(const char_type* s, streamsize count) -> streamsize override {
+    buffer_.append(s, s + count);
+    return count;
+  }
+};
+
+// Implementation of std::bit_cast for pre-C++20.
+template <typename To, typename From>
+FMT_CONSTEXPR20 auto bit_cast(const From& from) -> To {
+  static_assert(sizeof(To) == sizeof(From), "size mismatch");
+#ifdef __cpp_lib_bit_cast
+  if (is_constant_evaluated()) return std::bit_cast<To>(from);
+#endif
+  auto to = To();
+  std::memcpy(&to, &from, sizeof(to));
+  return to;
 }
 
-inline bool is_big_endian() {
-  const auto u = 1u;
+inline auto is_big_endian() -> bool {
+#ifdef _WIN32
+  return false;
+#elif defined(__BIG_ENDIAN__)
+  return true;
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
+  return __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__;
+#else
   struct bytes {
-    char data[sizeof(u)];
+    char data[sizeof(int)];
   };
-  return bit_cast<bytes>(u).data[0] == 0;
+  return bit_cast<bytes>(1).data[0] == 0;
+#endif
 }
 
 // A fallback implementation of uintptr_t for systems that lack it.
@@ -301,7 +317,7 @@ struct fallback_uintptr {
   fallback_uintptr() = default;
   explicit fallback_uintptr(const void* p) {
     *this = bit_cast<fallback_uintptr>(p);
-    if (is_big_endian()) {
+    if (const_check(is_big_endian())) {
       for (size_t i = 0, j = sizeof(void*) - 1; i < j; ++i, --j)
         std::swap(value[i], value[j]);
     }
@@ -309,26 +325,28 @@ struct fallback_uintptr {
 };
 #ifdef UINTPTR_MAX
 using uintptr_t = ::uintptr_t;
-inline uintptr_t to_uintptr(const void* p) { return bit_cast<uintptr_t>(p); }
+inline auto to_uintptr(const void* p) -> uintptr_t {
+  return bit_cast<uintptr_t>(p);
+}
 #else
 using uintptr_t = fallback_uintptr;
-inline fallback_uintptr to_uintptr(const void* p) {
+inline auto to_uintptr(const void* p) -> fallback_uintptr {
   return fallback_uintptr(p);
 }
 #endif
 
 // Returns the largest possible value for type T. Same as
 // std::numeric_limits<T>::max() but shorter and not affected by the max macro.
-template <typename T> constexpr T max_value() {
+template <typename T> constexpr auto max_value() -> T {
   return (std::numeric_limits<T>::max)();
 }
-template <typename T> constexpr int num_bits() {
+template <typename T> constexpr auto num_bits() -> int {
   return std::numeric_limits<T>::digits;
 }
 // std::numeric_limits<T>::digits may return 0 for 128-bit ints.
-template <> constexpr int num_bits<int128_t>() { return 128; }
-template <> constexpr int num_bits<uint128_t>() { return 128; }
-template <> constexpr int num_bits<fallback_uintptr>() {
+template <> constexpr auto num_bits<int128_t>() -> int { return 128; }
+template <> constexpr auto num_bits<uint128_t>() -> int { return 128; }
+template <> constexpr auto num_bits<fallback_uintptr>() -> int {
   return static_cast<int>(sizeof(void*) *
                           std::numeric_limits<unsigned char>::digits);
 }
@@ -346,31 +364,38 @@ using iterator_t = decltype(std::begin(std::declval<T&>()));
 template <typename T> using sentinel_t = decltype(std::end(std::declval<T&>()));
 
 // A workaround for std::string not having mutable data() until C++17.
-template <typename Char> inline Char* get_data(std::basic_string<Char>& s) {
+template <typename Char>
+inline auto get_data(std::basic_string<Char>& s) -> Char* {
   return &s[0];
 }
 template <typename Container>
-inline typename Container::value_type* get_data(Container& c) {
+inline auto get_data(Container& c) -> typename Container::value_type* {
   return c.data();
 }
 
 #if defined(_SECURE_SCL) && _SECURE_SCL
 // Make a checked iterator to avoid MSVC warnings.
 template <typename T> using checked_ptr = stdext::checked_array_iterator<T*>;
-template <typename T> checked_ptr<T> make_checked(T* p, size_t size) {
+template <typename T>
+constexpr auto make_checked(T* p, size_t size) -> checked_ptr<T> {
   return {p, size};
 }
 #else
 template <typename T> using checked_ptr = T*;
-template <typename T> inline T* make_checked(T* p, size_t) { return p; }
+template <typename T> constexpr auto make_checked(T* p, size_t) -> T* {
+  return p;
+}
 #endif
 
+// Attempts to reserve space for n extra characters in the output range.
+// Returns a pointer to the reserved range or a reference to it.
 template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
-#if FMT_CLANG_VERSION
+#if FMT_CLANG_VERSION >= 307 && !FMT_ICC_VERSION
 __attribute__((no_sanitize("undefined")))
 #endif
-inline checked_ptr<typename Container::value_type>
-reserve(std::back_insert_iterator<Container> it, size_t n) {
+inline auto
+reserve(std::back_insert_iterator<Container> it, size_t n)
+    -> checked_ptr<typename Container::value_type> {
   Container& c = get_container(it);
   size_t size = c.size();
   c.resize(size + n);
@@ -378,21 +403,26 @@ reserve(std::back_insert_iterator<Container> it, size_t n) {
 }
 
 template <typename T>
-inline buffer_appender<T> reserve(buffer_appender<T> it, size_t n) {
+inline auto reserve(buffer_appender<T> it, size_t n) -> buffer_appender<T> {
   buffer<T>& buf = get_container(it);
   buf.try_reserve(buf.size() + n);
   return it;
 }
 
-template <typename Iterator> inline Iterator& reserve(Iterator& it, size_t) {
+template <typename Iterator>
+constexpr auto reserve(Iterator& it, size_t) -> Iterator& {
   return it;
 }
 
+template <typename OutputIt>
+using reserve_iterator =
+    remove_reference_t<decltype(reserve(std::declval<OutputIt&>(), 0))>;
+
 template <typename T, typename OutputIt>
-constexpr T* to_pointer(OutputIt, size_t) {
+constexpr auto to_pointer(OutputIt, size_t) -> T* {
   return nullptr;
 }
-template <typename T> T* to_pointer(buffer_appender<T> it, size_t n) {
+template <typename T> auto to_pointer(buffer_appender<T> it, size_t n) -> T* {
   buffer<T>& buf = get_container(it);
   auto size = buf.size();
   if (buf.capacity() < size + n) return nullptr;
@@ -401,195 +431,195 @@ template <typename T> T* to_pointer(buffer_appender<T> it, size_t n) {
 }
 
 template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
-inline std::back_insert_iterator<Container> base_iterator(
-    std::back_insert_iterator<Container>& it,
-    checked_ptr<typename Container::value_type>) {
+inline auto base_iterator(std::back_insert_iterator<Container>& it,
+                          checked_ptr<typename Container::value_type>)
+    -> std::back_insert_iterator<Container> {
   return it;
 }
 
 template <typename Iterator>
-inline Iterator base_iterator(Iterator, Iterator it) {
+constexpr auto base_iterator(Iterator, Iterator it) -> Iterator {
   return it;
 }
 
-// An output iterator that counts the number of objects written to it and
-// discards them.
-class counting_iterator {
- private:
-  size_t count_;
+// <algorithm> is spectacularly slow to compile in C++20 so use a simple fill_n
+// instead (#1998).
+template <typename OutputIt, typename Size, typename T>
+FMT_CONSTEXPR auto fill_n(OutputIt out, Size count, const T& value)
+    -> OutputIt {
+  for (Size i = 0; i < count; ++i) *out++ = value;
+  return out;
+}
+template <typename T, typename Size>
+FMT_CONSTEXPR20 auto fill_n(T* out, Size count, char value) -> T* {
+  if (is_constant_evaluated()) {
+    return fill_n<T*, Size, T>(out, count, value);
+  }
+  std::memset(out, value, to_unsigned(count));
+  return out + count;
+}
 
- public:
-  using iterator_category = std::output_iterator_tag;
-  using difference_type = std::ptrdiff_t;
-  using pointer = void;
-  using reference = void;
-  using _Unchecked_type = counting_iterator;  // Mark iterator as checked.
+#ifdef __cpp_char8_t
+using char8_type = char8_t;
+#else
+enum char8_type : unsigned char {};
+#endif
 
-  struct value_type {
-    template <typename T> void operator=(const T&) {}
+template <typename OutChar, typename InputIt, typename OutputIt>
+FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
+                                                  OutputIt out) -> OutputIt {
+  return copy_str<OutChar>(begin, end, out);
+}
+
+// A public domain branchless UTF-8 decoder by Christopher Wellons:
+// https://github.com/skeeto/branchless-utf8
+/* Decode the next character, c, from s, reporting errors in e.
+ *
+ * Since this is a branchless decoder, four bytes will be read from the
+ * buffer regardless of the actual length of the next character. This
+ * means the buffer _must_ have at least three bytes of zero padding
+ * following the end of the data stream.
+ *
+ * Errors are reported in e, which will be non-zero if the parsed
+ * character was somehow invalid: invalid byte sequence, non-canonical
+ * encoding, or a surrogate half.
+ *
+ * The function returns a pointer to the next character. When an error
+ * occurs, this pointer will be a guess that depends on the particular
+ * error, but it will always advance at least one byte.
+ */
+FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
+    -> const char* {
+  constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
+  constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
+  constexpr const int shiftc[] = {0, 18, 12, 6, 0};
+  constexpr const int shifte[] = {0, 6, 4, 2, 0};
+
+  int len = code_point_length(s);
+  const char* next = s + len;
+
+  // Assume a four-byte character and load four bytes. Unused bits are
+  // shifted out.
+  *c = uint32_t(s[0] & masks[len]) << 18;
+  *c |= uint32_t(s[1] & 0x3f) << 12;
+  *c |= uint32_t(s[2] & 0x3f) << 6;
+  *c |= uint32_t(s[3] & 0x3f) << 0;
+  *c >>= shiftc[len];
+
+  // Accumulate the various error conditions.
+  using uchar = unsigned char;
+  *e = (*c < mins[len]) << 6;       // non-canonical encoding
+  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
+  *e |= (*c > 0x10FFFF) << 8;       // out of range?
+  *e |= (uchar(s[1]) & 0xc0) >> 2;
+  *e |= (uchar(s[2]) & 0xc0) >> 4;
+  *e |= uchar(s[3]) >> 6;
+  *e ^= 0x2a;  // top two bits of each tail byte correct?
+  *e >>= shifte[len];
+
+  return next;
+}
+
+constexpr uint32_t invalid_code_point = ~uint32_t();
+
+// Invokes f(cp, sv) for every code point cp in s with sv being the string view
+// corresponding to the code point. cp is invalid_code_point on error.
+template <typename F>
+FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
+  auto decode = [f](const char* buf_ptr, const char* ptr) {
+    auto cp = uint32_t();
+    auto error = 0;
+    auto end = utf8_decode(buf_ptr, &cp, &error);
+    bool result = f(error ? invalid_code_point : cp,
+                    string_view(ptr, to_unsigned(end - buf_ptr)));
+    return result ? end : nullptr;
   };
-
-  counting_iterator() : count_(0) {}
-
-  size_t count() const { return count_; }
-
-  counting_iterator& operator++() {
-    ++count_;
-    return *this;
+  auto p = s.data();
+  const size_t block_size = 4;  // utf8_decode always reads blocks of 4 chars.
+  if (s.size() >= block_size) {
+    for (auto end = p + s.size() - block_size + 1; p < end;) {
+      p = decode(p, p);
+      if (!p) return;
+    }
   }
-  counting_iterator operator++(int) {
-    auto it = *this;
-    ++*this;
-    return it;
+  if (auto num_chars_left = s.data() + s.size() - p) {
+    char buf[2 * block_size - 1] = {};
+    copy_str<char>(p, p + num_chars_left, buf);
+    const char* buf_ptr = buf;
+    do {
+      auto end = decode(buf_ptr, p);
+      if (!end) return;
+      p += end - buf_ptr;
+      buf_ptr = end;
+    } while (buf_ptr - buf < num_chars_left);
   }
-
-  friend counting_iterator operator+(counting_iterator it, difference_type n) {
-    it.count_ += static_cast<size_t>(n);
-    return it;
-  }
-
-  value_type operator*() const { return {}; }
-};
-
-template <typename OutputIt> class truncating_iterator_base {
- protected:
-  OutputIt out_;
-  size_t limit_;
-  size_t count_;
-
-  truncating_iterator_base(OutputIt out, size_t limit)
-      : out_(out), limit_(limit), count_(0) {}
-
- public:
-  using iterator_category = std::output_iterator_tag;
-  using value_type = typename std::iterator_traits<OutputIt>::value_type;
-  using difference_type = void;
-  using pointer = void;
-  using reference = void;
-  using _Unchecked_type =
-      truncating_iterator_base;  // Mark iterator as checked.
-
-  OutputIt base() const { return out_; }
-  size_t count() const { return count_; }
-};
-
-// An output iterator that truncates the output and counts the number of objects
-// written to it.
-template <typename OutputIt,
-          typename Enable = typename std::is_void<
-              typename std::iterator_traits<OutputIt>::value_type>::type>
-class truncating_iterator;
-
-template <typename OutputIt>
-class truncating_iterator<OutputIt, std::false_type>
-    : public truncating_iterator_base<OutputIt> {
-  mutable typename truncating_iterator_base<OutputIt>::value_type blackhole_;
-
- public:
-  using value_type = typename truncating_iterator_base<OutputIt>::value_type;
-
-  truncating_iterator(OutputIt out, size_t limit)
-      : truncating_iterator_base<OutputIt>(out, limit) {}
-
-  truncating_iterator& operator++() {
-    if (this->count_++ < this->limit_) ++this->out_;
-    return *this;
-  }
-
-  truncating_iterator operator++(int) {
-    auto it = *this;
-    ++*this;
-    return it;
-  }
-
-  value_type& operator*() const {
-    return this->count_ < this->limit_ ? *this->out_ : blackhole_;
-  }
-};
-
-template <typename OutputIt>
-class truncating_iterator<OutputIt, std::true_type>
-    : public truncating_iterator_base<OutputIt> {
- public:
-  truncating_iterator(OutputIt out, size_t limit)
-      : truncating_iterator_base<OutputIt>(out, limit) {}
-
-  template <typename T> truncating_iterator& operator=(T val) {
-    if (this->count_++ < this->limit_) *this->out_++ = val;
-    return *this;
-  }
-
-  truncating_iterator& operator++() { return *this; }
-  truncating_iterator& operator++(int) { return *this; }
-  truncating_iterator& operator*() { return *this; }
-};
+}
 
 template <typename Char>
-inline size_t count_code_points(basic_string_view<Char> s) {
+inline auto compute_width(basic_string_view<Char> s) -> size_t {
   return s.size();
 }
 
-// Counts the number of code points in a UTF-8 string.
-inline size_t count_code_points(basic_string_view<char> s) {
-  const char* data = s.data();
+// Computes approximate display width of a UTF-8 string.
+FMT_CONSTEXPR inline size_t compute_width(string_view s) {
   size_t num_code_points = 0;
-  for (size_t i = 0, size = s.size(); i != size; ++i) {
-    if ((data[i] & 0xc0) != 0x80) ++num_code_points;
-  }
+  // It is not a lambda for compatibility with C++14.
+  struct count_code_points {
+    size_t* count;
+    FMT_CONSTEXPR auto operator()(uint32_t cp, string_view) const -> bool {
+      *count += detail::to_unsigned(
+          1 +
+          (cp >= 0x1100 &&
+           (cp <= 0x115f ||  // Hangul Jamo init. consonants
+            cp == 0x2329 ||  // LEFT-POINTING ANGLE BRACKET
+            cp == 0x232a ||  // RIGHT-POINTING ANGLE BRACKET
+            // CJK ... Yi except IDEOGRAPHIC HALF FILL SPACE:
+            (cp >= 0x2e80 && cp <= 0xa4cf && cp != 0x303f) ||
+            (cp >= 0xac00 && cp <= 0xd7a3) ||    // Hangul Syllables
+            (cp >= 0xf900 && cp <= 0xfaff) ||    // CJK Compatibility Ideographs
+            (cp >= 0xfe10 && cp <= 0xfe19) ||    // Vertical Forms
+            (cp >= 0xfe30 && cp <= 0xfe6f) ||    // CJK Compatibility Forms
+            (cp >= 0xff00 && cp <= 0xff60) ||    // Fullwidth Forms
+            (cp >= 0xffe0 && cp <= 0xffe6) ||    // Fullwidth Forms
+            (cp >= 0x20000 && cp <= 0x2fffd) ||  // CJK
+            (cp >= 0x30000 && cp <= 0x3fffd) ||
+            // Miscellaneous Symbols and Pictographs + Emoticons:
+            (cp >= 0x1f300 && cp <= 0x1f64f) ||
+            // Supplemental Symbols and Pictographs:
+            (cp >= 0x1f900 && cp <= 0x1f9ff))));
+      return true;
+    }
+  };
+  for_each_codepoint(s, count_code_points{&num_code_points});
   return num_code_points;
 }
 
-inline size_t count_code_points(basic_string_view<char8_type> s) {
-  return count_code_points(basic_string_view<char>(
+inline auto compute_width(basic_string_view<char8_type> s) -> size_t {
+  return compute_width(basic_string_view<char>(
       reinterpret_cast<const char*>(s.data()), s.size()));
 }
 
 template <typename Char>
-inline size_t code_point_index(basic_string_view<Char> s, size_t n) {
+inline auto code_point_index(basic_string_view<Char> s, size_t n) -> size_t {
   size_t size = s.size();
   return n < size ? n : size;
 }
 
 // Calculates the index of the nth code point in a UTF-8 string.
-inline size_t code_point_index(basic_string_view<char8_type> s, size_t n) {
+inline auto code_point_index(basic_string_view<char8_type> s, size_t n)
+    -> size_t {
   const char8_type* data = s.data();
   size_t num_code_points = 0;
   for (size_t i = 0, size = s.size(); i != size; ++i) {
-    if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) {
-      return i;
-    }
+    if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i;
   }
   return s.size();
 }
 
-template <typename InputIt, typename OutChar>
-using needs_conversion = bool_constant<
-    std::is_same<typename std::iterator_traits<InputIt>::value_type,
-                 char>::value &&
-    std::is_same<OutChar, char8_type>::value>;
-
-template <typename OutChar, typename InputIt, typename OutputIt,
-          FMT_ENABLE_IF(!needs_conversion<InputIt, OutChar>::value)>
-OutputIt copy_str(InputIt begin, InputIt end, OutputIt it) {
-  return std::copy(begin, end, it);
-}
-
-template <typename OutChar, typename InputIt, typename OutputIt,
-          FMT_ENABLE_IF(needs_conversion<InputIt, OutChar>::value)>
-OutputIt copy_str(InputIt begin, InputIt end, OutputIt it) {
-  return std::transform(begin, end, it,
-                        [](char c) { return static_cast<char8_type>(c); });
-}
-
-template <typename Char, typename InputIt>
-inline counting_iterator copy_str(InputIt begin, InputIt end,
-                                  counting_iterator it) {
-  return it + (end - begin);
-}
-
-template <typename T>
-using is_fast_float = bool_constant<std::numeric_limits<T>::is_iec559 &&
-                                    sizeof(T) <= sizeof(double)>;
+template <typename T, bool = std::is_floating_point<T>::value>
+struct is_fast_float : bool_constant<std::numeric_limits<T>::is_iec559 &&
+                                     sizeof(T) <= sizeof(double)> {};
+template <typename T> struct is_fast_float<T, false> : std::false_type {};
 
 #ifndef FMT_USE_FULL_CACHE_DRAGONBOX
 #  define FMT_USE_FULL_CACHE_DRAGONBOX 0
@@ -598,7 +628,7 @@ using is_fast_float = bool_constant<std::numeric_limits<T>::is_iec559 &&
 template <typename T>
 template <typename U>
 void buffer<T>::append(const U* begin, const U* end) {
-  do {
+  while (begin != end) {
     auto count = to_unsigned(end - begin);
     try_reserve(size_ + count);
     auto free_cap = capacity_ - size_;
@@ -606,16 +636,17 @@ void buffer<T>::append(const U* begin, const U* end) {
     std::uninitialized_copy_n(begin, count, make_checked(ptr_ + size_, count));
     size_ += count;
     begin += count;
-  } while (begin != end);
+  }
 }
 
-template <typename OutputIt, typename T, typename Traits>
-void iterator_buffer<OutputIt, T, Traits>::flush() {
-  out_ = std::copy_n(data_, this->limit(this->size()), out_);
-  this->clear();
-}
+template <typename T, typename Enable = void>
+struct is_locale : std::false_type {};
+template <typename T>
+struct is_locale<T, void_t<decltype(T::classic())>> : std::true_type {};
 }  // namespace detail
 
+FMT_MODULE_EXPORT_BEGIN
+
 // The number of characters to store in the basic_memory_buffer object itself
 // to avoid dynamic memory allocation.
 enum { inline_buffer_size = 500 };
@@ -625,20 +656,12 @@ enum { inline_buffer_size = 500 };
   A dynamically growing memory buffer for trivially copyable/constructible types
   with the first ``SIZE`` elements stored in the object itself.
 
-  You can use one of the following type aliases for common character types:
-
-  +----------------+------------------------------+
-  | Type           | Definition                   |
-  +================+==============================+
-  | memory_buffer  | basic_memory_buffer<char>    |
-  +----------------+------------------------------+
-  | wmemory_buffer | basic_memory_buffer<wchar_t> |
-  +----------------+------------------------------+
+  You can use the ``memory_buffer`` type alias for ``char`` instead.
 
   **Example**::
 
-     fmt::memory_buffer out;
-     format_to(out, "The answer is {}.", 42);
+     auto out = fmt::memory_buffer();
+     format_to(std::back_inserter(out), "The answer is {}.", 42);
 
   This will append the following output to the ``out`` object:
 
@@ -659,34 +682,43 @@ class basic_memory_buffer final : public detail::buffer<T> {
   Allocator alloc_;
 
   // Deallocate memory allocated by the buffer.
-  void deallocate() {
+  FMT_CONSTEXPR20 void deallocate() {
     T* data = this->data();
     if (data != store_) alloc_.deallocate(data, this->capacity());
   }
 
  protected:
-  void grow(size_t size) final FMT_OVERRIDE;
+  FMT_CONSTEXPR20 void grow(size_t size) override;
 
  public:
   using value_type = T;
   using const_reference = const T&;
 
-  explicit basic_memory_buffer(const Allocator& alloc = Allocator())
+  FMT_CONSTEXPR20 explicit basic_memory_buffer(
+      const Allocator& alloc = Allocator())
       : alloc_(alloc) {
     this->set(store_, SIZE);
+    if (detail::is_constant_evaluated()) {
+      detail::fill_n(store_, SIZE, T{});
+    }
   }
-  ~basic_memory_buffer() { deallocate(); }
+  FMT_CONSTEXPR20 ~basic_memory_buffer() { deallocate(); }
 
  private:
   // Move data from other to this buffer.
-  void move(basic_memory_buffer& other) {
+  FMT_CONSTEXPR20 void move(basic_memory_buffer& other) {
     alloc_ = std::move(other.alloc_);
     T* data = other.data();
     size_t size = other.size(), capacity = other.capacity();
     if (data == other.store_) {
       this->set(store_, capacity);
-      std::uninitialized_copy(other.store_, other.store_ + size,
-                              detail::make_checked(store_, capacity));
+      if (detail::is_constant_evaluated()) {
+        detail::copy_str<T>(other.store_, other.store_ + size,
+                            detail::make_checked(store_, capacity));
+      } else {
+        std::uninitialized_copy(other.store_, other.store_ + size,
+                                detail::make_checked(store_, capacity));
+      }
     } else {
       this->set(data, capacity);
       // Set pointer to the inline array so that delete is not called
@@ -703,14 +735,18 @@ class basic_memory_buffer final : public detail::buffer<T> {
     of the other object to it.
     \endrst
    */
-  basic_memory_buffer(basic_memory_buffer&& other) FMT_NOEXCEPT { move(other); }
+  FMT_CONSTEXPR20 basic_memory_buffer(basic_memory_buffer&& other)
+      FMT_NOEXCEPT {
+    move(other);
+  }
 
   /**
     \rst
     Moves the content of the other ``basic_memory_buffer`` object to this one.
     \endrst
    */
-  basic_memory_buffer& operator=(basic_memory_buffer&& other) FMT_NOEXCEPT {
+  auto operator=(basic_memory_buffer&& other) FMT_NOEXCEPT
+      -> basic_memory_buffer& {
     FMT_ASSERT(this != &other, "");
     deallocate();
     move(other);
@@ -718,13 +754,13 @@ class basic_memory_buffer final : public detail::buffer<T> {
   }
 
   // Returns a copy of the allocator associated with this buffer.
-  Allocator get_allocator() const { return alloc_; }
+  auto get_allocator() const -> Allocator { return alloc_; }
 
   /**
     Resizes the buffer to contain *count* elements. If T is a POD type new
     elements may not be initialized.
    */
-  void resize(size_t count) { this->try_resize(count); }
+  FMT_CONSTEXPR20 void resize(size_t count) { this->try_resize(count); }
 
   /** Increases the buffer capacity to *new_capacity*. */
   void reserve(size_t new_capacity) { this->try_reserve(new_capacity); }
@@ -738,13 +774,18 @@ class basic_memory_buffer final : public detail::buffer<T> {
 };
 
 template <typename T, size_t SIZE, typename Allocator>
-void basic_memory_buffer<T, SIZE, Allocator>::grow(size_t size) {
+FMT_CONSTEXPR20 void basic_memory_buffer<T, SIZE, Allocator>::grow(
+    size_t size) {
 #ifdef FMT_FUZZ
   if (size > 5000) throw std::runtime_error("fuzz mode - won't grow that much");
 #endif
+  const size_t max_size = std::allocator_traits<Allocator>::max_size(alloc_);
   size_t old_capacity = this->capacity();
   size_t new_capacity = old_capacity + old_capacity / 2;
-  if (size > new_capacity) new_capacity = size;
+  if (size > new_capacity)
+    new_capacity = size;
+  else if (new_capacity > max_size)
+    new_capacity = size > max_size ? size : max_size;
   T* old_data = this->data();
   T* new_data =
       std::allocator_traits<Allocator>::allocate(alloc_, new_capacity);
@@ -759,12 +800,15 @@ void basic_memory_buffer<T, SIZE, Allocator>::grow(size_t size) {
 }
 
 using memory_buffer = basic_memory_buffer<char>;
-using wmemory_buffer = basic_memory_buffer<wchar_t>;
 
 template <typename T, size_t SIZE, typename Allocator>
 struct is_contiguous<basic_memory_buffer<T, SIZE, Allocator>> : std::true_type {
 };
 
+namespace detail {
+FMT_API void print(std::FILE*, string_view);
+}
+
 /** A formatting error such as invalid format string. */
 FMT_CLASS_API
 class FMT_API format_error : public std::runtime_error {
@@ -776,10 +820,62 @@ class FMT_API format_error : public std::runtime_error {
   format_error& operator=(const format_error&) = default;
   format_error(format_error&&) = default;
   format_error& operator=(format_error&&) = default;
-  ~format_error() FMT_NOEXCEPT FMT_OVERRIDE;
+  ~format_error() FMT_NOEXCEPT override FMT_MSC_DEFAULT;
 };
 
-namespace detail {
+/**
+  \rst
+  Constructs a `~fmt::format_arg_store` object that contains references
+  to arguments and can be implicitly converted to `~fmt::format_args`.
+  If ``fmt`` is a compile-time string then `make_args_checked` checks
+  its validity at compile time.
+  \endrst
+ */
+template <typename... Args, typename S, typename Char = char_t<S>>
+FMT_INLINE auto make_args_checked(const S& fmt,
+                                  const remove_reference_t<Args>&... args)
+    -> format_arg_store<buffer_context<Char>, remove_reference_t<Args>...> {
+  static_assert(
+      detail::count<(
+              std::is_base_of<detail::view, remove_reference_t<Args>>::value &&
+              std::is_reference<Args>::value)...>() == 0,
+      "passing views as lvalues is disallowed");
+  detail::check_format_string<Args...>(fmt);
+  return {args...};
+}
+
+// compile-time support
+namespace detail_exported {
+#if FMT_USE_NONTYPE_TEMPLATE_PARAMETERS
+template <typename Char, size_t N> struct fixed_string {
+  constexpr fixed_string(const Char (&str)[N]) {
+    detail::copy_str<Char, const Char*, Char*>(static_cast<const Char*>(str),
+                                               str + N, data);
+  }
+  Char data[N]{};
+};
+#endif
+
+// Converts a compile-time string to basic_string_view.
+template <typename Char, size_t N>
+constexpr auto compile_string_to_view(const Char (&s)[N])
+    -> basic_string_view<Char> {
+  // Remove trailing NUL character if needed. Won't be present if this is used
+  // with a raw character array (i.e. not defined as a string).
+  return {s, N - (std::char_traits<Char>::to_int_type(s[N - 1]) == 0 ? 1 : 0)};
+}
+template <typename Char>
+constexpr auto compile_string_to_view(detail::std_string_view<Char> s)
+    -> basic_string_view<Char> {
+  return {s.data(), s.size()};
+}
+}  // namespace detail_exported
+
+FMT_BEGIN_DETAIL_NAMESPACE
+
+template <typename T> struct is_integral : std::is_integral<T> {};
+template <> struct is_integral<int128_t> : std::true_type {};
+template <> struct is_integral<uint128_t> : std::true_type {};
 
 template <typename T>
 using is_signed =
@@ -789,16 +885,16 @@ using is_signed =
 // Returns true if value is negative, false otherwise.
 // Same as `value < 0` but doesn't produce warnings if T is an unsigned type.
 template <typename T, FMT_ENABLE_IF(is_signed<T>::value)>
-FMT_CONSTEXPR bool is_negative(T value) {
+FMT_CONSTEXPR auto is_negative(T value) -> bool {
   return value < 0;
 }
 template <typename T, FMT_ENABLE_IF(!is_signed<T>::value)>
-FMT_CONSTEXPR bool is_negative(T) {
+FMT_CONSTEXPR auto is_negative(T) -> bool {
   return false;
 }
 
 template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
-FMT_CONSTEXPR bool is_supported_floating_point(T) {
+FMT_CONSTEXPR auto is_supported_floating_point(T) -> uint16_t {
   return (std::is_same<T, float>::value && FMT_USE_FLOAT) ||
          (std::is_same<T, double>::value && FMT_USE_DOUBLE) ||
          (std::is_same<T, long double>::value && FMT_USE_LONG_DOUBLE);
@@ -811,121 +907,33 @@ using uint32_or_64_or_128_t =
     conditional_t<num_bits<T>() <= 32 && !FMT_REDUCE_INT_INSTANTIATIONS,
                   uint32_t,
                   conditional_t<num_bits<T>() <= 64, uint64_t, uint128_t>>;
+template <typename T>
+using uint64_or_128_t = conditional_t<num_bits<T>() <= 64, uint64_t, uint128_t>;
 
-// 128-bit integer type used internally
-struct FMT_EXTERN_TEMPLATE_API uint128_wrapper {
-  uint128_wrapper() = default;
+#define FMT_POWERS_OF_10(factor)                                             \
+  factor * 10, (factor)*100, (factor)*1000, (factor)*10000, (factor)*100000, \
+      (factor)*1000000, (factor)*10000000, (factor)*100000000,               \
+      (factor)*1000000000
 
-#if FMT_USE_INT128
-  uint128_t internal_;
-
-  uint128_wrapper(uint64_t high, uint64_t low) FMT_NOEXCEPT
-      : internal_{static_cast<uint128_t>(low) |
-                  (static_cast<uint128_t>(high) << 64)} {}
-
-  uint128_wrapper(uint128_t u) : internal_{u} {}
-
-  uint64_t high() const FMT_NOEXCEPT { return uint64_t(internal_ >> 64); }
-  uint64_t low() const FMT_NOEXCEPT { return uint64_t(internal_); }
-
-  uint128_wrapper& operator+=(uint64_t n) FMT_NOEXCEPT {
-    internal_ += n;
-    return *this;
-  }
-#else
-  uint64_t high_;
-  uint64_t low_;
-
-  uint128_wrapper(uint64_t high, uint64_t low) FMT_NOEXCEPT : high_{high},
-                                                              low_{low} {}
-
-  uint64_t high() const FMT_NOEXCEPT { return high_; }
-  uint64_t low() const FMT_NOEXCEPT { return low_; }
-
-  uint128_wrapper& operator+=(uint64_t n) FMT_NOEXCEPT {
-#  if defined(_MSC_VER) && defined(_M_X64)
-    unsigned char carry = _addcarry_u64(0, low_, n, &low_);
-    _addcarry_u64(carry, high_, 0, &high_);
-    return *this;
-#  else
-    uint64_t sum = low_ + n;
-    high_ += (sum < low_ ? 1 : 0);
-    low_ = sum;
-    return *this;
-#  endif
-  }
-#endif
-};
-
-// Table entry type for divisibility test used internally
-template <typename T> struct FMT_EXTERN_TEMPLATE_API divtest_table_entry {
-  T mod_inv;
-  T max_quotient;
-};
-
-// Static data is placed in this class template for the header-only config.
-template <typename T = void> struct FMT_EXTERN_TEMPLATE_API basic_data {
-  static const uint64_t powers_of_10_64[];
-  static const uint32_t zero_or_powers_of_10_32_new[];
-  static const uint64_t zero_or_powers_of_10_64_new[];
-  static const uint64_t grisu_pow10_significands[];
-  static const int16_t grisu_pow10_exponents[];
-  static const divtest_table_entry<uint32_t> divtest_table_for_pow5_32[];
-  static const divtest_table_entry<uint64_t> divtest_table_for_pow5_64[];
-  static const uint64_t dragonbox_pow10_significands_64[];
-  static const uint128_wrapper dragonbox_pow10_significands_128[];
-  // log10(2) = 0x0.4d104d427de7fbcc...
-  static const uint64_t log10_2_significand = 0x4d104d427de7fbcc;
-#if !FMT_USE_FULL_CACHE_DRAGONBOX
-  static const uint64_t powers_of_5_64[];
-  static const uint32_t dragonbox_pow10_recovery_errors[];
-#endif
-  // GCC generates slightly better code for pairs than chars.
-  using digit_pair = char[2];
-  static const digit_pair digits[];
-  static const char hex_digits[];
-  static const char foreground_color[];
-  static const char background_color[];
-  static const char reset_color[5];
-  static const wchar_t wreset_color[5];
-  static const char signs[];
-  static const char left_padding_shifts[5];
-  static const char right_padding_shifts[5];
-
-  // DEPRECATED! These are for ABI compatibility.
-  static const uint32_t zero_or_powers_of_10_32[];
-  static const uint64_t zero_or_powers_of_10_64[];
-};
-
-// Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)).
-// This is a function instead of an array to workaround a bug in GCC10 (#1810).
-FMT_INLINE uint16_t bsr2log10(int bsr) {
-  static constexpr uint16_t data[] = {
-      1,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  4,  4,  5,  5,  5,
-      6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  9,  9,  9,  10, 10, 10,
-      10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15,
-      15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20};
-  return data[bsr];
+// Converts value in the range [0, 100) to a string.
+constexpr const char* digits2(size_t value) {
+  // GCC generates slightly better code when value is pointer-size.
+  return &"0001020304050607080910111213141516171819"
+         "2021222324252627282930313233343536373839"
+         "4041424344454647484950515253545556575859"
+         "6061626364656667686970717273747576777879"
+         "8081828384858687888990919293949596979899"[value * 2];
 }
 
-#ifndef FMT_EXPORTED
-FMT_EXTERN template struct basic_data<void>;
+// Sign is a template parameter to workaround a bug in gcc 4.8.
+template <typename Char, typename Sign> constexpr Char sign(Sign s) {
+#if !FMT_GCC_VERSION || FMT_GCC_VERSION >= 604
+  static_assert(std::is_same<Sign, sign_t>::value, "");
 #endif
-
-// This is a struct rather than an alias to avoid shadowing warnings in gcc.
-struct data : basic_data<> {};
-
-#ifdef FMT_BUILTIN_CLZLL
-// Returns the number of decimal digits in n. Leading zeros are not counted
-// except for n == 0 in which case count_digits returns 1.
-inline int count_digits(uint64_t n) {
-  // https://github.com/fmtlib/format-benchmark/blob/master/digits10
-  auto t = bsr2log10(FMT_BUILTIN_CLZLL(n | 1) ^ 63);
-  return t - (n < data::zero_or_powers_of_10_64_new[t]);
+  return static_cast<Char>("\0-+ "[s]);
 }
-#else
-// Fallback version of count_digits used when __builtin_clz is not available.
-inline int count_digits(uint64_t n) {
+
+template <typename T> FMT_CONSTEXPR auto count_digits_fallback(T n) -> int {
   int count = 1;
   for (;;) {
     // Integer division is slow so do it for a group of four digits instead
@@ -939,103 +947,152 @@ inline int count_digits(uint64_t n) {
     count += 4;
   }
 }
-#endif
-
 #if FMT_USE_INT128
-inline int count_digits(uint128_t n) {
-  int count = 1;
-  for (;;) {
-    // Integer division is slow so do it for a group of four digits instead
-    // of for every digit. The idea comes from the talk by Alexandrescu
-    // "Three Optimization Tips for C++". See speed-test for a comparison.
-    if (n < 10) return count;
-    if (n < 100) return count + 1;
-    if (n < 1000) return count + 2;
-    if (n < 10000) return count + 3;
-    n /= 10000U;
-    count += 4;
-  }
+FMT_CONSTEXPR inline auto count_digits(uint128_t n) -> int {
+  return count_digits_fallback(n);
 }
 #endif
 
+#ifdef FMT_BUILTIN_CLZLL
+// It is a separate function rather than a part of count_digits to workaround
+// the lack of static constexpr in constexpr functions.
+inline auto do_count_digits(uint64_t n) -> int {
+  // This has comparable performance to the version by Kendall Willets
+  // (https://github.com/fmtlib/format-benchmark/blob/master/digits10)
+  // but uses smaller tables.
+  // Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)).
+  static constexpr uint8_t bsr2log10[] = {
+      1,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  4,  4,  5,  5,  5,
+      6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  9,  9,  9,  10, 10, 10,
+      10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15,
+      15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20};
+  auto t = bsr2log10[FMT_BUILTIN_CLZLL(n | 1) ^ 63];
+  static constexpr const uint64_t zero_or_powers_of_10[] = {
+      0, 0, FMT_POWERS_OF_10(1U), FMT_POWERS_OF_10(1000000000ULL),
+      10000000000000000000ULL};
+  return t - (n < zero_or_powers_of_10[t]);
+}
+#endif
+
+// Returns the number of decimal digits in n. Leading zeros are not counted
+// except for n == 0 in which case count_digits returns 1.
+FMT_CONSTEXPR20 inline auto count_digits(uint64_t n) -> int {
+#ifdef FMT_BUILTIN_CLZLL
+  if (!is_constant_evaluated()) {
+    return do_count_digits(n);
+  }
+#endif
+  return count_digits_fallback(n);
+}
+
 // Counts the number of digits in n. BITS = log2(radix).
-template <unsigned BITS, typename UInt> inline int count_digits(UInt n) {
-  int num_digits = 0;
-  do {
-    ++num_digits;
-  } while ((n >>= BITS) != 0);
-  return num_digits;
+template <int BITS, typename UInt>
+FMT_CONSTEXPR auto count_digits(UInt n) -> int {
+#ifdef FMT_BUILTIN_CLZ
+  if (num_bits<UInt>() == 32)
+    return (FMT_BUILTIN_CLZ(static_cast<uint32_t>(n) | 1) ^ 31) / BITS + 1;
+#endif
+  // Lambda avoids unreachable code warnings from NVHPC.
+  return [](UInt m) {
+    int num_digits = 0;
+    do {
+      ++num_digits;
+    } while ((m >>= BITS) != 0);
+    return num_digits;
+  }(n);
 }
 
-template <> int count_digits<4>(detail::fallback_uintptr n);
-
-#if FMT_GCC_VERSION || FMT_CLANG_VERSION
-#  define FMT_ALWAYS_INLINE inline __attribute__((always_inline))
-#elif FMT_MSC_VER
-#  define FMT_ALWAYS_INLINE __forceinline
-#else
-#  define FMT_ALWAYS_INLINE inline
-#endif
-
-// To suppress unnecessary security cookie checks
-#if FMT_MSC_VER && !FMT_CLANG_VERSION
-#  define FMT_SAFEBUFFERS __declspec(safebuffers)
-#else
-#  define FMT_SAFEBUFFERS
-#endif
+template <> auto count_digits<4>(detail::fallback_uintptr n) -> int;
 
 #ifdef FMT_BUILTIN_CLZ
-// Optional version of count_digits for better performance on 32-bit platforms.
-inline int count_digits(uint32_t n) {
-  auto t = bsr2log10(FMT_BUILTIN_CLZ(n | 1) ^ 31);
-  return t - (n < data::zero_or_powers_of_10_32_new[t]);
+// It is a separate function rather than a part of count_digits to workaround
+// the lack of static constexpr in constexpr functions.
+FMT_INLINE auto do_count_digits(uint32_t n) -> int {
+// An optimization by Kendall Willets from https://bit.ly/3uOIQrB.
+// This increments the upper 32 bits (log10(T) - 1) when >= T is added.
+#  define FMT_INC(T) (((sizeof(#  T) - 1ull) << 32) - T)
+  static constexpr uint64_t table[] = {
+      FMT_INC(0),          FMT_INC(0),          FMT_INC(0),           // 8
+      FMT_INC(10),         FMT_INC(10),         FMT_INC(10),          // 64
+      FMT_INC(100),        FMT_INC(100),        FMT_INC(100),         // 512
+      FMT_INC(1000),       FMT_INC(1000),       FMT_INC(1000),        // 4096
+      FMT_INC(10000),      FMT_INC(10000),      FMT_INC(10000),       // 32k
+      FMT_INC(100000),     FMT_INC(100000),     FMT_INC(100000),      // 256k
+      FMT_INC(1000000),    FMT_INC(1000000),    FMT_INC(1000000),     // 2048k
+      FMT_INC(10000000),   FMT_INC(10000000),   FMT_INC(10000000),    // 16M
+      FMT_INC(100000000),  FMT_INC(100000000),  FMT_INC(100000000),   // 128M
+      FMT_INC(1000000000), FMT_INC(1000000000), FMT_INC(1000000000),  // 1024M
+      FMT_INC(1000000000), FMT_INC(1000000000)                        // 4B
+  };
+  auto inc = table[FMT_BUILTIN_CLZ(n | 1) ^ 31];
+  return static_cast<int>((n + inc) >> 32);
 }
 #endif
 
-template <typename Int> constexpr int digits10() FMT_NOEXCEPT {
+// Optional version of count_digits for better performance on 32-bit platforms.
+FMT_CONSTEXPR20 inline auto count_digits(uint32_t n) -> int {
+#ifdef FMT_BUILTIN_CLZ
+  if (!is_constant_evaluated()) {
+    return do_count_digits(n);
+  }
+#endif
+  return count_digits_fallback(n);
+}
+
+template <typename Int> constexpr auto digits10() FMT_NOEXCEPT -> int {
   return std::numeric_limits<Int>::digits10;
 }
-template <> constexpr int digits10<int128_t>() FMT_NOEXCEPT { return 38; }
-template <> constexpr int digits10<uint128_t>() FMT_NOEXCEPT { return 38; }
-
-template <typename Char> FMT_API std::string grouping_impl(locale_ref loc);
-template <typename Char> inline std::string grouping(locale_ref loc) {
-  return grouping_impl<char>(loc);
+template <> constexpr auto digits10<int128_t>() FMT_NOEXCEPT -> int {
+  return 38;
 }
-template <> inline std::string grouping<wchar_t>(locale_ref loc) {
-  return grouping_impl<wchar_t>(loc);
+template <> constexpr auto digits10<uint128_t>() FMT_NOEXCEPT -> int {
+  return 38;
 }
 
-template <typename Char> FMT_API Char thousands_sep_impl(locale_ref loc);
-template <typename Char> inline Char thousands_sep(locale_ref loc) {
-  return Char(thousands_sep_impl<char>(loc));
+template <typename Char> struct thousands_sep_result {
+  std::string grouping;
+  Char thousands_sep;
+};
+
+template <typename Char>
+FMT_API auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result<Char>;
+template <typename Char>
+inline auto thousands_sep(locale_ref loc) -> thousands_sep_result<Char> {
+  auto result = thousands_sep_impl<char>(loc);
+  return {result.grouping, Char(result.thousands_sep)};
 }
-template <> inline wchar_t thousands_sep(locale_ref loc) {
+template <>
+inline auto thousands_sep(locale_ref loc) -> thousands_sep_result<wchar_t> {
   return thousands_sep_impl<wchar_t>(loc);
 }
 
-template <typename Char> FMT_API Char decimal_point_impl(locale_ref loc);
-template <typename Char> inline Char decimal_point(locale_ref loc) {
+template <typename Char>
+FMT_API auto decimal_point_impl(locale_ref loc) -> Char;
+template <typename Char> inline auto decimal_point(locale_ref loc) -> Char {
   return Char(decimal_point_impl<char>(loc));
 }
-template <> inline wchar_t decimal_point(locale_ref loc) {
+template <> inline auto decimal_point(locale_ref loc) -> wchar_t {
   return decimal_point_impl<wchar_t>(loc);
 }
 
 // Compares two characters for equality.
-template <typename Char> bool equal2(const Char* lhs, const char* rhs) {
-  return lhs[0] == rhs[0] && lhs[1] == rhs[1];
+template <typename Char> auto equal2(const Char* lhs, const char* rhs) -> bool {
+  return lhs[0] == Char(rhs[0]) && lhs[1] == Char(rhs[1]);
 }
-inline bool equal2(const char* lhs, const char* rhs) {
+inline auto equal2(const char* lhs, const char* rhs) -> bool {
   return memcmp(lhs, rhs, 2) == 0;
 }
 
 // Copies two characters from src to dst.
-template <typename Char> void copy2(Char* dst, const char* src) {
+template <typename Char>
+FMT_CONSTEXPR20 FMT_INLINE void copy2(Char* dst, const char* src) {
+  if (!is_constant_evaluated() && sizeof(Char) == sizeof(char)) {
+    memcpy(dst, src, 2);
+    return;
+  }
   *dst++ = static_cast<Char>(*src++);
   *dst = static_cast<Char>(*src);
 }
-FMT_INLINE void copy2(char* dst, const char* src) { memcpy(dst, src, 2); }
 
 template <typename Iterator> struct format_decimal_result {
   Iterator begin;
@@ -1046,8 +1103,8 @@ template <typename Iterator> struct format_decimal_result {
 // buffer of specified size. The caller must ensure that the buffer is large
 // enough.
 template <typename Char, typename UInt>
-inline format_decimal_result<Char*> format_decimal(Char* out, UInt value,
-                                                   int size) {
+FMT_CONSTEXPR20 auto format_decimal(Char* out, UInt value, int size)
+    -> format_decimal_result<Char*> {
   FMT_ASSERT(size >= count_digits(value), "invalid digit count");
   out += size;
   Char* end = out;
@@ -1056,7 +1113,7 @@ inline format_decimal_result<Char*> format_decimal(Char* out, UInt value,
     // of for every digit. The idea comes from the talk by Alexandrescu
     // "Three Optimization Tips for C++". See speed-test for a comparison.
     out -= 2;
-    copy2(out, data::digits[value % 100]);
+    copy2(out, digits2(static_cast<size_t>(value % 100)));
     value /= 100;
   }
   if (value < 10) {
@@ -1064,27 +1121,27 @@ inline format_decimal_result<Char*> format_decimal(Char* out, UInt value,
     return {out, end};
   }
   out -= 2;
-  copy2(out, data::digits[value]);
+  copy2(out, digits2(static_cast<size_t>(value)));
   return {out, end};
 }
 
 template <typename Char, typename UInt, typename Iterator,
           FMT_ENABLE_IF(!std::is_pointer<remove_cvref_t<Iterator>>::value)>
-inline format_decimal_result<Iterator> format_decimal(Iterator out, UInt value,
-                                                      int size) {
+inline auto format_decimal(Iterator out, UInt value, int size)
+    -> format_decimal_result<Iterator> {
   // Buffer is large enough to hold all digits (digits10 + 1).
   Char buffer[digits10<UInt>() + 1];
   auto end = format_decimal(buffer, value, size).end;
-  return {out, detail::copy_str<Char>(buffer, end, out)};
+  return {out, detail::copy_str_noinline<Char>(buffer, end, out)};
 }
 
 template <unsigned BASE_BITS, typename Char, typename UInt>
-inline Char* format_uint(Char* buffer, UInt value, int num_digits,
-                         bool upper = false) {
+FMT_CONSTEXPR auto format_uint(Char* buffer, UInt value, int num_digits,
+                               bool upper = false) -> Char* {
   buffer += num_digits;
   Char* end = buffer;
   do {
-    const char* digits = upper ? "0123456789ABCDEF" : data::hex_digits;
+    const char* digits = upper ? "0123456789ABCDEF" : "0123456789abcdef";
     unsigned digit = (value & ((1 << BASE_BITS) - 1));
     *--buffer = static_cast<Char>(BASE_BITS < 4 ? static_cast<char>('0' + digit)
                                                 : digits[digit]);
@@ -1093,8 +1150,8 @@ inline Char* format_uint(Char* buffer, UInt value, int num_digits,
 }
 
 template <unsigned BASE_BITS, typename Char>
-Char* format_uint(Char* buffer, detail::fallback_uintptr n, int num_digits,
-                  bool = false) {
+auto format_uint(Char* buffer, detail::fallback_uintptr n, int num_digits,
+                 bool = false) -> Char* {
   auto char_digits = std::numeric_limits<unsigned char>::digits / 4;
   int start = (num_digits + char_digits - 1) / char_digits - 1;
   if (int start_digits = num_digits % char_digits) {
@@ -1107,7 +1164,7 @@ Char* format_uint(Char* buffer, detail::fallback_uintptr n, int num_digits,
     auto p = buffer;
     for (int i = 0; i < char_digits; ++i) {
       unsigned digit = (value & ((1 << BASE_BITS) - 1));
-      *--p = static_cast<Char>(data::hex_digits[digit]);
+      *--p = static_cast<Char>("0123456789abcdef"[digit]);
       value >>= BASE_BITS;
     }
   }
@@ -1115,7 +1172,8 @@ Char* format_uint(Char* buffer, detail::fallback_uintptr n, int num_digits,
 }
 
 template <unsigned BASE_BITS, typename Char, typename It, typename UInt>
-inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
+inline auto format_uint(It out, UInt value, int num_digits, bool upper = false)
+    -> It {
   if (auto ptr = to_pointer<Char>(out, to_unsigned(num_digits))) {
     format_uint<BASE_BITS>(ptr, value, num_digits, upper);
     return out;
@@ -1123,86 +1181,22 @@ inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
   // Buffer should be large enough to hold all digits (digits / BASE_BITS + 1).
   char buffer[num_bits<UInt>() / BASE_BITS + 1];
   format_uint<BASE_BITS>(buffer, value, num_digits, upper);
-  return detail::copy_str<Char>(buffer, buffer + num_digits, out);
+  return detail::copy_str_noinline<Char>(buffer, buffer + num_digits, out);
 }
 
 // A converter from UTF-8 to UTF-16.
 class utf8_to_utf16 {
  private:
-  wmemory_buffer buffer_;
+  basic_memory_buffer<wchar_t> buffer_;
 
  public:
   FMT_API explicit utf8_to_utf16(string_view s);
-  operator wstring_view() const { return {&buffer_[0], size()}; }
-  size_t size() const { return buffer_.size() - 1; }
-  const wchar_t* c_str() const { return &buffer_[0]; }
-  std::wstring str() const { return {&buffer_[0], size()}; }
+  operator basic_string_view<wchar_t>() const { return {&buffer_[0], size()}; }
+  auto size() const -> size_t { return buffer_.size() - 1; }
+  auto c_str() const -> const wchar_t* { return &buffer_[0]; }
+  auto str() const -> std::wstring { return {&buffer_[0], size()}; }
 };
 
-template <typename T = void> struct null {};
-
-// Workaround an array initialization issue in gcc 4.8.
-template <typename Char> struct fill_t {
- private:
-  enum { max_size = 4 };
-  Char data_[max_size] = {Char(' '), Char(0), Char(0), Char(0)};
-  unsigned char size_ = 1;
-
- public:
-  FMT_CONSTEXPR void operator=(basic_string_view<Char> s) {
-    auto size = s.size();
-    if (size > max_size) {
-      FMT_THROW(format_error("invalid fill"));
-      return;
-    }
-    for (size_t i = 0; i < size; ++i) data_[i] = s[i];
-    size_ = static_cast<unsigned char>(size);
-  }
-
-  size_t size() const { return size_; }
-  const Char* data() const { return data_; }
-
-  FMT_CONSTEXPR Char& operator[](size_t index) { return data_[index]; }
-  FMT_CONSTEXPR const Char& operator[](size_t index) const {
-    return data_[index];
-  }
-};
-}  // namespace detail
-
-// We cannot use enum classes as bit fields because of a gcc bug
-// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414.
-namespace align {
-enum type { none, left, right, center, numeric };
-}
-using align_t = align::type;
-
-namespace sign {
-enum type { none, minus, plus, space };
-}
-using sign_t = sign::type;
-
-// Format specifiers for built-in and string types.
-template <typename Char> struct basic_format_specs {
-  int width;
-  int precision;
-  char type;
-  align_t align : 4;
-  sign_t sign : 3;
-  bool alt : 1;  // Alternate form ('#').
-  detail::fill_t<Char> fill;
-
-  constexpr basic_format_specs()
-      : width(0),
-        precision(-1),
-        type(0),
-        align(align::none),
-        sign(sign::none),
-        alt(false) {}
-};
-
-using format_specs = basic_format_specs<char>;
-
-namespace detail {
 namespace dragonbox {
 
 // Type-specific information that Dragonbox uses.
@@ -1266,37 +1260,21 @@ template <typename T> struct decimal_fp {
   int exponent;
 };
 
-template <typename T> FMT_API decimal_fp<T> to_decimal(T x) FMT_NOEXCEPT;
+template <typename T>
+FMT_API auto to_decimal(T x) FMT_NOEXCEPT -> decimal_fp<T>;
 }  // namespace dragonbox
 
 template <typename T>
-constexpr typename dragonbox::float_info<T>::carrier_uint exponent_mask() {
+constexpr auto exponent_mask() ->
+    typename dragonbox::float_info<T>::carrier_uint {
   using uint = typename dragonbox::float_info<T>::carrier_uint;
   return ((uint(1) << dragonbox::float_info<T>::exponent_bits) - 1)
          << dragonbox::float_info<T>::significand_bits;
 }
 
-// A floating-point presentation format.
-enum class float_format : unsigned char {
-  general,  // General: exponent notation or fixed point based on magnitude.
-  exp,      // Exponent notation with the default precision of 6, e.g. 1.2e-3.
-  fixed,    // Fixed point with the default precision of 6, e.g. 0.0012.
-  hex
-};
-
-struct float_specs {
-  int precision;
-  float_format format : 8;
-  sign_t sign : 8;
-  bool upper : 1;
-  bool locale : 1;
-  bool binary32 : 1;
-  bool use_grisu : 1;
-  bool showpoint : 1;
-};
-
 // Writes the exponent exp in the form "[+-]d{2,3}" to buffer.
-template <typename Char, typename It> It write_exponent(int exp, It it) {
+template <typename Char, typename It>
+FMT_CONSTEXPR auto write_exponent(int exp, It it) -> It {
   FMT_ASSERT(-10000 < exp && exp < 10000, "exponent out of range");
   if (exp < 0) {
     *it++ = static_cast<Char>('-');
@@ -1305,185 +1283,42 @@ template <typename Char, typename It> It write_exponent(int exp, It it) {
     *it++ = static_cast<Char>('+');
   }
   if (exp >= 100) {
-    const char* top = data::digits[exp / 100];
+    const char* top = digits2(to_unsigned(exp / 100));
     if (exp >= 1000) *it++ = static_cast<Char>(top[0]);
     *it++ = static_cast<Char>(top[1]);
     exp %= 100;
   }
-  const char* d = data::digits[exp];
+  const char* d = digits2(to_unsigned(exp));
   *it++ = static_cast<Char>(d[0]);
   *it++ = static_cast<Char>(d[1]);
   return it;
 }
 
 template <typename T>
-int format_float(T value, int precision, float_specs specs, buffer<char>& buf);
+FMT_HEADER_ONLY_CONSTEXPR20 auto format_float(T value, int precision,
+                                              float_specs specs,
+                                              buffer<char>& buf) -> int;
 
 // Formats a floating-point number with snprintf.
 template <typename T>
-int snprintf_float(T value, int precision, float_specs specs,
-                   buffer<char>& buf);
+auto snprintf_float(T value, int precision, float_specs specs,
+                    buffer<char>& buf) -> int;
 
-template <typename T> T promote_float(T value) { return value; }
-inline double promote_float(float value) { return static_cast<double>(value); }
-
-template <typename Handler>
-FMT_CONSTEXPR void handle_int_type_spec(char spec, Handler&& handler) {
-  switch (spec) {
-  case 0:
-  case 'd':
-    handler.on_dec();
-    break;
-  case 'x':
-  case 'X':
-    handler.on_hex();
-    break;
-  case 'b':
-  case 'B':
-    handler.on_bin();
-    break;
-  case 'o':
-    handler.on_oct();
-    break;
-#ifdef FMT_DEPRECATED_N_SPECIFIER
-  case 'n':
-#endif
-  case 'L':
-    handler.on_num();
-    break;
-  case 'c':
-    handler.on_chr();
-    break;
-  default:
-    handler.on_error();
-  }
+template <typename T> constexpr auto promote_float(T value) -> T {
+  return value;
 }
-
-template <typename ErrorHandler = error_handler, typename Char>
-FMT_CONSTEXPR float_specs parse_float_type_spec(
-    const basic_format_specs<Char>& specs, ErrorHandler&& eh = {}) {
-  auto result = float_specs();
-  result.showpoint = specs.alt;
-  switch (specs.type) {
-  case 0:
-    result.format = float_format::general;
-    result.showpoint |= specs.precision > 0;
-    break;
-  case 'G':
-    result.upper = true;
-    FMT_FALLTHROUGH;
-  case 'g':
-    result.format = float_format::general;
-    break;
-  case 'E':
-    result.upper = true;
-    FMT_FALLTHROUGH;
-  case 'e':
-    result.format = float_format::exp;
-    result.showpoint |= specs.precision != 0;
-    break;
-  case 'F':
-    result.upper = true;
-    FMT_FALLTHROUGH;
-  case 'f':
-    result.format = float_format::fixed;
-    result.showpoint |= specs.precision != 0;
-    break;
-  case 'A':
-    result.upper = true;
-    FMT_FALLTHROUGH;
-  case 'a':
-    result.format = float_format::hex;
-    break;
-#ifdef FMT_DEPRECATED_N_SPECIFIER
-  case 'n':
-#endif
-  case 'L':
-    result.locale = true;
-    break;
-  default:
-    eh.on_error("invalid type specifier");
-    break;
-  }
-  return result;
+constexpr auto promote_float(float value) -> double {
+  return static_cast<double>(value);
 }
 
-template <typename Char, typename Handler>
-FMT_CONSTEXPR void handle_char_specs(const basic_format_specs<Char>* specs,
-                                     Handler&& handler) {
-  if (!specs) return handler.on_char();
-  if (specs->type && specs->type != 'c') return handler.on_int();
-  if (specs->align == align::numeric || specs->sign != sign::none || specs->alt)
-    handler.on_error("invalid format specifier for char");
-  handler.on_char();
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR void handle_cstring_type_spec(Char spec, Handler&& handler) {
-  if (spec == 0 || spec == 's')
-    handler.on_string();
-  else if (spec == 'p')
-    handler.on_pointer();
-  else
-    handler.on_error("invalid type specifier");
-}
-
-template <typename Char, typename ErrorHandler>
-FMT_CONSTEXPR void check_string_type_spec(Char spec, ErrorHandler&& eh) {
-  if (spec != 0 && spec != 's') eh.on_error("invalid type specifier");
-}
-
-template <typename Char, typename ErrorHandler>
-FMT_CONSTEXPR void check_pointer_type_spec(Char spec, ErrorHandler&& eh) {
-  if (spec != 0 && spec != 'p') eh.on_error("invalid type specifier");
-}
-
-template <typename ErrorHandler> class int_type_checker : private ErrorHandler {
- public:
-  FMT_CONSTEXPR explicit int_type_checker(ErrorHandler eh) : ErrorHandler(eh) {}
-
-  FMT_CONSTEXPR void on_dec() {}
-  FMT_CONSTEXPR void on_hex() {}
-  FMT_CONSTEXPR void on_bin() {}
-  FMT_CONSTEXPR void on_oct() {}
-  FMT_CONSTEXPR void on_num() {}
-  FMT_CONSTEXPR void on_chr() {}
-
-  FMT_CONSTEXPR void on_error() {
-    ErrorHandler::on_error("invalid type specifier");
-  }
-};
-
-template <typename ErrorHandler>
-class char_specs_checker : public ErrorHandler {
- private:
-  char type_;
-
- public:
-  FMT_CONSTEXPR char_specs_checker(char type, ErrorHandler eh)
-      : ErrorHandler(eh), type_(type) {}
-
-  FMT_CONSTEXPR void on_int() {
-    handle_int_type_spec(type_, int_type_checker<ErrorHandler>(*this));
-  }
-  FMT_CONSTEXPR void on_char() {}
-};
-
-template <typename ErrorHandler>
-class cstring_type_checker : public ErrorHandler {
- public:
-  FMT_CONSTEXPR explicit cstring_type_checker(ErrorHandler eh)
-      : ErrorHandler(eh) {}
-
-  FMT_CONSTEXPR void on_string() {}
-  FMT_CONSTEXPR void on_pointer() {}
-};
-
 template <typename OutputIt, typename Char>
-FMT_NOINLINE OutputIt fill(OutputIt it, size_t n, const fill_t<Char>& fill) {
+FMT_NOINLINE FMT_CONSTEXPR auto fill(OutputIt it, size_t n,
+                                     const fill_t<Char>& fill) -> OutputIt {
   auto fill_size = fill.size();
-  if (fill_size == 1) return std::fill_n(it, n, fill[0]);
-  for (size_t i = 0; i < n; ++i) it = std::copy_n(fill.data(), fill_size, it);
+  if (fill_size == 1) return detail::fill_n(it, n, fill[0]);
+  auto data = fill.data();
+  for (size_t i = 0; i < n; ++i)
+    it = copy_str<Char>(data, data + fill_size, it);
   return it;
 }
 
@@ -1492,39 +1327,73 @@ FMT_NOINLINE OutputIt fill(OutputIt it, size_t n, const fill_t<Char>& fill) {
 // width: output display width in (terminal) column positions.
 template <align::type align = align::left, typename OutputIt, typename Char,
           typename F>
-inline OutputIt write_padded(OutputIt out,
-                             const basic_format_specs<Char>& specs, size_t size,
-                             size_t width, F&& f) {
+FMT_CONSTEXPR auto write_padded(OutputIt out,
+                                const basic_format_specs<Char>& specs,
+                                size_t size, size_t width, F&& f) -> OutputIt {
   static_assert(align == align::left || align == align::right, "");
   unsigned spec_width = to_unsigned(specs.width);
   size_t padding = spec_width > width ? spec_width - width : 0;
-  auto* shifts = align == align::left ? data::left_padding_shifts
-                                      : data::right_padding_shifts;
+  // Shifts are encoded as string literals because static constexpr is not
+  // supported in constexpr functions.
+  auto* shifts = align == align::left ? "\x1f\x1f\x00\x01" : "\x00\x1f\x00\x01";
   size_t left_padding = padding >> shifts[specs.align];
+  size_t right_padding = padding - left_padding;
   auto it = reserve(out, size + padding * specs.fill.size());
-  it = fill(it, left_padding, specs.fill);
+  if (left_padding != 0) it = fill(it, left_padding, specs.fill);
   it = f(it);
-  it = fill(it, padding - left_padding, specs.fill);
+  if (right_padding != 0) it = fill(it, right_padding, specs.fill);
   return base_iterator(out, it);
 }
 
 template <align::type align = align::left, typename OutputIt, typename Char,
           typename F>
-inline OutputIt write_padded(OutputIt out,
-                             const basic_format_specs<Char>& specs, size_t size,
-                             F&& f) {
+constexpr auto write_padded(OutputIt out, const basic_format_specs<Char>& specs,
+                            size_t size, F&& f) -> OutputIt {
   return write_padded<align>(out, specs, size, size, f);
 }
 
+template <align::type align = align::left, typename Char, typename OutputIt>
+FMT_CONSTEXPR auto write_bytes(OutputIt out, string_view bytes,
+                               const basic_format_specs<Char>& specs)
+    -> OutputIt {
+  return write_padded<align>(
+      out, specs, bytes.size(), [bytes](reserve_iterator<OutputIt> it) {
+        const char* data = bytes.data();
+        return copy_str<Char>(data, data + bytes.size(), it);
+      });
+}
+
+template <typename Char, typename OutputIt, typename UIntPtr>
+auto write_ptr(OutputIt out, UIntPtr value,
+               const basic_format_specs<Char>* specs) -> OutputIt {
+  int num_digits = count_digits<4>(value);
+  auto size = to_unsigned(num_digits) + size_t(2);
+  auto write = [=](reserve_iterator<OutputIt> it) {
+    *it++ = static_cast<Char>('0');
+    *it++ = static_cast<Char>('x');
+    return format_uint<4, Char>(it, value, num_digits);
+  };
+  return specs ? write_padded<align::right>(out, *specs, size, write)
+               : base_iterator(out, write(reserve(out, size)));
+}
+
 template <typename Char, typename OutputIt>
-OutputIt write_bytes(OutputIt out, string_view bytes,
-                     const basic_format_specs<Char>& specs) {
-  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
-  return write_padded(out, specs, bytes.size(), [bytes](iterator it) {
-    const char* data = bytes.data();
-    return copy_str<Char>(data, data + bytes.size(), it);
+FMT_CONSTEXPR auto write_char(OutputIt out, Char value,
+                              const basic_format_specs<Char>& specs)
+    -> OutputIt {
+  return write_padded(out, specs, 1, [=](reserve_iterator<OutputIt> it) {
+    *it++ = value;
+    return it;
   });
 }
+template <typename Char, typename OutputIt>
+FMT_CONSTEXPR auto write(OutputIt out, Char value,
+                         const basic_format_specs<Char>& specs,
+                         locale_ref loc = {}) -> OutputIt {
+  return check_char_specs(specs)
+             ? write_char(out, value, specs)
+             : write(out, static_cast<int>(value), specs, loc);
+}
 
 // Data for write_int that doesn't depend on output iterator type. It is used to
 // avoid template code bloat.
@@ -1532,9 +1401,9 @@ template <typename Char> struct write_int_data {
   size_t size;
   size_t padding;
 
-  write_int_data(int num_digits, string_view prefix,
-                 const basic_format_specs<Char>& specs)
-      : size(prefix.size() + to_unsigned(num_digits)), padding(0) {
+  FMT_CONSTEXPR write_int_data(int num_digits, unsigned prefix,
+                               const basic_format_specs<Char>& specs)
+      : size((prefix >> 24) + to_unsigned(num_digits)), padding(0) {
     if (specs.align == align::numeric) {
       auto width = to_unsigned(specs.width);
       if (width > size) {
@@ -1542,7 +1411,7 @@ template <typename Char> struct write_int_data {
         size = width;
       }
     } else if (specs.precision > num_digits) {
-      size = prefix.size() + to_unsigned(specs.precision);
+      size = (prefix >> 24) + to_unsigned(specs.precision);
       padding = to_unsigned(specs.precision - num_digits);
     }
   }
@@ -1550,183 +1419,280 @@ template <typename Char> struct write_int_data {
 
 // Writes an integer in the format
 //   <left-padding><prefix><numeric-padding><digits><right-padding>
-// where <digits> are written by f(it).
-template <typename OutputIt, typename Char, typename F>
-OutputIt write_int(OutputIt out, int num_digits, string_view prefix,
-                   const basic_format_specs<Char>& specs, F f) {
+// where <digits> are written by write_digits(it).
+// prefix contains chars in three lower bytes and the size in the fourth byte.
+template <typename OutputIt, typename Char, typename W>
+FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, int num_digits,
+                                        unsigned prefix,
+                                        const basic_format_specs<Char>& specs,
+                                        W write_digits) -> OutputIt {
+  // Slightly faster check for specs.width == 0 && specs.precision == -1.
+  if ((specs.width | (specs.precision + 1)) == 0) {
+    auto it = reserve(out, to_unsigned(num_digits) + (prefix >> 24));
+    if (prefix != 0) {
+      for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
+        *it++ = static_cast<Char>(p & 0xff);
+    }
+    return base_iterator(out, write_digits(it));
+  }
   auto data = write_int_data<Char>(num_digits, prefix, specs);
-  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
-  return write_padded<align::right>(out, specs, data.size, [=](iterator it) {
-    if (prefix.size() != 0)
-      it = copy_str<Char>(prefix.begin(), prefix.end(), it);
-    it = std::fill_n(it, data.padding, static_cast<Char>('0'));
-    return f(it);
-  });
+  return write_padded<align::right>(
+      out, specs, data.size, [=](reserve_iterator<OutputIt> it) {
+        for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
+          *it++ = static_cast<Char>(p & 0xff);
+        it = detail::fill_n(it, data.padding, static_cast<Char>('0'));
+        return write_digits(it);
+      });
 }
 
-template <typename StrChar, typename Char, typename OutputIt>
-OutputIt write(OutputIt out, basic_string_view<StrChar> s,
-               const basic_format_specs<Char>& specs) {
+template <typename Char> class digit_grouping {
+ private:
+  thousands_sep_result<Char> sep_;
+
+  struct next_state {
+    std::string::const_iterator group;
+    int pos;
+  };
+  next_state initial_state() const { return {sep_.grouping.begin(), 0}; }
+
+  // Returns the next digit group separator position.
+  int next(next_state& state) const {
+    if (!sep_.thousands_sep) return max_value<int>();
+    if (state.group == sep_.grouping.end())
+      return state.pos += sep_.grouping.back();
+    if (*state.group <= 0 || *state.group == max_value<char>())
+      return max_value<int>();
+    state.pos += *state.group++;
+    return state.pos;
+  }
+
+ public:
+  explicit digit_grouping(locale_ref loc, bool localized = true) {
+    if (localized)
+      sep_ = thousands_sep<Char>(loc);
+    else
+      sep_.thousands_sep = Char();
+  }
+  explicit digit_grouping(thousands_sep_result<Char> sep) : sep_(sep) {}
+
+  Char separator() const { return sep_.thousands_sep; }
+
+  int count_separators(int num_digits) const {
+    int count = 0;
+    auto state = initial_state();
+    while (num_digits > next(state)) ++count;
+    return count;
+  }
+
+  // Applies grouping to digits and write the output to out.
+  template <typename Out, typename C>
+  Out apply(Out out, basic_string_view<C> digits) const {
+    auto num_digits = static_cast<int>(digits.size());
+    auto separators = basic_memory_buffer<int>();
+    separators.push_back(0);
+    auto state = initial_state();
+    while (int i = next(state)) {
+      if (i >= num_digits) break;
+      separators.push_back(i);
+    }
+    for (int i = 0, sep_index = static_cast<int>(separators.size() - 1);
+         i < num_digits; ++i) {
+      if (num_digits - i == separators[sep_index]) {
+        *out++ = separator();
+        --sep_index;
+      }
+      *out++ = static_cast<Char>(digits[to_unsigned(i)]);
+    }
+    return out;
+  }
+};
+
+template <typename OutputIt, typename UInt, typename Char>
+auto write_int_localized(OutputIt out, UInt value, unsigned prefix,
+                         const basic_format_specs<Char>& specs,
+                         const digit_grouping<Char>& grouping) -> OutputIt {
+  static_assert(std::is_same<uint64_or_128_t<UInt>, UInt>::value, "");
+  int num_digits = count_digits(value);
+  char digits[40];
+  format_decimal(digits, value, num_digits);
+  unsigned size = to_unsigned((prefix != 0 ? 1 : 0) + num_digits +
+                              grouping.count_separators(num_digits));
+  return write_padded<align::right>(
+      out, specs, size, size, [&](reserve_iterator<OutputIt> it) {
+        if (prefix != 0) *it++ = static_cast<Char>(prefix);
+        return grouping.apply(it, string_view(digits, to_unsigned(num_digits)));
+      });
+}
+
+template <typename OutputIt, typename UInt, typename Char>
+auto write_int_localized(OutputIt& out, UInt value, unsigned prefix,
+                         const basic_format_specs<Char>& specs, locale_ref loc)
+    -> bool {
+  auto grouping = digit_grouping<Char>(loc);
+  out = write_int_localized(out, value, prefix, specs, grouping);
+  return true;
+}
+
+FMT_CONSTEXPR inline void prefix_append(unsigned& prefix, unsigned value) {
+  prefix |= prefix != 0 ? value << 8 : value;
+  prefix += (1u + (value > 0xff ? 1 : 0)) << 24;
+}
+
+template <typename UInt> struct write_int_arg {
+  UInt abs_value;
+  unsigned prefix;
+};
+
+template <typename T>
+FMT_CONSTEXPR auto make_write_int_arg(T value, sign_t sign)
+    -> write_int_arg<uint32_or_64_or_128_t<T>> {
+  auto prefix = 0u;
+  auto abs_value = static_cast<uint32_or_64_or_128_t<T>>(value);
+  if (is_negative(value)) {
+    prefix = 0x01000000 | '-';
+    abs_value = 0 - abs_value;
+  } else {
+    constexpr const unsigned prefixes[4] = {0, 0, 0x1000000u | '+',
+                                            0x1000000u | ' '};
+    prefix = prefixes[sign];
+  }
+  return {abs_value, prefix};
+}
+
+template <typename Char, typename OutputIt, typename T>
+FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, write_int_arg<T> arg,
+                                        const basic_format_specs<Char>& specs,
+                                        locale_ref loc) -> OutputIt {
+  static_assert(std::is_same<T, uint32_or_64_or_128_t<T>>::value, "");
+  auto abs_value = arg.abs_value;
+  auto prefix = arg.prefix;
+  switch (specs.type) {
+  case presentation_type::none:
+  case presentation_type::dec: {
+    if (specs.localized &&
+        write_int_localized(out, static_cast<uint64_or_128_t<T>>(abs_value),
+                            prefix, specs, loc)) {
+      return out;
+    }
+    auto num_digits = count_digits(abs_value);
+    return write_int(
+        out, num_digits, prefix, specs, [=](reserve_iterator<OutputIt> it) {
+          return format_decimal<Char>(it, abs_value, num_digits).end;
+        });
+  }
+  case presentation_type::hex_lower:
+  case presentation_type::hex_upper: {
+    bool upper = specs.type == presentation_type::hex_upper;
+    if (specs.alt)
+      prefix_append(prefix, unsigned(upper ? 'X' : 'x') << 8 | '0');
+    int num_digits = count_digits<4>(abs_value);
+    return write_int(
+        out, num_digits, prefix, specs, [=](reserve_iterator<OutputIt> it) {
+          return format_uint<4, Char>(it, abs_value, num_digits, upper);
+        });
+  }
+  case presentation_type::bin_lower:
+  case presentation_type::bin_upper: {
+    bool upper = specs.type == presentation_type::bin_upper;
+    if (specs.alt)
+      prefix_append(prefix, unsigned(upper ? 'B' : 'b') << 8 | '0');
+    int num_digits = count_digits<1>(abs_value);
+    return write_int(out, num_digits, prefix, specs,
+                     [=](reserve_iterator<OutputIt> it) {
+                       return format_uint<1, Char>(it, abs_value, num_digits);
+                     });
+  }
+  case presentation_type::oct: {
+    int num_digits = count_digits<3>(abs_value);
+    // Octal prefix '0' is counted as a digit, so only add it if precision
+    // is not greater than the number of digits.
+    if (specs.alt && specs.precision <= num_digits && abs_value != 0)
+      prefix_append(prefix, '0');
+    return write_int(out, num_digits, prefix, specs,
+                     [=](reserve_iterator<OutputIt> it) {
+                       return format_uint<3, Char>(it, abs_value, num_digits);
+                     });
+  }
+  case presentation_type::chr:
+    return write_char(out, static_cast<Char>(abs_value), specs);
+  default:
+    throw_format_error("invalid type specifier");
+  }
+  return out;
+}
+template <typename Char, typename OutputIt, typename T>
+FMT_CONSTEXPR FMT_NOINLINE auto write_int_noinline(
+    OutputIt out, write_int_arg<T> arg, const basic_format_specs<Char>& specs,
+    locale_ref loc) -> OutputIt {
+  return write_int(out, arg, specs, loc);
+}
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(is_integral<T>::value &&
+                        !std::is_same<T, bool>::value &&
+                        std::is_same<OutputIt, buffer_appender<Char>>::value)>
+FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value,
+                                    const basic_format_specs<Char>& specs,
+                                    locale_ref loc) -> OutputIt {
+  return write_int_noinline(out, make_write_int_arg(value, specs.sign), specs,
+                            loc);
+}
+// An inlined version of write used in format string compilation.
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(is_integral<T>::value &&
+                        !std::is_same<T, bool>::value &&
+                        !std::is_same<OutputIt, buffer_appender<Char>>::value)>
+FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value,
+                                    const basic_format_specs<Char>& specs,
+                                    locale_ref loc) -> OutputIt {
+  return write_int(out, make_write_int_arg(value, specs.sign), specs, loc);
+}
+
+template <typename Char, typename OutputIt>
+FMT_CONSTEXPR auto write(OutputIt out, basic_string_view<Char> s,
+                         const basic_format_specs<Char>& specs) -> OutputIt {
   auto data = s.data();
   auto size = s.size();
   if (specs.precision >= 0 && to_unsigned(specs.precision) < size)
     size = code_point_index(s, to_unsigned(specs.precision));
-  auto width = specs.width != 0
-                   ? count_code_points(basic_string_view<StrChar>(data, size))
-                   : 0;
-  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
-  return write_padded(out, specs, size, width, [=](iterator it) {
-    return copy_str<Char>(data, data + size, it);
-  });
+  auto width =
+      specs.width != 0 ? compute_width(basic_string_view<Char>(data, size)) : 0;
+  return write_padded(out, specs, size, width,
+                      [=](reserve_iterator<OutputIt> it) {
+                        return copy_str<Char>(data, data + size, it);
+                      });
+}
+template <typename Char, typename OutputIt>
+FMT_CONSTEXPR auto write(OutputIt out,
+                         basic_string_view<type_identity_t<Char>> s,
+                         const basic_format_specs<Char>& specs, locale_ref)
+    -> OutputIt {
+  check_string_type_spec(specs.type);
+  return write(out, s, specs);
+}
+template <typename Char, typename OutputIt>
+FMT_CONSTEXPR auto write(OutputIt out, const Char* s,
+                         const basic_format_specs<Char>& specs, locale_ref)
+    -> OutputIt {
+  return check_cstring_type_spec(specs.type)
+             ? write(out, basic_string_view<Char>(s), specs, {})
+             : write_ptr<Char>(out, to_uintptr(s), &specs);
 }
 
-// The handle_int_type_spec handler that writes an integer.
-template <typename OutputIt, typename Char, typename UInt> struct int_writer {
-  OutputIt out;
-  locale_ref locale;
-  const basic_format_specs<Char>& specs;
-  UInt abs_value;
-  char prefix[4];
-  unsigned prefix_size;
-
-  using iterator =
-      remove_reference_t<decltype(reserve(std::declval<OutputIt&>(), 0))>;
-
-  string_view get_prefix() const { return string_view(prefix, prefix_size); }
-
-  template <typename Int>
-  int_writer(OutputIt output, locale_ref loc, Int value,
-             const basic_format_specs<Char>& s)
-      : out(output),
-        locale(loc),
-        specs(s),
-        abs_value(static_cast<UInt>(value)),
-        prefix_size(0) {
-    static_assert(std::is_same<uint32_or_64_or_128_t<Int>, UInt>::value, "");
-    if (is_negative(value)) {
-      prefix[0] = '-';
-      ++prefix_size;
-      abs_value = 0 - abs_value;
-    } else if (specs.sign != sign::none && specs.sign != sign::minus) {
-      prefix[0] = specs.sign == sign::plus ? '+' : ' ';
-      ++prefix_size;
-    }
-  }
-
-  void on_dec() {
-    auto num_digits = count_digits(abs_value);
-    out = write_int(
-        out, num_digits, get_prefix(), specs, [this, num_digits](iterator it) {
-          return format_decimal<Char>(it, abs_value, num_digits).end;
-        });
-  }
-
-  void on_hex() {
-    if (specs.alt) {
-      prefix[prefix_size++] = '0';
-      prefix[prefix_size++] = specs.type;
-    }
-    int num_digits = count_digits<4>(abs_value);
-    out = write_int(out, num_digits, get_prefix(), specs,
-                    [this, num_digits](iterator it) {
-                      return format_uint<4, Char>(it, abs_value, num_digits,
-                                                  specs.type != 'x');
-                    });
-  }
-
-  void on_bin() {
-    if (specs.alt) {
-      prefix[prefix_size++] = '0';
-      prefix[prefix_size++] = static_cast<char>(specs.type);
-    }
-    int num_digits = count_digits<1>(abs_value);
-    out = write_int(out, num_digits, get_prefix(), specs,
-                    [this, num_digits](iterator it) {
-                      return format_uint<1, Char>(it, abs_value, num_digits);
-                    });
-  }
-
-  void on_oct() {
-    int num_digits = count_digits<3>(abs_value);
-    if (specs.alt && specs.precision <= num_digits && abs_value != 0) {
-      // Octal prefix '0' is counted as a digit, so only add it if precision
-      // is not greater than the number of digits.
-      prefix[prefix_size++] = '0';
-    }
-    out = write_int(out, num_digits, get_prefix(), specs,
-                    [this, num_digits](iterator it) {
-                      return format_uint<3, Char>(it, abs_value, num_digits);
-                    });
-  }
-
-  enum { sep_size = 1 };
-
-  void on_num() {
-    std::string groups = grouping<Char>(locale);
-    if (groups.empty()) return on_dec();
-    auto sep = thousands_sep<Char>(locale);
-    if (!sep) return on_dec();
-    int num_digits = count_digits(abs_value);
-    int size = num_digits, n = num_digits;
-    std::string::const_iterator group = groups.cbegin();
-    while (group != groups.cend() && n > *group && *group > 0 &&
-           *group != max_value<char>()) {
-      size += sep_size;
-      n -= *group;
-      ++group;
-    }
-    if (group == groups.cend()) size += sep_size * ((n - 1) / groups.back());
-    char digits[40];
-    format_decimal(digits, abs_value, num_digits);
-    basic_memory_buffer<Char> buffer;
-    size += static_cast<int>(prefix_size);
-    const auto usize = to_unsigned(size);
-    buffer.resize(usize);
-    basic_string_view<Char> s(&sep, sep_size);
-    // Index of a decimal digit with the least significant digit having index 0.
-    int digit_index = 0;
-    group = groups.cbegin();
-    auto p = buffer.data() + size - 1;
-    for (int i = num_digits - 1; i > 0; --i) {
-      *p-- = static_cast<Char>(digits[i]);
-      if (*group <= 0 || ++digit_index % *group != 0 ||
-          *group == max_value<char>())
-        continue;
-      if (group + 1 != groups.cend()) {
-        digit_index = 0;
-        ++group;
-      }
-      std::uninitialized_copy(s.data(), s.data() + s.size(),
-                              make_checked(p, s.size()));
-      p -= s.size();
-    }
-    *p-- = static_cast<Char>(*digits);
-    if (prefix_size != 0) *p = static_cast<Char>('-');
-    auto data = buffer.data();
-    out = write_padded<align::right>(
-        out, specs, usize, usize,
-        [=](iterator it) { return copy_str<Char>(data, data + size, it); });
-  }
-
-  void on_chr() { *out++ = static_cast<Char>(abs_value); }
-
-  FMT_NORETURN void on_error() {
-    FMT_THROW(format_error("invalid type specifier"));
-  }
-};
-
 template <typename Char, typename OutputIt>
-OutputIt write_nonfinite(OutputIt out, bool isinf,
-                         const basic_format_specs<Char>& specs,
-                         const float_specs& fspecs) {
+FMT_CONSTEXPR20 auto write_nonfinite(OutputIt out, bool isinf,
+                                     basic_format_specs<Char> specs,
+                                     const float_specs& fspecs) -> OutputIt {
   auto str =
       isinf ? (fspecs.upper ? "INF" : "inf") : (fspecs.upper ? "NAN" : "nan");
   constexpr size_t str_size = 3;
   auto sign = fspecs.sign;
   auto size = str_size + (sign ? 1 : 0);
-  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
-  return write_padded(out, specs, size, [=](iterator it) {
-    if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+  // Replace '0'-padding with space for non-finite values.
+  const bool is_zero_fill =
+      specs.fill.size() == 1 && *specs.fill.data() == static_cast<Char>('0');
+  if (is_zero_fill) specs.fill[0] = static_cast<Char>(' ');
+  return write_padded(out, specs, size, [=](reserve_iterator<OutputIt> it) {
+    if (sign) *it++ = detail::sign<Char>(sign);
     return copy_str<Char>(str, str + str_size, it);
   });
 }
@@ -1738,74 +1704,118 @@ struct big_decimal_fp {
   int exponent;
 };
 
-inline int get_significand_size(const big_decimal_fp& fp) {
+constexpr auto get_significand_size(const big_decimal_fp& fp) -> int {
   return fp.significand_size;
 }
 template <typename T>
-inline int get_significand_size(const dragonbox::decimal_fp<T>& fp) {
+inline auto get_significand_size(const dragonbox::decimal_fp<T>& fp) -> int {
   return count_digits(fp.significand);
 }
 
 template <typename Char, typename OutputIt>
-inline OutputIt write_significand(OutputIt out, const char* significand,
-                                  int& significand_size) {
+constexpr auto write_significand(OutputIt out, const char* significand,
+                                 int significand_size) -> OutputIt {
   return copy_str<Char>(significand, significand + significand_size, out);
 }
 template <typename Char, typename OutputIt, typename UInt>
-inline OutputIt write_significand(OutputIt out, UInt significand,
-                                  int significand_size) {
+inline auto write_significand(OutputIt out, UInt significand,
+                              int significand_size) -> OutputIt {
   return format_decimal<Char>(out, significand, significand_size).end;
 }
+template <typename Char, typename OutputIt, typename T, typename Grouping>
+FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand,
+                                       int significand_size, int exponent,
+                                       const Grouping& grouping) -> OutputIt {
+  if (!grouping.separator()) {
+    out = write_significand<Char>(out, significand, significand_size);
+    return detail::fill_n(out, exponent, static_cast<Char>('0'));
+  }
+  auto buffer = memory_buffer();
+  write_significand<char>(appender(buffer), significand, significand_size);
+  detail::fill_n(appender(buffer), exponent, '0');
+  return grouping.apply(out, string_view(buffer.data(), buffer.size()));
+}
 
 template <typename Char, typename UInt,
           FMT_ENABLE_IF(std::is_integral<UInt>::value)>
-inline Char* write_significand(Char* out, UInt significand,
-                               int significand_size, int integral_size,
-                               Char decimal_point) {
+inline auto write_significand(Char* out, UInt significand, int significand_size,
+                              int integral_size, Char decimal_point) -> Char* {
   if (!decimal_point)
     return format_decimal(out, significand, significand_size).end;
-  auto end = format_decimal(out + 1, significand, significand_size).end;
-  if (integral_size == 1)
-    out[0] = out[1];
-  else
-    std::copy_n(out + 1, integral_size, out);
-  out[integral_size] = decimal_point;
+  out += significand_size + 1;
+  Char* end = out;
+  int floating_size = significand_size - integral_size;
+  for (int i = floating_size / 2; i > 0; --i) {
+    out -= 2;
+    copy2(out, digits2(significand % 100));
+    significand /= 100;
+  }
+  if (floating_size % 2 != 0) {
+    *--out = static_cast<Char>('0' + significand % 10);
+    significand /= 10;
+  }
+  *--out = decimal_point;
+  format_decimal(out - integral_size, significand, integral_size);
   return end;
 }
 
 template <typename OutputIt, typename UInt, typename Char,
           FMT_ENABLE_IF(!std::is_pointer<remove_cvref_t<OutputIt>>::value)>
-inline OutputIt write_significand(OutputIt out, UInt significand,
-                                  int significand_size, int integral_size,
-                                  Char decimal_point) {
+inline auto write_significand(OutputIt out, UInt significand,
+                              int significand_size, int integral_size,
+                              Char decimal_point) -> OutputIt {
   // Buffer is large enough to hold digits (digits10 + 1) and a decimal point.
   Char buffer[digits10<UInt>() + 2];
   auto end = write_significand(buffer, significand, significand_size,
                                integral_size, decimal_point);
-  return detail::copy_str<Char>(buffer, end, out);
+  return detail::copy_str_noinline<Char>(buffer, end, out);
 }
 
 template <typename OutputIt, typename Char>
-inline OutputIt write_significand(OutputIt out, const char* significand,
-                                  int significand_size, int integral_size,
-                                  Char decimal_point) {
-  out = detail::copy_str<Char>(significand, significand + integral_size, out);
+FMT_CONSTEXPR auto write_significand(OutputIt out, const char* significand,
+                                     int significand_size, int integral_size,
+                                     Char decimal_point) -> OutputIt {
+  out = detail::copy_str_noinline<Char>(significand,
+                                        significand + integral_size, out);
   if (!decimal_point) return out;
   *out++ = decimal_point;
-  return detail::copy_str<Char>(significand + integral_size,
-                                significand + significand_size, out);
+  return detail::copy_str_noinline<Char>(significand + integral_size,
+                                         significand + significand_size, out);
 }
 
-template <typename OutputIt, typename DecimalFP, typename Char>
-OutputIt write_float(OutputIt out, const DecimalFP& fp,
-                     const basic_format_specs<Char>& specs, float_specs fspecs,
-                     Char decimal_point) {
+template <typename OutputIt, typename Char, typename T, typename Grouping>
+FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand,
+                                       int significand_size, int integral_size,
+                                       Char decimal_point,
+                                       const Grouping& grouping) -> OutputIt {
+  if (!grouping.separator()) {
+    return write_significand(out, significand, significand_size, integral_size,
+                             decimal_point);
+  }
+  auto buffer = basic_memory_buffer<Char>();
+  write_significand(buffer_appender<Char>(buffer), significand,
+                    significand_size, integral_size, decimal_point);
+  grouping.apply(
+      out, basic_string_view<Char>(buffer.data(), to_unsigned(integral_size)));
+  return detail::copy_str_noinline<Char>(buffer.data() + integral_size,
+                                         buffer.end(), out);
+}
+
+template <typename OutputIt, typename DecimalFP, typename Char,
+          typename Grouping = digit_grouping<Char>>
+FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& fp,
+                                    const basic_format_specs<Char>& specs,
+                                    float_specs fspecs, locale_ref loc)
+    -> OutputIt {
   auto significand = fp.significand;
   int significand_size = get_significand_size(fp);
-  static const Char zero = static_cast<Char>('0');
+  constexpr Char zero = static_cast<Char>('0');
   auto sign = fspecs.sign;
   size_t size = to_unsigned(significand_size) + (sign ? 1 : 0);
-  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
+  using iterator = reserve_iterator<OutputIt>;
+
+  Char decimal_point =
+      fspecs.locale ? detail::decimal_point<Char>(loc) : static_cast<Char>('.');
 
   int output_exp = fp.exponent + significand_size - 1;
   auto use_exp_format = [=]() {
@@ -1820,7 +1830,8 @@ OutputIt write_float(OutputIt out, const DecimalFP& fp,
   if (use_exp_format()) {
     int num_zeros = 0;
     if (fspecs.showpoint) {
-      num_zeros = (std::max)(fspecs.precision - significand_size, 0);
+      num_zeros = fspecs.precision - significand_size;
+      if (num_zeros < 0) num_zeros = 0;
       size += to_unsigned(num_zeros);
     } else if (significand_size == 1) {
       decimal_point = Char();
@@ -1832,11 +1843,11 @@ OutputIt write_float(OutputIt out, const DecimalFP& fp,
     size += to_unsigned((decimal_point ? 1 : 0) + 2 + exp_digits);
     char exp_char = fspecs.upper ? 'E' : 'e';
     auto write = [=](iterator it) {
-      if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+      if (sign) *it++ = detail::sign<Char>(sign);
       // Insert a decimal point after the first digit and add an exponent.
       it = write_significand(it, significand, significand_size, 1,
                              decimal_point);
-      if (num_zeros > 0) it = std::fill_n(it, num_zeros, zero);
+      if (num_zeros > 0) it = detail::fill_n(it, num_zeros, zero);
       *it++ = static_cast<Char>(exp_char);
       return write_exponent<Char>(output_exp, it);
     };
@@ -1855,25 +1866,29 @@ OutputIt write_float(OutputIt out, const DecimalFP& fp,
 #endif
     if (fspecs.showpoint) {
       if (num_zeros <= 0 && fspecs.format != float_format::fixed) num_zeros = 1;
-      if (num_zeros > 0) size += to_unsigned(num_zeros);
+      if (num_zeros > 0) size += to_unsigned(num_zeros) + 1;
     }
+    auto grouping = Grouping(loc, fspecs.locale);
+    size += to_unsigned(grouping.count_separators(significand_size));
     return write_padded<align::right>(out, specs, size, [&](iterator it) {
-      if (sign) *it++ = static_cast<Char>(data::signs[sign]);
-      it = write_significand<Char>(it, significand, significand_size);
-      it = std::fill_n(it, fp.exponent, zero);
+      if (sign) *it++ = detail::sign<Char>(sign);
+      it = write_significand<Char>(it, significand, significand_size,
+                                   fp.exponent, grouping);
       if (!fspecs.showpoint) return it;
       *it++ = decimal_point;
-      return num_zeros > 0 ? std::fill_n(it, num_zeros, zero) : it;
+      return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it;
     });
   } else if (exp > 0) {
     // 1234e-2 -> 12.34[0+]
     int num_zeros = fspecs.showpoint ? fspecs.precision - significand_size : 0;
     size += 1 + to_unsigned(num_zeros > 0 ? num_zeros : 0);
+    auto grouping = Grouping(loc, fspecs.locale);
+    size += to_unsigned(grouping.count_separators(significand_size));
     return write_padded<align::right>(out, specs, size, [&](iterator it) {
-      if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+      if (sign) *it++ = detail::sign<Char>(sign);
       it = write_significand(it, significand, significand_size, exp,
-                             decimal_point);
-      return num_zeros > 0 ? std::fill_n(it, num_zeros, zero) : it;
+                             decimal_point, grouping);
+      return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it;
     });
   }
   // 1234e-6 -> 0.001234
@@ -1882,37 +1897,109 @@ OutputIt write_float(OutputIt out, const DecimalFP& fp,
       fspecs.precision < num_zeros) {
     num_zeros = fspecs.precision;
   }
-  size += 2 + to_unsigned(num_zeros);
+  bool pointy = num_zeros != 0 || significand_size != 0 || fspecs.showpoint;
+  size += 1 + (pointy ? 1 : 0) + to_unsigned(num_zeros);
   return write_padded<align::right>(out, specs, size, [&](iterator it) {
-    if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+    if (sign) *it++ = detail::sign<Char>(sign);
     *it++ = zero;
-    if (num_zeros == 0 && significand_size == 0 && !fspecs.showpoint) return it;
+    if (!pointy) return it;
     *it++ = decimal_point;
-    it = std::fill_n(it, num_zeros, zero);
+    it = detail::fill_n(it, num_zeros, zero);
     return write_significand<Char>(it, significand, significand_size);
   });
 }
 
+template <typename Char> class fallback_digit_grouping {
+ public:
+  constexpr fallback_digit_grouping(locale_ref, bool) {}
+
+  constexpr Char separator() const { return Char(); }
+
+  constexpr int count_separators(int) const { return 0; }
+
+  template <typename Out, typename C>
+  constexpr Out apply(Out out, basic_string_view<C>) const {
+    return out;
+  }
+};
+
+template <typename OutputIt, typename DecimalFP, typename Char>
+FMT_CONSTEXPR20 auto write_float(OutputIt out, const DecimalFP& fp,
+                                 const basic_format_specs<Char>& specs,
+                                 float_specs fspecs, locale_ref loc)
+    -> OutputIt {
+  if (is_constant_evaluated()) {
+    return do_write_float<OutputIt, DecimalFP, Char,
+                          fallback_digit_grouping<Char>>(out, fp, specs, fspecs,
+                                                         loc);
+  } else {
+    return do_write_float(out, fp, specs, fspecs, loc);
+  }
+}
+
+template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
+FMT_CONSTEXPR20 bool isinf(T value) {
+  if (is_constant_evaluated()) {
+#if defined(__cpp_if_constexpr)
+    if constexpr (std::numeric_limits<double>::is_iec559) {
+      auto bits = detail::bit_cast<uint64_t>(static_cast<double>(value));
+      constexpr auto significand_bits =
+          dragonbox::float_info<double>::significand_bits;
+      return (bits & exponent_mask<double>()) &&
+             !(bits & ((uint64_t(1) << significand_bits) - 1));
+    }
+#endif
+  }
+  return std::isinf(value);
+}
+
+template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
+FMT_CONSTEXPR20 bool isfinite(T value) {
+  if (is_constant_evaluated()) {
+#if defined(__cpp_if_constexpr)
+    if constexpr (std::numeric_limits<double>::is_iec559) {
+      auto bits = detail::bit_cast<uint64_t>(static_cast<double>(value));
+      return (bits & exponent_mask<double>()) != exponent_mask<double>();
+    }
+#endif
+  }
+  return std::isfinite(value);
+}
+
+template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
+FMT_INLINE FMT_CONSTEXPR bool signbit(T value) {
+  if (is_constant_evaluated()) {
+#ifdef __cpp_if_constexpr
+    if constexpr (std::numeric_limits<double>::is_iec559) {
+      auto bits = detail::bit_cast<uint64_t>(static_cast<double>(value));
+      return (bits & (uint64_t(1) << (num_bits<uint64_t>() - 1))) != 0;
+    }
+#endif
+  }
+  return std::signbit(value);
+}
+
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(std::is_floating_point<T>::value)>
-OutputIt write(OutputIt out, T value, basic_format_specs<Char> specs,
-               locale_ref loc = {}) {
+FMT_CONSTEXPR20 auto write(OutputIt out, T value,
+                           basic_format_specs<Char> specs, locale_ref loc = {})
+    -> OutputIt {
   if (const_check(!is_supported_floating_point(value))) return out;
   float_specs fspecs = parse_float_type_spec(specs);
   fspecs.sign = specs.sign;
-  if (std::signbit(value)) {  // value < 0 is false for NaN so use signbit.
+  if (detail::signbit(value)) {  // value < 0 is false for NaN so use signbit.
     fspecs.sign = sign::minus;
     value = -value;
   } else if (fspecs.sign == sign::minus) {
     fspecs.sign = sign::none;
   }
 
-  if (!std::isfinite(value))
-    return write_nonfinite(out, std::isinf(value), specs, fspecs);
+  if (!detail::isfinite(value))
+    return write_nonfinite(out, detail::isinf(value), specs, fspecs);
 
   if (specs.align == align::numeric && fspecs.sign) {
     auto it = reserve(out, 1);
-    *it++ = static_cast<Char>(data::signs[fspecs.sign]);
+    *it++ = detail::sign<Char>(fspecs.sign);
     out = base_iterator(out, it);
     fspecs.sign = sign::none;
     if (specs.width != 0) --specs.width;
@@ -1920,30 +2007,35 @@ OutputIt write(OutputIt out, T value, basic_format_specs<Char> specs,
 
   memory_buffer buffer;
   if (fspecs.format == float_format::hex) {
-    if (fspecs.sign) buffer.push_back(data::signs[fspecs.sign]);
+    if (fspecs.sign) buffer.push_back(detail::sign<char>(fspecs.sign));
     snprintf_float(promote_float(value), specs.precision, fspecs, buffer);
-    return write_bytes(out, {buffer.data(), buffer.size()}, specs);
+    return write_bytes<align::right>(out, {buffer.data(), buffer.size()},
+                                     specs);
   }
-  int precision = specs.precision >= 0 || !specs.type ? specs.precision : 6;
+  int precision = specs.precision >= 0 || specs.type == presentation_type::none
+                      ? specs.precision
+                      : 6;
   if (fspecs.format == float_format::exp) {
     if (precision == max_value<int>())
-      FMT_THROW(format_error("number is too big"));
+      throw_format_error("number is too big");
     else
       ++precision;
   }
   if (const_check(std::is_same<T, float>())) fspecs.binary32 = true;
-  fspecs.use_grisu = is_fast_float<T>();
+  if (!is_fast_float<T>()) fspecs.fallback = true;
   int exp = format_float(promote_float(value), precision, fspecs, buffer);
   fspecs.precision = precision;
-  Char point =
-      fspecs.locale ? decimal_point<Char>(loc) : static_cast<Char>('.');
   auto fp = big_decimal_fp{buffer.data(), static_cast<int>(buffer.size()), exp};
-  return write_float(out, fp, specs, fspecs, point);
+  return write_float(out, fp, specs, fspecs, loc);
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_fast_float<T>::value)>
-OutputIt write(OutputIt out, T value) {
+FMT_CONSTEXPR20 auto write(OutputIt out, T value) -> OutputIt {
+  if (is_constant_evaluated()) {
+    return write(out, value, basic_format_specs<Char>());
+  }
+
   if (const_check(!is_supported_floating_point(value))) return out;
 
   using floaty = conditional_t<std::is_same<T, long double>::value, double, T>;
@@ -1951,90 +2043,53 @@ OutputIt write(OutputIt out, T value) {
   auto bits = bit_cast<uint>(value);
 
   auto fspecs = float_specs();
-  auto sign_bit = bits & (uint(1) << (num_bits<uint>() - 1));
-  if (sign_bit != 0) {
+  if (detail::signbit(value)) {
     fspecs.sign = sign::minus;
     value = -value;
   }
 
-  static const auto specs = basic_format_specs<Char>();
+  constexpr auto specs = basic_format_specs<Char>();
   uint mask = exponent_mask<floaty>();
   if ((bits & mask) == mask)
     return write_nonfinite(out, std::isinf(value), specs, fspecs);
 
   auto dec = dragonbox::to_decimal(static_cast<floaty>(value));
-  return write_float(out, dec, specs, fspecs, static_cast<Char>('.'));
+  return write_float(out, dec, specs, fspecs, {});
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(std::is_floating_point<T>::value &&
                         !is_fast_float<T>::value)>
-inline OutputIt write(OutputIt out, T value) {
+inline auto write(OutputIt out, T value) -> OutputIt {
   return write(out, value, basic_format_specs<Char>());
 }
 
 template <typename Char, typename OutputIt>
-OutputIt write_char(OutputIt out, Char value,
-                    const basic_format_specs<Char>& specs) {
-  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
-  return write_padded(out, specs, 1, [=](iterator it) {
-    *it++ = value;
-    return it;
-  });
-}
-
-template <typename Char, typename OutputIt, typename UIntPtr>
-OutputIt write_ptr(OutputIt out, UIntPtr value,
-                   const basic_format_specs<Char>* specs) {
-  int num_digits = count_digits<4>(value);
-  auto size = to_unsigned(num_digits) + size_t(2);
-  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
-  auto write = [=](iterator it) {
-    *it++ = static_cast<Char>('0');
-    *it++ = static_cast<Char>('x');
-    return format_uint<4, Char>(it, value, num_digits);
-  };
-  return specs ? write_padded<align::right>(out, *specs, size, write)
-               : base_iterator(out, write(reserve(out, size)));
-}
-
-template <typename T> struct is_integral : std::is_integral<T> {};
-template <> struct is_integral<int128_t> : std::true_type {};
-template <> struct is_integral<uint128_t> : std::true_type {};
-
-template <typename Char, typename OutputIt>
-OutputIt write(OutputIt out, monostate) {
+auto write(OutputIt out, monostate, basic_format_specs<Char> = {},
+           locale_ref = {}) -> OutputIt {
   FMT_ASSERT(false, "");
   return out;
 }
 
-template <typename Char, typename OutputIt,
-          FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
-OutputIt write(OutputIt out, string_view value) {
-  auto it = reserve(out, value.size());
-  it = copy_str<Char>(value.begin(), value.end(), it);
-  return base_iterator(out, it);
-}
-
 template <typename Char, typename OutputIt>
-OutputIt write(OutputIt out, basic_string_view<Char> value) {
+FMT_CONSTEXPR auto write(OutputIt out, basic_string_view<Char> value)
+    -> OutputIt {
   auto it = reserve(out, value.size());
-  it = std::copy(value.begin(), value.end(), it);
+  it = copy_str_noinline<Char>(value.begin(), value.end(), it);
   return base_iterator(out, it);
 }
 
-template <typename Char>
-buffer_appender<Char> write(buffer_appender<Char> out,
-                            basic_string_view<Char> value) {
-  get_container(out).append(value.begin(), value.end());
-  return out;
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(is_string<T>::value)>
+constexpr auto write(OutputIt out, const T& value) -> OutputIt {
+  return write<Char>(out, to_string_view(value));
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_integral<T>::value &&
                         !std::is_same<T, bool>::value &&
                         !std::is_same<T, Char>::value)>
-OutputIt write(OutputIt out, T value) {
+FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
   auto abs_value = static_cast<uint32_or_64_or_128_t<T>>(value);
   bool negative = is_negative(value);
   // Don't do -abs_value since it trips unsigned-integer-overflow sanitizer.
@@ -2052,336 +2107,130 @@ OutputIt write(OutputIt out, T value) {
   return base_iterator(out, it);
 }
 
-template <typename Char, typename OutputIt>
-OutputIt write(OutputIt out, bool value) {
-  return write<Char>(out, string_view(value ? "true" : "false"));
+// FMT_ENABLE_IF() condition separated to workaround an MSVC bug.
+template <
+    typename Char, typename OutputIt, typename T,
+    bool check =
+        std::is_enum<T>::value && !std::is_same<T, Char>::value &&
+        mapped_type_constant<T, basic_format_context<OutputIt, Char>>::value !=
+            type::custom_type,
+    FMT_ENABLE_IF(check)>
+FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
+  return write<Char>(
+      out, static_cast<typename std::underlying_type<T>::type>(value));
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(std::is_same<T, bool>::value)>
+FMT_CONSTEXPR auto write(OutputIt out, T value,
+                         const basic_format_specs<Char>& specs = {},
+                         locale_ref = {}) -> OutputIt {
+  return specs.type != presentation_type::none &&
+                 specs.type != presentation_type::string
+             ? write(out, value ? 1 : 0, specs, {})
+             : write_bytes(out, value ? "true" : "false", specs);
 }
 
 template <typename Char, typename OutputIt>
-OutputIt write(OutputIt out, Char value) {
+FMT_CONSTEXPR auto write(OutputIt out, Char value) -> OutputIt {
   auto it = reserve(out, 1);
   *it++ = value;
   return base_iterator(out, it);
 }
 
 template <typename Char, typename OutputIt>
-OutputIt write(OutputIt out, const Char* value) {
+FMT_CONSTEXPR_CHAR_TRAITS auto write(OutputIt out, const Char* value)
+    -> OutputIt {
   if (!value) {
-    FMT_THROW(format_error("string pointer is null"));
+    throw_format_error("string pointer is null");
   } else {
-    auto length = std::char_traits<Char>::length(value);
-    out = write(out, basic_string_view<Char>(value, length));
+    out = write(out, basic_string_view<Char>(value));
   }
   return out;
 }
 
-template <typename Char, typename OutputIt>
-OutputIt write(OutputIt out, const void* value) {
-  return write_ptr<Char>(out, to_uintptr(value), nullptr);
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(std::is_same<T, void>::value)>
+auto write(OutputIt out, const T* value,
+           const basic_format_specs<Char>& specs = {}, locale_ref = {})
+    -> OutputIt {
+  check_pointer_type_spec(specs.type, error_handler());
+  return write_ptr<Char>(out, to_uintptr(value), &specs);
 }
 
-template <typename Char, typename OutputIt, typename T>
-auto write(OutputIt out, const T& value) -> typename std::enable_if<
-    mapped_type_constant<T, basic_format_context<OutputIt, Char>>::value ==
-        type::custom_type,
-    OutputIt>::type {
-  using context_type = basic_format_context<OutputIt, Char>;
+// A write overload that handles implicit conversions.
+template <typename Char, typename OutputIt, typename T,
+          typename Context = basic_format_context<OutputIt, Char>>
+FMT_CONSTEXPR auto write(OutputIt out, const T& value) -> enable_if_t<
+    std::is_class<T>::value && !is_string<T>::value &&
+        !std::is_same<T, Char>::value &&
+        !std::is_same<const T&,
+                      decltype(arg_mapper<Context>().map(value))>::value,
+    OutputIt> {
+  return write<Char>(out, arg_mapper<Context>().map(value));
+}
+
+template <typename Char, typename OutputIt, typename T,
+          typename Context = basic_format_context<OutputIt, Char>>
+FMT_CONSTEXPR auto write(OutputIt out, const T& value)
+    -> enable_if_t<mapped_type_constant<T, Context>::value == type::custom_type,
+                   OutputIt> {
   using formatter_type =
-      conditional_t<has_formatter<T, context_type>::value,
-                    typename context_type::template formatter_type<T>,
+      conditional_t<has_formatter<T, Context>::value,
+                    typename Context::template formatter_type<T>,
                     fallback_formatter<T, Char>>;
-  context_type ctx(out, {}, {});
+  auto ctx = Context(out, {}, {});
   return formatter_type().format(value, ctx);
 }
 
 // An argument visitor that formats the argument and writes it via the output
 // iterator. It's a class and not a generic lambda for compatibility with C++11.
-template <typename OutputIt, typename Char> struct default_arg_formatter {
-  using context = basic_format_context<OutputIt, Char>;
+template <typename Char> struct default_arg_formatter {
+  using iterator = buffer_appender<Char>;
+  using context = buffer_context<Char>;
 
-  OutputIt out;
+  iterator out;
   basic_format_args<context> args;
   locale_ref loc;
 
-  template <typename T> OutputIt operator()(T value) {
+  template <typename T> auto operator()(T value) -> iterator {
     return write<Char>(out, value);
   }
-
-  OutputIt operator()(typename basic_format_arg<context>::handle handle) {
+  auto operator()(typename basic_format_arg<context>::handle h) -> iterator {
     basic_format_parse_context<Char> parse_ctx({});
-    basic_format_context<OutputIt, Char> format_ctx(out, args, loc);
-    handle.format(parse_ctx, format_ctx);
+    context format_ctx(out, args, loc);
+    h.format(parse_ctx, format_ctx);
     return format_ctx.out();
   }
 };
 
-template <typename OutputIt, typename Char,
-          typename ErrorHandler = error_handler>
-class arg_formatter_base {
- public:
-  using iterator = OutputIt;
-  using char_type = Char;
-  using format_specs = basic_format_specs<Char>;
+template <typename Char> struct arg_formatter {
+  using iterator = buffer_appender<Char>;
+  using context = buffer_context<Char>;
 
- private:
-  iterator out_;
-  locale_ref locale_;
-  format_specs* specs_;
+  iterator out;
+  const basic_format_specs<Char>& specs;
+  locale_ref locale;
 
-  // Attempts to reserve space for n extra characters in the output range.
-  // Returns a pointer to the reserved range or a reference to out_.
-  auto reserve(size_t n) -> decltype(detail::reserve(out_, n)) {
-    return detail::reserve(out_, n);
+  template <typename T>
+  FMT_CONSTEXPR FMT_INLINE auto operator()(T value) -> iterator {
+    return detail::write(out, value, specs, locale);
   }
-
-  using reserve_iterator = remove_reference_t<decltype(
-      detail::reserve(std::declval<iterator&>(), 0))>;
-
-  template <typename T> void write_int(T value, const format_specs& spec) {
-    using uint_type = uint32_or_64_or_128_t<T>;
-    int_writer<iterator, Char, uint_type> w(out_, locale_, value, spec);
-    handle_int_type_spec(spec.type, w);
-    out_ = w.out;
-  }
-
-  void write(char value) {
-    auto&& it = reserve(1);
-    *it++ = value;
-  }
-
-  template <typename Ch, FMT_ENABLE_IF(std::is_same<Ch, Char>::value)>
-  void write(Ch value) {
-    out_ = detail::write<Char>(out_, value);
-  }
-
-  void write(string_view value) {
-    auto&& it = reserve(value.size());
-    it = copy_str<Char>(value.begin(), value.end(), it);
-  }
-  void write(wstring_view value) {
-    static_assert(std::is_same<Char, wchar_t>::value, "");
-    auto&& it = reserve(value.size());
-    it = std::copy(value.begin(), value.end(), it);
-  }
-
-  template <typename Ch>
-  void write(const Ch* s, size_t size, const format_specs& specs) {
-    auto width = specs.width != 0
-                     ? count_code_points(basic_string_view<Ch>(s, size))
-                     : 0;
-    out_ = write_padded(out_, specs, size, width, [=](reserve_iterator it) {
-      return copy_str<Char>(s, s + size, it);
-    });
-  }
-
-  template <typename Ch>
-  void write(basic_string_view<Ch> s, const format_specs& specs = {}) {
-    out_ = detail::write(out_, s, specs);
-  }
-
-  void write_pointer(const void* p) {
-    out_ = write_ptr<char_type>(out_, to_uintptr(p), specs_);
-  }
-
-  struct char_spec_handler : ErrorHandler {
-    arg_formatter_base& formatter;
-    Char value;
-
-    char_spec_handler(arg_formatter_base& f, Char val)
-        : formatter(f), value(val) {}
-
-    void on_int() {
-      // char is only formatted as int if there are specs.
-      formatter.write_int(static_cast<int>(value), *formatter.specs_);
-    }
-    void on_char() {
-      if (formatter.specs_)
-        formatter.out_ = write_char(formatter.out_, value, *formatter.specs_);
-      else
-        formatter.write(value);
-    }
-  };
-
-  struct cstring_spec_handler : error_handler {
-    arg_formatter_base& formatter;
-    const Char* value;
-
-    cstring_spec_handler(arg_formatter_base& f, const Char* val)
-        : formatter(f), value(val) {}
-
-    void on_string() { formatter.write(value); }
-    void on_pointer() { formatter.write_pointer(value); }
-  };
-
- protected:
-  iterator out() { return out_; }
-  format_specs* specs() { return specs_; }
-
-  void write(bool value) {
-    if (specs_)
-      write(string_view(value ? "true" : "false"), *specs_);
-    else
-      out_ = detail::write<Char>(out_, value);
-  }
-
-  void write(const Char* value) {
-    if (!value) {
-      FMT_THROW(format_error("string pointer is null"));
-    } else {
-      auto length = std::char_traits<char_type>::length(value);
-      basic_string_view<char_type> sv(value, length);
-      specs_ ? write(sv, *specs_) : write(sv);
-    }
-  }
-
- public:
-  arg_formatter_base(OutputIt out, format_specs* s, locale_ref loc)
-      : out_(out), locale_(loc), specs_(s) {}
-
-  iterator operator()(monostate) {
-    FMT_ASSERT(false, "invalid argument type");
-    return out_;
-  }
-
-  template <typename T, FMT_ENABLE_IF(is_integral<T>::value)>
-  FMT_INLINE iterator operator()(T value) {
-    if (specs_)
-      write_int(value, *specs_);
-    else
-      out_ = detail::write<Char>(out_, value);
-    return out_;
-  }
-
-  iterator operator()(Char value) {
-    handle_char_specs(specs_,
-                      char_spec_handler(*this, static_cast<Char>(value)));
-    return out_;
-  }
-
-  iterator operator()(bool value) {
-    if (specs_ && specs_->type) return (*this)(value ? 1 : 0);
-    write(value != 0);
-    return out_;
-  }
-
-  template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
-  iterator operator()(T value) {
-    auto specs = specs_ ? *specs_ : format_specs();
-    if (const_check(is_supported_floating_point(value)))
-      out_ = detail::write(out_, value, specs, locale_);
-    else
-      FMT_ASSERT(false, "unsupported float argument type");
-    return out_;
-  }
-
-  iterator operator()(const Char* value) {
-    if (!specs_) return write(value), out_;
-    handle_cstring_type_spec(specs_->type, cstring_spec_handler(*this, value));
-    return out_;
-  }
-
-  iterator operator()(basic_string_view<Char> value) {
-    if (specs_) {
-      check_string_type_spec(specs_->type, error_handler());
-      write(value, *specs_);
-    } else {
-      write(value);
-    }
-    return out_;
-  }
-
-  iterator operator()(const void* value) {
-    if (specs_) check_pointer_type_spec(specs_->type, error_handler());
-    write_pointer(value);
-    return out_;
+  auto operator()(typename basic_format_arg<context>::handle) -> iterator {
+    // User-defined types are handled separately because they require access
+    // to the parse context.
+    return out;
   }
 };
 
-/** The default argument formatter. */
-template <typename OutputIt, typename Char>
-class arg_formatter : public arg_formatter_base<OutputIt, Char> {
- private:
-  using char_type = Char;
-  using base = arg_formatter_base<OutputIt, Char>;
-  using context_type = basic_format_context<OutputIt, Char>;
+template <typename Char> struct custom_formatter {
+  basic_format_parse_context<Char>& parse_ctx;
+  buffer_context<Char>& ctx;
 
-  context_type& ctx_;
-  basic_format_parse_context<char_type>* parse_ctx_;
-  const Char* ptr_;
-
- public:
-  using iterator = typename base::iterator;
-  using format_specs = typename base::format_specs;
-
-  /**
-    \rst
-    Constructs an argument formatter object.
-    *ctx* is a reference to the formatting context,
-    *specs* contains format specifier information for standard argument types.
-    \endrst
-   */
-  explicit arg_formatter(
-      context_type& ctx,
-      basic_format_parse_context<char_type>* parse_ctx = nullptr,
-      format_specs* specs = nullptr, const Char* ptr = nullptr)
-      : base(ctx.out(), specs, ctx.locale()),
-        ctx_(ctx),
-        parse_ctx_(parse_ctx),
-        ptr_(ptr) {}
-
-  using base::operator();
-
-  /** Formats an argument of a user-defined type. */
-  iterator operator()(typename basic_format_arg<context_type>::handle handle) {
-    if (ptr_) advance_to(*parse_ctx_, ptr_);
-    handle.format(*parse_ctx_, ctx_);
-    return ctx_.out();
+  void operator()(
+      typename basic_format_arg<buffer_context<Char>>::handle h) const {
+    h.format(parse_ctx, ctx);
   }
-};
-
-template <typename Char> FMT_CONSTEXPR bool is_name_start(Char c) {
-  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || '_' == c;
-}
-
-// Parses the range [begin, end) as an unsigned integer. This function assumes
-// that the range is non-empty and the first character is a digit.
-template <typename Char, typename ErrorHandler>
-FMT_CONSTEXPR int parse_nonnegative_int(const Char*& begin, const Char* end,
-                                        ErrorHandler&& eh) {
-  FMT_ASSERT(begin != end && '0' <= *begin && *begin <= '9', "");
-  unsigned value = 0;
-  // Convert to unsigned to prevent a warning.
-  constexpr unsigned max_int = max_value<int>();
-  unsigned big = max_int / 10;
-  do {
-    // Check for overflow.
-    if (value > big) {
-      value = max_int + 1;
-      break;
-    }
-    value = value * 10 + unsigned(*begin - '0');
-    ++begin;
-  } while (begin != end && '0' <= *begin && *begin <= '9');
-  if (value > max_int) eh.on_error("number is too big");
-  return static_cast<int>(value);
-}
-
-template <typename Context> class custom_formatter {
- private:
-  using char_type = typename Context::char_type;
-
-  basic_format_parse_context<char_type>& parse_ctx_;
-  Context& ctx_;
-
- public:
-  explicit custom_formatter(basic_format_parse_context<char_type>& parse_ctx,
-                            Context& ctx)
-      : parse_ctx_(parse_ctx), ctx_(ctx) {}
-
-  void operator()(typename basic_format_arg<Context>::handle h) const {
-    h.format(parse_ctx_, ctx_);
-  }
-
   template <typename T> void operator()(T) const {}
 };
 
@@ -2396,13 +2245,13 @@ template <typename ErrorHandler> class width_checker {
   explicit FMT_CONSTEXPR width_checker(ErrorHandler& eh) : handler_(eh) {}
 
   template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
-  FMT_CONSTEXPR unsigned long long operator()(T value) {
+  FMT_CONSTEXPR auto operator()(T value) -> unsigned long long {
     if (is_negative(value)) handler_.on_error("negative width");
     return static_cast<unsigned long long>(value);
   }
 
   template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
-  FMT_CONSTEXPR unsigned long long operator()(T) {
+  FMT_CONSTEXPR auto operator()(T) -> unsigned long long {
     handler_.on_error("width is not integer");
     return 0;
   }
@@ -2416,13 +2265,13 @@ template <typename ErrorHandler> class precision_checker {
   explicit FMT_CONSTEXPR precision_checker(ErrorHandler& eh) : handler_(eh) {}
 
   template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
-  FMT_CONSTEXPR unsigned long long operator()(T value) {
+  FMT_CONSTEXPR auto operator()(T value) -> unsigned long long {
     if (is_negative(value)) handler_.on_error("negative precision");
     return static_cast<unsigned long long>(value);
   }
 
   template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
-  FMT_CONSTEXPR unsigned long long operator()(T) {
+  FMT_CONSTEXPR auto operator()(T) -> unsigned long long {
     handler_.on_error("precision is not integer");
     return 0;
   }
@@ -2431,148 +2280,50 @@ template <typename ErrorHandler> class precision_checker {
   ErrorHandler& handler_;
 };
 
-// A format specifier handler that sets fields in basic_format_specs.
-template <typename Char> class specs_setter {
- public:
-  explicit FMT_CONSTEXPR specs_setter(basic_format_specs<Char>& specs)
-      : specs_(specs) {}
-
-  FMT_CONSTEXPR specs_setter(const specs_setter& other)
-      : specs_(other.specs_) {}
-
-  FMT_CONSTEXPR void on_align(align_t align) { specs_.align = align; }
-  FMT_CONSTEXPR void on_fill(basic_string_view<Char> fill) {
-    specs_.fill = fill;
-  }
-  FMT_CONSTEXPR void on_plus() { specs_.sign = sign::plus; }
-  FMT_CONSTEXPR void on_minus() { specs_.sign = sign::minus; }
-  FMT_CONSTEXPR void on_space() { specs_.sign = sign::space; }
-  FMT_CONSTEXPR void on_hash() { specs_.alt = true; }
-
-  FMT_CONSTEXPR void on_zero() {
-    specs_.align = align::numeric;
-    specs_.fill[0] = Char('0');
-  }
-
-  FMT_CONSTEXPR void on_width(int width) { specs_.width = width; }
-  FMT_CONSTEXPR void on_precision(int precision) {
-    specs_.precision = precision;
-  }
-  FMT_CONSTEXPR void end_precision() {}
-
-  FMT_CONSTEXPR void on_type(Char type) {
-    specs_.type = static_cast<char>(type);
-  }
-
- protected:
-  basic_format_specs<Char>& specs_;
-};
-
-template <typename ErrorHandler> class numeric_specs_checker {
- public:
-  FMT_CONSTEXPR numeric_specs_checker(ErrorHandler& eh, detail::type arg_type)
-      : error_handler_(eh), arg_type_(arg_type) {}
-
-  FMT_CONSTEXPR void require_numeric_argument() {
-    if (!is_arithmetic_type(arg_type_))
-      error_handler_.on_error("format specifier requires numeric argument");
-  }
-
-  FMT_CONSTEXPR void check_sign() {
-    require_numeric_argument();
-    if (is_integral_type(arg_type_) && arg_type_ != type::int_type &&
-        arg_type_ != type::long_long_type && arg_type_ != type::char_type) {
-      error_handler_.on_error("format specifier requires signed argument");
-    }
-  }
-
-  FMT_CONSTEXPR void check_precision() {
-    if (is_integral_type(arg_type_) || arg_type_ == type::pointer_type)
-      error_handler_.on_error("precision not allowed for this argument type");
-  }
-
- private:
-  ErrorHandler& error_handler_;
-  detail::type arg_type_;
-};
-
-// A format specifier handler that checks if specifiers are consistent with the
-// argument type.
-template <typename Handler> class specs_checker : public Handler {
- private:
-  numeric_specs_checker<Handler> checker_;
-
-  // Suppress an MSVC warning about using this in initializer list.
-  FMT_CONSTEXPR Handler& error_handler() { return *this; }
-
- public:
-  FMT_CONSTEXPR specs_checker(const Handler& handler, detail::type arg_type)
-      : Handler(handler), checker_(error_handler(), arg_type) {}
-
-  FMT_CONSTEXPR specs_checker(const specs_checker& other)
-      : Handler(other), checker_(error_handler(), other.arg_type_) {}
-
-  FMT_CONSTEXPR void on_align(align_t align) {
-    if (align == align::numeric) checker_.require_numeric_argument();
-    Handler::on_align(align);
-  }
-
-  FMT_CONSTEXPR void on_plus() {
-    checker_.check_sign();
-    Handler::on_plus();
-  }
-
-  FMT_CONSTEXPR void on_minus() {
-    checker_.check_sign();
-    Handler::on_minus();
-  }
-
-  FMT_CONSTEXPR void on_space() {
-    checker_.check_sign();
-    Handler::on_space();
-  }
-
-  FMT_CONSTEXPR void on_hash() {
-    checker_.require_numeric_argument();
-    Handler::on_hash();
-  }
-
-  FMT_CONSTEXPR void on_zero() {
-    checker_.require_numeric_argument();
-    Handler::on_zero();
-  }
-
-  FMT_CONSTEXPR void end_precision() { checker_.check_precision(); }
-};
-
 template <template <typename> class Handler, typename FormatArg,
           typename ErrorHandler>
-FMT_CONSTEXPR int get_dynamic_spec(FormatArg arg, ErrorHandler eh) {
+FMT_CONSTEXPR auto get_dynamic_spec(FormatArg arg, ErrorHandler eh) -> int {
   unsigned long long value = visit_format_arg(Handler<ErrorHandler>(eh), arg);
   if (value > to_unsigned(max_value<int>())) eh.on_error("number is too big");
   return static_cast<int>(value);
 }
 
-struct auto_id {};
-
 template <typename Context, typename ID>
-FMT_CONSTEXPR typename Context::format_arg get_arg(Context& ctx, ID id) {
+FMT_CONSTEXPR auto get_arg(Context& ctx, ID id) ->
+    typename Context::format_arg {
   auto arg = ctx.arg(id);
   if (!arg) ctx.on_error("argument not found");
   return arg;
 }
 
 // The standard format specifier handler with checking.
-template <typename ParseContext, typename Context>
-class specs_handler : public specs_setter<typename Context::char_type> {
- public:
-  using char_type = typename Context::char_type;
+template <typename Char> class specs_handler : public specs_setter<Char> {
+ private:
+  basic_format_parse_context<Char>& parse_context_;
+  buffer_context<Char>& context_;
 
-  FMT_CONSTEXPR specs_handler(basic_format_specs<char_type>& specs,
-                              ParseContext& parse_ctx, Context& ctx)
-      : specs_setter<char_type>(specs),
-        parse_context_(parse_ctx),
-        context_(ctx) {}
+  // This is only needed for compatibility with gcc 4.4.
+  using format_arg = basic_format_arg<buffer_context<Char>>;
+
+  FMT_CONSTEXPR auto get_arg(auto_id) -> format_arg {
+    return detail::get_arg(context_, parse_context_.next_arg_id());
+  }
+
+  FMT_CONSTEXPR auto get_arg(int arg_id) -> format_arg {
+    parse_context_.check_arg_id(arg_id);
+    return detail::get_arg(context_, arg_id);
+  }
+
+  FMT_CONSTEXPR auto get_arg(basic_string_view<Char> arg_id) -> format_arg {
+    parse_context_.check_arg_id(arg_id);
+    return detail::get_arg(context_, arg_id);
+  }
+
+ public:
+  FMT_CONSTEXPR specs_handler(basic_format_specs<Char>& specs,
+                              basic_format_parse_context<Char>& parse_ctx,
+                              buffer_context<Char>& ctx)
+      : specs_setter<Char>(specs), parse_context_(parse_ctx), context_(ctx) {}
 
   template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
     this->specs_.width = get_dynamic_spec<width_checker>(
@@ -2585,653 +2336,12 @@ class specs_handler : public specs_setter<typename Context::char_type> {
   }
 
   void on_error(const char* message) { context_.on_error(message); }
-
- private:
-  // This is only needed for compatibility with gcc 4.4.
-  using format_arg = typename Context::format_arg;
-
-  FMT_CONSTEXPR format_arg get_arg(auto_id) {
-    return detail::get_arg(context_, parse_context_.next_arg_id());
-  }
-
-  FMT_CONSTEXPR format_arg get_arg(int arg_id) {
-    parse_context_.check_arg_id(arg_id);
-    return detail::get_arg(context_, arg_id);
-  }
-
-  FMT_CONSTEXPR format_arg get_arg(basic_string_view<char_type> arg_id) {
-    parse_context_.check_arg_id(arg_id);
-    return detail::get_arg(context_, arg_id);
-  }
-
-  ParseContext& parse_context_;
-  Context& context_;
 };
 
-enum class arg_id_kind { none, index, name };
-
-// An argument reference.
-template <typename Char> struct arg_ref {
-  FMT_CONSTEXPR arg_ref() : kind(arg_id_kind::none), val() {}
-
-  FMT_CONSTEXPR explicit arg_ref(int index)
-      : kind(arg_id_kind::index), val(index) {}
-  FMT_CONSTEXPR explicit arg_ref(basic_string_view<Char> name)
-      : kind(arg_id_kind::name), val(name) {}
-
-  FMT_CONSTEXPR arg_ref& operator=(int idx) {
-    kind = arg_id_kind::index;
-    val.index = idx;
-    return *this;
-  }
-
-  arg_id_kind kind;
-  union value {
-    FMT_CONSTEXPR value(int id = 0) : index{id} {}
-    FMT_CONSTEXPR value(basic_string_view<Char> n) : name(n) {}
-
-    int index;
-    basic_string_view<Char> name;
-  } val;
-};
-
-// Format specifiers with width and precision resolved at formatting rather
-// than parsing time to allow re-using the same parsed specifiers with
-// different sets of arguments (precompilation of format strings).
-template <typename Char>
-struct dynamic_format_specs : basic_format_specs<Char> {
-  arg_ref<Char> width_ref;
-  arg_ref<Char> precision_ref;
-};
-
-// Format spec handler that saves references to arguments representing dynamic
-// width and precision to be resolved at formatting time.
-template <typename ParseContext>
-class dynamic_specs_handler
-    : public specs_setter<typename ParseContext::char_type> {
- public:
-  using char_type = typename ParseContext::char_type;
-
-  FMT_CONSTEXPR dynamic_specs_handler(dynamic_format_specs<char_type>& specs,
-                                      ParseContext& ctx)
-      : specs_setter<char_type>(specs), specs_(specs), context_(ctx) {}
-
-  FMT_CONSTEXPR dynamic_specs_handler(const dynamic_specs_handler& other)
-      : specs_setter<char_type>(other),
-        specs_(other.specs_),
-        context_(other.context_) {}
-
-  template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
-    specs_.width_ref = make_arg_ref(arg_id);
-  }
-
-  template <typename Id> FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) {
-    specs_.precision_ref = make_arg_ref(arg_id);
-  }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    context_.on_error(message);
-  }
-
- private:
-  using arg_ref_type = arg_ref<char_type>;
-
-  FMT_CONSTEXPR arg_ref_type make_arg_ref(int arg_id) {
-    context_.check_arg_id(arg_id);
-    return arg_ref_type(arg_id);
-  }
-
-  FMT_CONSTEXPR arg_ref_type make_arg_ref(auto_id) {
-    return arg_ref_type(context_.next_arg_id());
-  }
-
-  FMT_CONSTEXPR arg_ref_type make_arg_ref(basic_string_view<char_type> arg_id) {
-    context_.check_arg_id(arg_id);
-    basic_string_view<char_type> format_str(
-        context_.begin(), to_unsigned(context_.end() - context_.begin()));
-    return arg_ref_type(arg_id);
-  }
-
-  dynamic_format_specs<char_type>& specs_;
-  ParseContext& context_;
-};
-
-template <typename Char, typename IDHandler>
-FMT_CONSTEXPR const Char* parse_arg_id(const Char* begin, const Char* end,
-                                       IDHandler&& handler) {
-  FMT_ASSERT(begin != end, "");
-  Char c = *begin;
-  if (c == '}' || c == ':') {
-    handler();
-    return begin;
-  }
-  if (c >= '0' && c <= '9') {
-    int index = 0;
-    if (c != '0')
-      index = parse_nonnegative_int(begin, end, handler);
-    else
-      ++begin;
-    if (begin == end || (*begin != '}' && *begin != ':'))
-      handler.on_error("invalid format string");
-    else
-      handler(index);
-    return begin;
-  }
-  if (!is_name_start(c)) {
-    handler.on_error("invalid format string");
-    return begin;
-  }
-  auto it = begin;
-  do {
-    ++it;
-  } while (it != end && (is_name_start(c = *it) || ('0' <= c && c <= '9')));
-  handler(basic_string_view<Char>(begin, to_unsigned(it - begin)));
-  return it;
-}
-
-// Adapts SpecHandler to IDHandler API for dynamic width.
-template <typename SpecHandler, typename Char> struct width_adapter {
-  explicit FMT_CONSTEXPR width_adapter(SpecHandler& h) : handler(h) {}
-
-  FMT_CONSTEXPR void operator()() { handler.on_dynamic_width(auto_id()); }
-  FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_width(id); }
-  FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
-    handler.on_dynamic_width(id);
-  }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    handler.on_error(message);
-  }
-
-  SpecHandler& handler;
-};
-
-// Adapts SpecHandler to IDHandler API for dynamic precision.
-template <typename SpecHandler, typename Char> struct precision_adapter {
-  explicit FMT_CONSTEXPR precision_adapter(SpecHandler& h) : handler(h) {}
-
-  FMT_CONSTEXPR void operator()() { handler.on_dynamic_precision(auto_id()); }
-  FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_precision(id); }
-  FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
-    handler.on_dynamic_precision(id);
-  }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    handler.on_error(message);
-  }
-
-  SpecHandler& handler;
-};
-
-template <typename Char>
-FMT_CONSTEXPR int code_point_length(const Char* begin) {
-  if (const_check(sizeof(Char) != 1)) return 1;
-  constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                              0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
-  int len = lengths[static_cast<unsigned char>(*begin) >> 3];
-
-  // Compute the pointer to the next character early so that the next
-  // iteration can start working on the next character. Neither Clang
-  // nor GCC figure out this reordering on their own.
-  return len + !len;
-}
-
-template <typename Char> constexpr bool is_ascii_letter(Char c) {
-  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
-}
-
-// Converts a character to ASCII. Returns a number > 127 on conversion failure.
-template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value)>
-constexpr Char to_ascii(Char value) {
-  return value;
-}
-template <typename Char, FMT_ENABLE_IF(std::is_enum<Char>::value)>
-constexpr typename std::underlying_type<Char>::type to_ascii(Char value) {
-  return value;
-}
-
-// Parses fill and alignment.
-template <typename Char, typename Handler>
-FMT_CONSTEXPR const Char* parse_align(const Char* begin, const Char* end,
-                                      Handler&& handler) {
-  FMT_ASSERT(begin != end, "");
-  auto align = align::none;
-  auto p = begin + code_point_length(begin);
-  if (p >= end) p = begin;
-  for (;;) {
-    switch (to_ascii(*p)) {
-    case '<':
-      align = align::left;
-      break;
-    case '>':
-      align = align::right;
-      break;
-#if FMT_DEPRECATED_NUMERIC_ALIGN
-    case '=':
-      align = align::numeric;
-      break;
-#endif
-    case '^':
-      align = align::center;
-      break;
-    }
-    if (align != align::none) {
-      if (p != begin) {
-        auto c = *begin;
-        if (c == '{')
-          return handler.on_error("invalid fill character '{'"), begin;
-        handler.on_fill(basic_string_view<Char>(begin, to_unsigned(p - begin)));
-        begin = p + 1;
-      } else
-        ++begin;
-      handler.on_align(align);
-      break;
-    } else if (p == begin) {
-      break;
-    }
-    p = begin;
-  }
-  return begin;
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR const Char* parse_width(const Char* begin, const Char* end,
-                                      Handler&& handler) {
-  FMT_ASSERT(begin != end, "");
-  if ('0' <= *begin && *begin <= '9') {
-    handler.on_width(parse_nonnegative_int(begin, end, handler));
-  } else if (*begin == '{') {
-    ++begin;
-    if (begin != end)
-      begin = parse_arg_id(begin, end, width_adapter<Handler, Char>(handler));
-    if (begin == end || *begin != '}')
-      return handler.on_error("invalid format string"), begin;
-    ++begin;
-  }
-  return begin;
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR const Char* parse_precision(const Char* begin, const Char* end,
-                                          Handler&& handler) {
-  ++begin;
-  auto c = begin != end ? *begin : Char();
-  if ('0' <= c && c <= '9') {
-    handler.on_precision(parse_nonnegative_int(begin, end, handler));
-  } else if (c == '{') {
-    ++begin;
-    if (begin != end) {
-      begin =
-          parse_arg_id(begin, end, precision_adapter<Handler, Char>(handler));
-    }
-    if (begin == end || *begin++ != '}')
-      return handler.on_error("invalid format string"), begin;
-  } else {
-    return handler.on_error("missing precision specifier"), begin;
-  }
-  handler.end_precision();
-  return begin;
-}
-
-// Parses standard format specifiers and sends notifications about parsed
-// components to handler.
-template <typename Char, typename SpecHandler>
-FMT_CONSTEXPR const Char* parse_format_specs(const Char* begin, const Char* end,
-                                             SpecHandler&& handler) {
-  if (begin == end) return begin;
-
-  begin = parse_align(begin, end, handler);
-  if (begin == end) return begin;
-
-  // Parse sign.
-  switch (to_ascii(*begin)) {
-  case '+':
-    handler.on_plus();
-    ++begin;
-    break;
-  case '-':
-    handler.on_minus();
-    ++begin;
-    break;
-  case ' ':
-    handler.on_space();
-    ++begin;
-    break;
-  }
-  if (begin == end) return begin;
-
-  if (*begin == '#') {
-    handler.on_hash();
-    if (++begin == end) return begin;
-  }
-
-  // Parse zero flag.
-  if (*begin == '0') {
-    handler.on_zero();
-    if (++begin == end) return begin;
-  }
-
-  begin = parse_width(begin, end, handler);
-  if (begin == end) return begin;
-
-  // Parse precision.
-  if (*begin == '.') {
-    begin = parse_precision(begin, end, handler);
-  }
-
-  // Parse type.
-  if (begin != end && *begin != '}') handler.on_type(*begin++);
-  return begin;
-}
-
-// Return the result via the out param to workaround gcc bug 77539.
-template <bool IS_CONSTEXPR, typename T, typename Ptr = const T*>
-FMT_CONSTEXPR bool find(Ptr first, Ptr last, T value, Ptr& out) {
-  for (out = first; out != last; ++out) {
-    if (*out == value) return true;
-  }
-  return false;
-}
-
-template <>
-inline bool find<false, char>(const char* first, const char* last, char value,
-                              const char*& out) {
-  out = static_cast<const char*>(
-      std::memchr(first, value, detail::to_unsigned(last - first)));
-  return out != nullptr;
-}
-
-template <typename Handler, typename Char> struct id_adapter {
-  Handler& handler;
-  int arg_id;
-
-  FMT_CONSTEXPR void operator()() { arg_id = handler.on_arg_id(); }
-  FMT_CONSTEXPR void operator()(int id) { arg_id = handler.on_arg_id(id); }
-  FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
-    arg_id = handler.on_arg_id(id);
-  }
-  FMT_CONSTEXPR void on_error(const char* message) {
-    handler.on_error(message);
-  }
-};
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR const Char* parse_replacement_field(const Char* begin,
-                                                  const Char* end,
-                                                  Handler&& handler) {
-  ++begin;
-  if (begin == end) return handler.on_error("invalid format string"), end;
-  if (*begin == '}') {
-    handler.on_replacement_field(handler.on_arg_id(), begin);
-  } else if (*begin == '{') {
-    handler.on_text(begin, begin + 1);
-  } else {
-    auto adapter = id_adapter<Handler, Char>{handler, 0};
-    begin = parse_arg_id(begin, end, adapter);
-    Char c = begin != end ? *begin : Char();
-    if (c == '}') {
-      handler.on_replacement_field(adapter.arg_id, begin);
-    } else if (c == ':') {
-      begin = handler.on_format_specs(adapter.arg_id, begin + 1, end);
-      if (begin == end || *begin != '}')
-        return handler.on_error("unknown format specifier"), end;
-    } else {
-      return handler.on_error("missing '}' in format string"), end;
-    }
-  }
-  return begin + 1;
-}
-
-template <bool IS_CONSTEXPR, typename Char, typename Handler>
-FMT_CONSTEXPR_DECL FMT_INLINE void parse_format_string(
-    basic_string_view<Char> format_str, Handler&& handler) {
-  auto begin = format_str.data();
-  auto end = begin + format_str.size();
-  if (end - begin < 32) {
-    // Use a simple loop instead of memchr for small strings.
-    const Char* p = begin;
-    while (p != end) {
-      auto c = *p++;
-      if (c == '{') {
-        handler.on_text(begin, p - 1);
-        begin = p = parse_replacement_field(p - 1, end, handler);
-      } else if (c == '}') {
-        if (p == end || *p != '}')
-          return handler.on_error("unmatched '}' in format string");
-        handler.on_text(begin, p);
-        begin = ++p;
-      }
-    }
-    handler.on_text(begin, end);
-    return;
-  }
-  struct writer {
-    FMT_CONSTEXPR void operator()(const Char* pbegin, const Char* pend) {
-      if (pbegin == pend) return;
-      for (;;) {
-        const Char* p = nullptr;
-        if (!find<IS_CONSTEXPR>(pbegin, pend, '}', p))
-          return handler_.on_text(pbegin, pend);
-        ++p;
-        if (p == pend || *p != '}')
-          return handler_.on_error("unmatched '}' in format string");
-        handler_.on_text(pbegin, p);
-        pbegin = p + 1;
-      }
-    }
-    Handler& handler_;
-  } write{handler};
-  while (begin != end) {
-    // Doing two passes with memchr (one for '{' and another for '}') is up to
-    // 2.5x faster than the naive one-pass implementation on big format strings.
-    const Char* p = begin;
-    if (*begin != '{' && !find<IS_CONSTEXPR>(begin + 1, end, '{', p))
-      return write(begin, end);
-    write(begin, p);
-    begin = parse_replacement_field(p, end, handler);
-  }
-}
-
-template <typename T, typename ParseContext>
-FMT_CONSTEXPR const typename ParseContext::char_type* parse_format_specs(
-    ParseContext& ctx) {
-  using char_type = typename ParseContext::char_type;
-  using context = buffer_context<char_type>;
-  using mapped_type =
-      conditional_t<detail::mapped_type_constant<T, context>::value !=
-                        type::custom_type,
-                    decltype(arg_mapper<context>().map(std::declval<T>())), T>;
-  auto f = conditional_t<has_formatter<mapped_type, context>::value,
-                         formatter<mapped_type, char_type>,
-                         detail::fallback_formatter<T, char_type>>();
-  return f.parse(ctx);
-}
-
-template <typename OutputIt, typename Char, typename Context>
-struct format_handler : detail::error_handler {
-  basic_format_parse_context<Char> parse_context;
-  Context context;
-
-  format_handler(OutputIt out, basic_string_view<Char> str,
-                 basic_format_args<Context> format_args, detail::locale_ref loc)
-      : parse_context(str), context(out, format_args, loc) {}
-
-  void on_text(const Char* begin, const Char* end) {
-    auto size = to_unsigned(end - begin);
-    auto out = context.out();
-    auto&& it = reserve(out, size);
-    it = std::copy_n(begin, size, it);
-    context.advance_to(out);
-  }
-
-  int on_arg_id() { return parse_context.next_arg_id(); }
-  int on_arg_id(int id) { return parse_context.check_arg_id(id), id; }
-  int on_arg_id(basic_string_view<Char> id) {
-    int arg_id = context.arg_id(id);
-    if (arg_id < 0) on_error("argument not found");
-    return arg_id;
-  }
-
-  FMT_INLINE void on_replacement_field(int id, const Char*) {
-    auto arg = get_arg(context, id);
-    context.advance_to(visit_format_arg(
-        default_arg_formatter<OutputIt, Char>{context.out(), context.args(),
-                                              context.locale()},
-        arg));
-  }
-
-  const Char* on_format_specs(int id, const Char* begin, const Char* end) {
-    auto arg = get_arg(context, id);
-    if (arg.type() == type::custom_type) {
-      advance_to(parse_context, begin);
-      visit_format_arg(custom_formatter<Context>(parse_context, context), arg);
-      return parse_context.begin();
-    }
-    auto specs = basic_format_specs<Char>();
-    if (begin + 1 < end && begin[1] == '}' && is_ascii_letter(*begin)) {
-      specs.type = static_cast<char>(*begin++);
-    } else {
-      using parse_context_t = basic_format_parse_context<Char>;
-      specs_checker<specs_handler<parse_context_t, Context>> handler(
-          specs_handler<parse_context_t, Context>(specs, parse_context,
-                                                  context),
-          arg.type());
-      begin = parse_format_specs(begin, end, handler);
-      if (begin == end || *begin != '}')
-        on_error("missing '}' in format string");
-    }
-    context.advance_to(visit_format_arg(
-        arg_formatter<OutputIt, Char>(context, &parse_context, &specs), arg));
-    return begin;
-  }
-};
-
-// A parse context with extra argument id checks. It is only used at compile
-// time because adding checks at runtime would introduce substantial overhead
-// and would be redundant since argument ids are checked when arguments are
-// retrieved anyway.
-template <typename Char, typename ErrorHandler = error_handler>
-class compile_parse_context
-    : public basic_format_parse_context<Char, ErrorHandler> {
- private:
-  int num_args_;
-  using base = basic_format_parse_context<Char, ErrorHandler>;
-
- public:
-  explicit FMT_CONSTEXPR compile_parse_context(
-      basic_string_view<Char> format_str, int num_args = max_value<int>(),
-      ErrorHandler eh = {})
-      : base(format_str, eh), num_args_(num_args) {}
-
-  FMT_CONSTEXPR int next_arg_id() {
-    int id = base::next_arg_id();
-    if (id >= num_args_) this->on_error("argument not found");
-    return id;
-  }
-
-  FMT_CONSTEXPR void check_arg_id(int id) {
-    base::check_arg_id(id);
-    if (id >= num_args_) this->on_error("argument not found");
-  }
-  using base::check_arg_id;
-};
-
-template <typename Char, typename ErrorHandler, typename... Args>
-class format_string_checker {
- public:
-  explicit FMT_CONSTEXPR format_string_checker(
-      basic_string_view<Char> format_str, ErrorHandler eh)
-      : context_(format_str, num_args, eh),
-        parse_funcs_{&parse_format_specs<Args, parse_context_type>...} {}
-
-  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
-
-  FMT_CONSTEXPR int on_arg_id() { return context_.next_arg_id(); }
-  FMT_CONSTEXPR int on_arg_id(int id) { return context_.check_arg_id(id), id; }
-  FMT_CONSTEXPR int on_arg_id(basic_string_view<Char>) {
-    on_error("compile-time checks don't support named arguments");
-    return 0;
-  }
-
-  FMT_CONSTEXPR void on_replacement_field(int, const Char*) {}
-
-  FMT_CONSTEXPR const Char* on_format_specs(int id, const Char* begin,
-                                            const Char*) {
-    advance_to(context_, begin);
-    return id < num_args ? parse_funcs_[id](context_) : begin;
-  }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    context_.on_error(message);
-  }
-
- private:
-  using parse_context_type = compile_parse_context<Char, ErrorHandler>;
-  enum { num_args = sizeof...(Args) };
-
-  // Format specifier parsing function.
-  using parse_func = const Char* (*)(parse_context_type&);
-
-  parse_context_type context_;
-  parse_func parse_funcs_[num_args > 0 ? num_args : 1];
-};
-
-// Converts string literals to basic_string_view.
-template <typename Char, size_t N>
-FMT_CONSTEXPR basic_string_view<Char> compile_string_to_view(
-    const Char (&s)[N]) {
-  // Remove trailing null character if needed. Won't be present if this is used
-  // with raw character array (i.e. not defined as a string).
-  return {s,
-          N - ((std::char_traits<Char>::to_int_type(s[N - 1]) == 0) ? 1 : 0)};
-}
-
-// Converts string_view to basic_string_view.
-template <typename Char>
-FMT_CONSTEXPR basic_string_view<Char> compile_string_to_view(
-    const std_string_view<Char>& s) {
-  return {s.data(), s.size()};
-}
-
-#define FMT_STRING_IMPL(s, base)                                  \
-  [] {                                                            \
-    /* Use a macro-like name to avoid shadowing warnings. */      \
-    struct FMT_COMPILE_STRING : base {                            \
-      using char_type = fmt::remove_cvref_t<decltype(s[0])>;      \
-      FMT_MAYBE_UNUSED FMT_CONSTEXPR                              \
-      operator fmt::basic_string_view<char_type>() const {        \
-        return fmt::detail::compile_string_to_view<char_type>(s); \
-      }                                                           \
-    };                                                            \
-    return FMT_COMPILE_STRING();                                  \
-  }()
-
-/**
-  \rst
-  Constructs a compile-time format string from a string literal *s*.
-
-  **Example**::
-
-    // A compile-time error because 'd' is an invalid specifier for strings.
-    std::string s = fmt::format(FMT_STRING("{:d}"), "foo");
-  \endrst
- */
-#define FMT_STRING(s) FMT_STRING_IMPL(s, fmt::compile_string)
-
-template <typename... Args, typename S,
-          enable_if_t<(is_compile_string<S>::value), int>>
-void check_format_string(S format_str) {
-  FMT_CONSTEXPR_DECL auto s = to_string_view(format_str);
-  using checker = format_string_checker<typename S::char_type, error_handler,
-                                        remove_cvref_t<Args>...>;
-  FMT_CONSTEXPR_DECL bool invalid_format =
-      (parse_format_string<true>(s, checker(s, {})), true);
-  (void)invalid_format;
-}
-
 template <template <typename> class Handler, typename Context>
-void handle_dynamic_spec(int& value, arg_ref<typename Context::char_type> ref,
-                         Context& ctx) {
+FMT_CONSTEXPR void handle_dynamic_spec(int& value,
+                                       arg_ref<typename Context::char_type> ref,
+                                       Context& ctx) {
   switch (ref.kind) {
   case arg_id_kind::none:
     break;
@@ -3246,89 +2356,146 @@ void handle_dynamic_spec(int& value, arg_ref<typename Context::char_type> ref,
   }
 }
 
-using format_func = void (*)(detail::buffer<char>&, int, string_view);
+#define FMT_STRING_IMPL(s, base, explicit)                                 \
+  [] {                                                                     \
+    /* Use the hidden visibility as a workaround for a GCC bug (#1973). */ \
+    /* Use a macro-like name to avoid shadowing warnings. */               \
+    struct FMT_GCC_VISIBILITY_HIDDEN FMT_COMPILE_STRING : base {           \
+      using char_type = fmt::remove_cvref_t<decltype(s[0])>;               \
+      FMT_MAYBE_UNUSED FMT_CONSTEXPR explicit                              \
+      operator fmt::basic_string_view<char_type>() const {                 \
+        return fmt::detail_exported::compile_string_to_view<char_type>(s); \
+      }                                                                    \
+    };                                                                     \
+    return FMT_COMPILE_STRING();                                           \
+  }()
+
+/**
+  \rst
+  Constructs a compile-time format string from a string literal *s*.
+
+  **Example**::
+
+    // A compile-time error because 'd' is an invalid specifier for strings.
+    std::string s = fmt::format(FMT_STRING("{:d}"), "foo");
+  \endrst
+ */
+#define FMT_STRING(s) FMT_STRING_IMPL(s, fmt::compile_string, )
+
+#if FMT_USE_USER_DEFINED_LITERALS
+template <typename Char> struct udl_formatter {
+  basic_string_view<Char> str;
+
+  template <typename... T>
+  auto operator()(T&&... args) const -> std::basic_string<Char> {
+    return vformat(str, fmt::make_args_checked<T...>(str, args...));
+  }
+};
+
+#  if FMT_USE_NONTYPE_TEMPLATE_PARAMETERS
+template <typename T, typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct statically_named_arg : view {
+  static constexpr auto name = Str.data;
+
+  const T& value;
+  statically_named_arg(const T& v) : value(v) {}
+};
+
+template <typename T, typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct is_named_arg<statically_named_arg<T, Char, N, Str>> : std::true_type {};
+
+template <typename T, typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct is_statically_named_arg<statically_named_arg<T, Char, N, Str>>
+    : std::true_type {};
+
+template <typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct udl_arg {
+  template <typename T> auto operator=(T&& value) const {
+    return statically_named_arg<T, Char, N, Str>(std::forward<T>(value));
+  }
+};
+#  else
+template <typename Char> struct udl_arg {
+  const Char* str;
+
+  template <typename T> auto operator=(T&& value) const -> named_arg<Char, T> {
+    return {str, std::forward<T>(value)};
+  }
+};
+#  endif
+#endif  // FMT_USE_USER_DEFINED_LITERALS
+
+template <typename Locale, typename Char>
+auto vformat(const Locale& loc, basic_string_view<Char> format_str,
+             basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> std::basic_string<Char> {
+  basic_memory_buffer<Char> buffer;
+  detail::vformat_to(buffer, format_str, args, detail::locale_ref(loc));
+  return {buffer.data(), buffer.size()};
+}
+
+using format_func = void (*)(detail::buffer<char>&, int, const char*);
 
 FMT_API void format_error_code(buffer<char>& out, int error_code,
                                string_view message) FMT_NOEXCEPT;
 
 FMT_API void report_error(format_func func, int error_code,
-                          string_view message) FMT_NOEXCEPT;
-}  // namespace detail
+                          const char* message) FMT_NOEXCEPT;
+FMT_END_DETAIL_NAMESPACE
 
-template <typename OutputIt, typename Char>
-using arg_formatter FMT_DEPRECATED_ALIAS =
-    detail::arg_formatter<OutputIt, Char>;
+FMT_API auto vsystem_error(int error_code, string_view format_str,
+                           format_args args) -> std::system_error;
 
 /**
- An error returned by an operating system or a language runtime,
- for example a file opening error.
+ \rst
+ Constructs :class:`std::system_error` with a message formatted with
+ ``fmt::format(fmt, args...)``.
+  *error_code* is a system error code as given by ``errno``.
+
+ **Example**::
+
+   // This throws std::system_error with the description
+   //   cannot open file 'madeup': No such file or directory
+   // or similar (system message may vary).
+   const char* filename = "madeup";
+   std::FILE* file = std::fopen(filename, "r");
+   if (!file)
+     throw fmt::system_error(errno, "cannot open file '{}'", filename);
+ \endrst
 */
-FMT_CLASS_API
-class FMT_API system_error : public std::runtime_error {
- private:
-  void init(int err_code, string_view format_str, format_args args);
-
- protected:
-  int error_code_;
-
-  system_error() : std::runtime_error(""), error_code_(0) {}
-
- public:
-  /**
-   \rst
-   Constructs a :class:`fmt::system_error` object with a description
-   formatted with `fmt::format_system_error`. *message* and additional
-   arguments passed into the constructor are formatted similarly to
-   `fmt::format`.
-
-   **Example**::
-
-     // This throws a system_error with the description
-     //   cannot open file 'madeup': No such file or directory
-     // or similar (system message may vary).
-     const char *filename = "madeup";
-     std::FILE *file = std::fopen(filename, "r");
-     if (!file)
-       throw fmt::system_error(errno, "cannot open file '{}'", filename);
-   \endrst
-  */
-  template <typename... Args>
-  system_error(int error_code, string_view message, const Args&... args)
-      : std::runtime_error("") {
-    init(error_code, message, make_format_args(args...));
-  }
-  system_error(const system_error&) = default;
-  system_error& operator=(const system_error&) = default;
-  system_error(system_error&&) = default;
-  system_error& operator=(system_error&&) = default;
-  ~system_error() FMT_NOEXCEPT FMT_OVERRIDE;
-
-  int error_code() const { return error_code_; }
-};
+template <typename... T>
+auto system_error(int error_code, format_string<T...> fmt, T&&... args)
+    -> std::system_error {
+  return vsystem_error(error_code, fmt, fmt::make_format_args(args...));
+}
 
 /**
   \rst
-  Formats an error returned by an operating system or a language runtime,
-  for example a file opening error, and writes it to *out* in the following
-  form:
+  Formats an error message for an error returned by an operating system or a
+  language runtime, for example a file opening error, and writes it to *out*.
+  The format is the same as the one used by ``std::system_error(ec, message)``
+  where ``ec`` is ``std::error_code(error_code, std::generic_category()})``.
+  It is implementation-defined but normally looks like:
 
   .. parsed-literal::
      *<message>*: *<system-message>*
 
-  where *<message>* is the passed message and *<system-message>* is
-  the system message corresponding to the error code.
+  where *<message>* is the passed message and *<system-message>* is the system
+  message corresponding to the error code.
   *error_code* is a system error code as given by ``errno``.
-  If *error_code* is not a valid error code such as -1, the system message
-  may look like "Unknown error -1" and is platform-dependent.
   \endrst
  */
 FMT_API void format_system_error(detail::buffer<char>& out, int error_code,
-                                 string_view message) FMT_NOEXCEPT;
+                                 const char* message) FMT_NOEXCEPT;
 
 // Reports a system error without throwing an exception.
 // Can be used to report errors from destructors.
 FMT_API void report_system_error(int error_code,
-                                 string_view message) FMT_NOEXCEPT;
+                                 const char* message) FMT_NOEXCEPT;
 
 /** Fast integer formatter. */
 class format_int {
@@ -3339,12 +2506,12 @@ class format_int {
   mutable char buffer_[buffer_size];
   char* str_;
 
-  template <typename UInt> char* format_unsigned(UInt value) {
+  template <typename UInt> auto format_unsigned(UInt value) -> char* {
     auto n = static_cast<detail::uint32_or_64_or_128_t<UInt>>(value);
     return detail::format_decimal(buffer_, n, buffer_size - 1).begin;
   }
 
-  template <typename Int> char* format_signed(Int value) {
+  template <typename Int> auto format_signed(Int value) -> char* {
     auto abs_value = static_cast<detail::uint32_or_64_or_128_t<Int>>(value);
     bool negative = value < 0;
     if (negative) abs_value = 0 - abs_value;
@@ -3363,7 +2530,7 @@ class format_int {
       : str_(format_unsigned(value)) {}
 
   /** Returns the number of characters written to the output buffer. */
-  size_t size() const {
+  auto size() const -> size_t {
     return detail::to_unsigned(buffer_ - str_ + buffer_size - 1);
   }
 
@@ -3371,13 +2538,13 @@ class format_int {
     Returns a pointer to the output buffer content. No terminating null
     character is appended.
    */
-  const char* data() const { return str_; }
+  auto data() const -> const char* { return str_; }
 
   /**
     Returns a pointer to the output buffer content with terminating null
     character appended.
    */
-  const char* c_str() const {
+  auto c_str() const -> const char* {
     buffer_[buffer_size - 1] = '\0';
     return str_;
   }
@@ -3387,104 +2554,37 @@ class format_int {
     Returns the content of the output buffer as an ``std::string``.
     \endrst
    */
-  std::string str() const { return std::string(str_, size()); }
+  auto str() const -> std::string { return std::string(str_, size()); }
 };
 
-// A formatter specialization for the core types corresponding to detail::type
-// constants.
 template <typename T, typename Char>
-struct formatter<T, Char,
-                 enable_if_t<detail::type_constant<T, Char>::value !=
-                             detail::type::custom_type>> {
-  FMT_CONSTEXPR formatter() = default;
-
-  // Parses format specifiers stopping either at the end of the range or at the
-  // terminating '}'.
-  template <typename ParseContext>
-  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    using handler_type = detail::dynamic_specs_handler<ParseContext>;
-    auto type = detail::type_constant<T, Char>::value;
-    detail::specs_checker<handler_type> handler(handler_type(specs_, ctx),
-                                                type);
-    auto it = parse_format_specs(ctx.begin(), ctx.end(), handler);
-    auto eh = ctx.error_handler();
-    switch (type) {
-    case detail::type::none_type:
-      FMT_ASSERT(false, "invalid argument type");
-      break;
-    case detail::type::int_type:
-    case detail::type::uint_type:
-    case detail::type::long_long_type:
-    case detail::type::ulong_long_type:
-    case detail::type::int128_type:
-    case detail::type::uint128_type:
-    case detail::type::bool_type:
-      handle_int_type_spec(specs_.type,
-                           detail::int_type_checker<decltype(eh)>(eh));
-      break;
-    case detail::type::char_type:
-      handle_char_specs(
-          &specs_, detail::char_specs_checker<decltype(eh)>(specs_.type, eh));
-      break;
-    case detail::type::float_type:
-      if (detail::const_check(FMT_USE_FLOAT))
-        detail::parse_float_type_spec(specs_, eh);
-      else
-        FMT_ASSERT(false, "float support disabled");
-      break;
-    case detail::type::double_type:
-      if (detail::const_check(FMT_USE_DOUBLE))
-        detail::parse_float_type_spec(specs_, eh);
-      else
-        FMT_ASSERT(false, "double support disabled");
-      break;
-    case detail::type::long_double_type:
-      if (detail::const_check(FMT_USE_LONG_DOUBLE))
-        detail::parse_float_type_spec(specs_, eh);
-      else
-        FMT_ASSERT(false, "long double support disabled");
-      break;
-    case detail::type::cstring_type:
-      detail::handle_cstring_type_spec(
-          specs_.type, detail::cstring_type_checker<decltype(eh)>(eh));
-      break;
-    case detail::type::string_type:
-      detail::check_string_type_spec(specs_.type, eh);
-      break;
-    case detail::type::pointer_type:
-      detail::check_pointer_type_spec(specs_.type, eh);
-      break;
-    case detail::type::custom_type:
-      // Custom format specifiers should be checked in parse functions of
-      // formatter specializations.
-      break;
-    }
-    return it;
-  }
-
-  template <typename FormatContext>
-  auto format(const T& val, FormatContext& ctx) -> decltype(ctx.out()) {
-    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
-                                                       specs_.width_ref, ctx);
+template <typename FormatContext>
+FMT_CONSTEXPR FMT_INLINE auto
+formatter<T, Char,
+          enable_if_t<detail::type_constant<T, Char>::value !=
+                      detail::type::custom_type>>::format(const T& val,
+                                                          FormatContext& ctx)
+    const -> decltype(ctx.out()) {
+  if (specs_.width_ref.kind != detail::arg_id_kind::none ||
+      specs_.precision_ref.kind != detail::arg_id_kind::none) {
+    auto specs = specs_;
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width,
+                                                       specs.width_ref, ctx);
     detail::handle_dynamic_spec<detail::precision_checker>(
-        specs_.precision, specs_.precision_ref, ctx);
-    using af = detail::arg_formatter<typename FormatContext::iterator,
-                                     typename FormatContext::char_type>;
-    return visit_format_arg(af(ctx, nullptr, &specs_),
-                            detail::make_arg<FormatContext>(val));
+        specs.precision, specs.precision_ref, ctx);
+    return detail::write<Char>(ctx.out(), val, specs, ctx.locale());
   }
+  return detail::write<Char>(ctx.out(), val, specs_, ctx.locale());
+}
 
- private:
-  detail::dynamic_format_specs<Char> specs_;
-};
-
-#define FMT_FORMAT_AS(Type, Base)                                             \
-  template <typename Char>                                                    \
-  struct formatter<Type, Char> : formatter<Base, Char> {                      \
-    template <typename FormatContext>                                         \
-    auto format(Type const& val, FormatContext& ctx) -> decltype(ctx.out()) { \
-      return formatter<Base, Char>::format(val, ctx);                         \
-    }                                                                         \
+#define FMT_FORMAT_AS(Type, Base)                                        \
+  template <typename Char>                                               \
+  struct formatter<Type, Char> : formatter<Base, Char> {                 \
+    template <typename FormatContext>                                    \
+    auto format(Type const& val, FormatContext& ctx) const               \
+        -> decltype(ctx.out()) {                                         \
+      return formatter<Base, Char>::format(static_cast<Base>(val), ctx); \
+    }                                                                    \
   }
 
 FMT_FORMAT_AS(signed char, int);
@@ -3496,12 +2596,13 @@ FMT_FORMAT_AS(unsigned long, unsigned long long);
 FMT_FORMAT_AS(Char*, const Char*);
 FMT_FORMAT_AS(std::basic_string<Char>, basic_string_view<Char>);
 FMT_FORMAT_AS(std::nullptr_t, const void*);
+FMT_FORMAT_AS(detail::byte, unsigned char);
 FMT_FORMAT_AS(detail::std_string_view<Char>, basic_string_view<Char>);
 
 template <typename Char>
 struct formatter<void*, Char> : formatter<const void*, Char> {
   template <typename FormatContext>
-  auto format(void* val, FormatContext& ctx) -> decltype(ctx.out()) {
+  auto format(void* val, FormatContext& ctx) const -> decltype(ctx.out()) {
     return formatter<const void*, Char>::format(val, ctx);
   }
 };
@@ -3509,7 +2610,8 @@ struct formatter<void*, Char> : formatter<const void*, Char> {
 template <typename Char, size_t N>
 struct formatter<Char[N], Char> : formatter<basic_string_view<Char>, Char> {
   template <typename FormatContext>
-  auto format(const Char* val, FormatContext& ctx) -> decltype(ctx.out()) {
+  FMT_CONSTEXPR auto format(const Char* val, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
     return formatter<basic_string_view<Char>, Char>::format(val, ctx);
   }
 };
@@ -3528,21 +2630,29 @@ struct formatter<Char[N], Char> : formatter<basic_string_view<Char>, Char> {
 //   };
 template <typename Char = char> class dynamic_formatter {
  private:
+  detail::dynamic_format_specs<Char> specs_;
+  const Char* format_str_;
+
   struct null_handler : detail::error_handler {
     void on_align(align_t) {}
-    void on_plus() {}
-    void on_minus() {}
-    void on_space() {}
+    void on_sign(sign_t) {}
     void on_hash() {}
   };
 
+  template <typename Context> void handle_specs(Context& ctx) {
+    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
+                                                       specs_.width_ref, ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(
+        specs_.precision, specs_.precision_ref, ctx);
+  }
+
  public:
   template <typename ParseContext>
-  auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     format_str_ = ctx.begin();
     // Checks are deferred to formatting time when the argument type is known.
     detail::dynamic_specs_handler<ParseContext> handler(specs_, ctx);
-    return parse_format_specs(ctx.begin(), ctx.end(), handler);
+    return detail::parse_format_specs(ctx.begin(), ctx.end(), handler);
   }
 
   template <typename T, typename FormatContext>
@@ -3551,46 +2661,13 @@ template <typename Char = char> class dynamic_formatter {
     detail::specs_checker<null_handler> checker(
         null_handler(), detail::mapped_type_constant<T, FormatContext>::value);
     checker.on_align(specs_.align);
-    switch (specs_.sign) {
-    case sign::none:
-      break;
-    case sign::plus:
-      checker.on_plus();
-      break;
-    case sign::minus:
-      checker.on_minus();
-      break;
-    case sign::space:
-      checker.on_space();
-      break;
-    }
+    if (specs_.sign != sign::none) checker.on_sign(specs_.sign);
     if (specs_.alt) checker.on_hash();
     if (specs_.precision >= 0) checker.end_precision();
-    using af = detail::arg_formatter<typename FormatContext::iterator,
-                                     typename FormatContext::char_type>;
-    visit_format_arg(af(ctx, nullptr, &specs_),
-                     detail::make_arg<FormatContext>(val));
-    return ctx.out();
+    return detail::write<Char>(ctx.out(), val, specs_, ctx.locale());
   }
-
- private:
-  template <typename Context> void handle_specs(Context& ctx) {
-    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
-                                                       specs_.width_ref, ctx);
-    detail::handle_dynamic_spec<detail::precision_checker>(
-        specs_.precision, specs_.precision_ref, ctx);
-  }
-
-  detail::dynamic_format_specs<Char> specs_;
-  const Char* format_str_;
 };
 
-template <typename Char, typename ErrorHandler>
-FMT_CONSTEXPR void advance_to(
-    basic_format_parse_context<Char, ErrorHandler>& ctx, const Char* p) {
-  ctx.advance_to(ctx.begin() + (p - &*ctx.begin()));
-}
-
 /**
   \rst
   Converts ``p`` to ``const void*`` for pointer formatting.
@@ -3600,11 +2677,14 @@ FMT_CONSTEXPR void advance_to(
     auto s = fmt::format("{}", fmt::ptr(p));
   \endrst
  */
-template <typename T> inline const void* ptr(const T* p) { return p; }
-template <typename T> inline const void* ptr(const std::unique_ptr<T>& p) {
+template <typename T> auto ptr(T p) -> const void* {
+  static_assert(std::is_pointer<T>::value, "");
+  return detail::bit_cast<const void*>(p);
+}
+template <typename T> auto ptr(const std::unique_ptr<T>& p) -> const void* {
   return p.get();
 }
-template <typename T> inline const void* ptr(const std::shared_ptr<T>& p) {
+template <typename T> auto ptr(const std::shared_ptr<T>& p) -> const void* {
   return p.get();
 }
 
@@ -3642,31 +2722,114 @@ template <> struct formatter<bytes> {
   }
 };
 
-template <typename It, typename Sentinel, typename Char>
-struct arg_join : detail::view {
+// group_digits_view is not derived from view because it copies the argument.
+template <typename T> struct group_digits_view { T value; };
+
+/**
+  \rst
+  Returns a view that formats an integer value using ',' as a locale-independent
+  thousands separator.
+
+  **Example**::
+
+    fmt::print("{}", fmt::group_digits(12345));
+    // Output: "12,345"
+  \endrst
+ */
+template <typename T> auto group_digits(T value) -> group_digits_view<T> {
+  return {value};
+}
+
+template <typename T> struct formatter<group_digits_view<T>> : formatter<T> {
+ private:
+  detail::dynamic_format_specs<char> specs_;
+
+ public:
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    using handler_type = detail::dynamic_specs_handler<ParseContext>;
+    detail::specs_checker<handler_type> handler(handler_type(specs_, ctx),
+                                                detail::type::int_type);
+    auto it = parse_format_specs(ctx.begin(), ctx.end(), handler);
+    detail::check_string_type_spec(specs_.type, ctx.error_handler());
+    return it;
+  }
+
+  template <typename FormatContext>
+  auto format(group_digits_view<T> t, FormatContext& ctx)
+      -> decltype(ctx.out()) {
+    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
+                                                       specs_.width_ref, ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(
+        specs_.precision, specs_.precision_ref, ctx);
+    return detail::write_int_localized(
+        ctx.out(), static_cast<detail::uint64_or_128_t<T>>(t.value), 0, specs_,
+        detail::digit_grouping<char>({"\3", ','}));
+  }
+};
+
+template <typename It, typename Sentinel, typename Char = char>
+struct join_view : detail::view {
   It begin;
   Sentinel end;
   basic_string_view<Char> sep;
 
-  arg_join(It b, Sentinel e, basic_string_view<Char> s)
+  join_view(It b, Sentinel e, basic_string_view<Char> s)
       : begin(b), end(e), sep(s) {}
 };
 
 template <typename It, typename Sentinel, typename Char>
-struct formatter<arg_join<It, Sentinel, Char>, Char>
-    : formatter<typename std::iterator_traits<It>::value_type, Char> {
+using arg_join FMT_DEPRECATED_ALIAS = join_view<It, Sentinel, Char>;
+
+template <typename It, typename Sentinel, typename Char>
+struct formatter<join_view<It, Sentinel, Char>, Char> {
+ private:
+  using value_type =
+#ifdef __cpp_lib_ranges
+      std::iter_value_t<It>;
+#else
+      typename std::iterator_traits<It>::value_type;
+#endif
+  using context = buffer_context<Char>;
+  using mapper = detail::arg_mapper<context>;
+
+  template <typename T, FMT_ENABLE_IF(has_formatter<T, context>::value)>
+  static auto map(const T& value) -> const T& {
+    return value;
+  }
+  template <typename T, FMT_ENABLE_IF(!has_formatter<T, context>::value)>
+  static auto map(const T& value) -> decltype(mapper().map(value)) {
+    return mapper().map(value);
+  }
+
+  using formatter_type =
+      conditional_t<is_formattable<value_type, Char>::value,
+                    formatter<remove_cvref_t<decltype(map(
+                                  std::declval<const value_type&>()))>,
+                              Char>,
+                    detail::fallback_formatter<value_type, Char>>;
+
+  formatter_type value_formatter_;
+
+ public:
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return value_formatter_.parse(ctx);
+  }
+
   template <typename FormatContext>
-  auto format(const arg_join<It, Sentinel, Char>& value, FormatContext& ctx)
+  auto format(const join_view<It, Sentinel, Char>& value, FormatContext& ctx)
       -> decltype(ctx.out()) {
-    using base = formatter<typename std::iterator_traits<It>::value_type, Char>;
     auto it = value.begin;
     auto out = ctx.out();
     if (it != value.end) {
-      out = base::format(*it++, ctx);
+      out = value_formatter_.format(map(*it), ctx);
+      ++it;
       while (it != value.end) {
-        out = std::copy(value.sep.begin(), value.sep.end(), out);
+        out = detail::copy_str<Char>(value.sep.begin(), value.sep.end(), out);
         ctx.advance_to(out);
-        out = base::format(*it++, ctx);
+        out = value_formatter_.format(map(*it), ctx);
+        ++it;
       }
     }
     return out;
@@ -3674,22 +2837,17 @@ struct formatter<arg_join<It, Sentinel, Char>, Char>
 };
 
 /**
-  Returns an object that formats the iterator range `[begin, end)` with elements
+  Returns a view that formats the iterator range `[begin, end)` with elements
   separated by `sep`.
  */
 template <typename It, typename Sentinel>
-arg_join<It, Sentinel, char> join(It begin, Sentinel end, string_view sep) {
-  return {begin, end, sep};
-}
-
-template <typename It, typename Sentinel>
-arg_join<It, Sentinel, wchar_t> join(It begin, Sentinel end, wstring_view sep) {
+auto join(It begin, Sentinel end, string_view sep) -> join_view<It, Sentinel> {
   return {begin, end, sep};
 }
 
 /**
   \rst
-  Returns an object that formats `range` with elements separated by `sep`.
+  Returns a view that formats `range` with elements separated by `sep`.
 
   **Example**::
 
@@ -3704,14 +2862,8 @@ arg_join<It, Sentinel, wchar_t> join(It begin, Sentinel end, wstring_view sep) {
   \endrst
  */
 template <typename Range>
-arg_join<detail::iterator_t<Range>, detail::sentinel_t<Range>, char> join(
-    Range&& range, string_view sep) {
-  return join(std::begin(range), std::end(range), sep);
-}
-
-template <typename Range>
-arg_join<detail::iterator_t<Range>, detail::sentinel_t<Range>, wchar_t> join(
-    Range&& range, wstring_view sep) {
+auto join(Range&& range, string_view sep)
+    -> join_view<detail::iterator_t<Range>, detail::sentinel_t<Range>> {
   return join(std::begin(range), std::end(range), sep);
 }
 
@@ -3727,209 +2879,142 @@ arg_join<detail::iterator_t<Range>, detail::sentinel_t<Range>, wchar_t> join(
   \endrst
  */
 template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
-inline std::string to_string(const T& value) {
-  std::string result;
+inline auto to_string(const T& value) -> std::string {
+  auto result = std::string();
   detail::write<char>(std::back_inserter(result), value);
   return result;
 }
 
 template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-inline std::string to_string(T value) {
-  // The buffer should be large enough to store the number including the sign or
-  // "false" for bool.
+FMT_NODISCARD inline auto to_string(T value) -> std::string {
+  // The buffer should be large enough to store the number including the sign
+  // or "false" for bool.
   constexpr int max_size = detail::digits10<T>() + 2;
   char buffer[max_size > 5 ? static_cast<unsigned>(max_size) : 5];
   char* begin = buffer;
   return std::string(begin, detail::write<char>(begin, value));
 }
 
-/**
-  Converts *value* to ``std::wstring`` using the default format for type *T*.
- */
-template <typename T> inline std::wstring to_wstring(const T& value) {
-  return format(L"{}", value);
-}
-
 template <typename Char, size_t SIZE>
-std::basic_string<Char> to_string(const basic_memory_buffer<Char, SIZE>& buf) {
+FMT_NODISCARD auto to_string(const basic_memory_buffer<Char, SIZE>& buf)
+    -> std::basic_string<Char> {
   auto size = buf.size();
   detail::assume(size < std::basic_string<Char>().max_size());
   return std::basic_string<Char>(buf.data(), size);
 }
 
+FMT_BEGIN_DETAIL_NAMESPACE
+
 template <typename Char>
-void detail::vformat_to(
-    detail::buffer<Char>& buf, basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args,
-    detail::locale_ref loc) {
-  using iterator = typename buffer_context<Char>::iterator;
+void vformat_to(
+    buffer<Char>& buf, basic_string_view<Char> fmt,
+    basic_format_args<FMT_BUFFER_CONTEXT(type_identity_t<Char>)> args,
+    locale_ref loc) {
+  // workaround for msvc bug regarding name-lookup in module
+  // link names into function scope
+  using detail::arg_formatter;
+  using detail::buffer_appender;
+  using detail::custom_formatter;
+  using detail::default_arg_formatter;
+  using detail::get_arg;
+  using detail::locale_ref;
+  using detail::parse_format_specs;
+  using detail::specs_checker;
+  using detail::specs_handler;
+  using detail::to_unsigned;
+  using detail::type;
+  using detail::write;
   auto out = buffer_appender<Char>(buf);
-  if (format_str.size() == 2 && equal2(format_str.data(), "{}")) {
+  if (fmt.size() == 2 && equal2(fmt.data(), "{}")) {
     auto arg = args.get(0);
     if (!arg) error_handler().on_error("argument not found");
-    visit_format_arg(default_arg_formatter<iterator, Char>{out, args, loc},
-                     arg);
+    visit_format_arg(default_arg_formatter<Char>{out, args, loc}, arg);
     return;
   }
-  format_handler<iterator, Char, buffer_context<Char>> h(out, format_str, args,
-                                                         loc);
-  parse_format_string<false>(format_str, h);
+
+  struct format_handler : error_handler {
+    basic_format_parse_context<Char> parse_context;
+    buffer_context<Char> context;
+
+    format_handler(buffer_appender<Char> out, basic_string_view<Char> str,
+                   basic_format_args<buffer_context<Char>> args, locale_ref loc)
+        : parse_context(str), context(out, args, loc) {}
+
+    void on_text(const Char* begin, const Char* end) {
+      auto text = basic_string_view<Char>(begin, to_unsigned(end - begin));
+      context.advance_to(write<Char>(context.out(), text));
+    }
+
+    FMT_CONSTEXPR auto on_arg_id() -> int {
+      return parse_context.next_arg_id();
+    }
+    FMT_CONSTEXPR auto on_arg_id(int id) -> int {
+      return parse_context.check_arg_id(id), id;
+    }
+    FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
+      int arg_id = context.arg_id(id);
+      if (arg_id < 0) on_error("argument not found");
+      return arg_id;
+    }
+
+    FMT_INLINE void on_replacement_field(int id, const Char*) {
+      auto arg = get_arg(context, id);
+      context.advance_to(visit_format_arg(
+          default_arg_formatter<Char>{context.out(), context.args(),
+                                      context.locale()},
+          arg));
+    }
+
+    auto on_format_specs(int id, const Char* begin, const Char* end)
+        -> const Char* {
+      auto arg = get_arg(context, id);
+      if (arg.type() == type::custom_type) {
+        parse_context.advance_to(parse_context.begin() +
+                                 (begin - &*parse_context.begin()));
+        visit_format_arg(custom_formatter<Char>{parse_context, context}, arg);
+        return parse_context.begin();
+      }
+      auto specs = basic_format_specs<Char>();
+      specs_checker<specs_handler<Char>> handler(
+          specs_handler<Char>(specs, parse_context, context), arg.type());
+      begin = parse_format_specs(begin, end, handler);
+      if (begin == end || *begin != '}')
+        on_error("missing '}' in format string");
+      auto f = arg_formatter<Char>{context.out(), specs, context.locale()};
+      context.advance_to(visit_format_arg(f, arg));
+      return begin;
+    }
+  };
+  detail::parse_format_string<false>(fmt, format_handler(out, fmt, args, loc));
 }
 
 #ifndef FMT_HEADER_ONLY
-extern template void detail::vformat_to(detail::buffer<char>&, string_view,
-                                        basic_format_args<format_context>,
-                                        detail::locale_ref);
-namespace detail {
+extern template FMT_API auto thousands_sep_impl<char>(locale_ref)
+    -> thousands_sep_result<char>;
+extern template FMT_API auto thousands_sep_impl<wchar_t>(locale_ref)
+    -> thousands_sep_result<wchar_t>;
+extern template FMT_API auto decimal_point_impl(locale_ref) -> char;
+extern template FMT_API auto decimal_point_impl(locale_ref) -> wchar_t;
+extern template auto format_float<double>(double value, int precision,
+                                          float_specs specs, buffer<char>& buf)
+    -> int;
+extern template auto format_float<long double>(long double value, int precision,
+                                               float_specs specs,
+                                               buffer<char>& buf) -> int;
+void snprintf_float(float, int, float_specs, buffer<char>&) = delete;
+extern template auto snprintf_float<double>(double value, int precision,
+                                            float_specs specs,
+                                            buffer<char>& buf) -> int;
+extern template auto snprintf_float<long double>(long double value,
+                                                 int precision,
+                                                 float_specs specs,
+                                                 buffer<char>& buf) -> int;
+#endif  // FMT_HEADER_ONLY
 
-extern template FMT_API std::string grouping_impl<char>(locale_ref loc);
-extern template FMT_API std::string grouping_impl<wchar_t>(locale_ref loc);
-extern template FMT_API char thousands_sep_impl<char>(locale_ref loc);
-extern template FMT_API wchar_t thousands_sep_impl<wchar_t>(locale_ref loc);
-extern template FMT_API char decimal_point_impl(locale_ref loc);
-extern template FMT_API wchar_t decimal_point_impl(locale_ref loc);
-extern template int format_float<double>(double value, int precision,
-                                         float_specs specs, buffer<char>& buf);
-extern template int format_float<long double>(long double value, int precision,
-                                              float_specs specs,
-                                              buffer<char>& buf);
-int snprintf_float(float value, int precision, float_specs specs,
-                   buffer<char>& buf) = delete;
-extern template int snprintf_float<double>(double value, int precision,
-                                           float_specs specs,
-                                           buffer<char>& buf);
-extern template int snprintf_float<long double>(long double value,
-                                                int precision,
-                                                float_specs specs,
-                                                buffer<char>& buf);
-}  // namespace detail
-#endif
-
-template <typename S, typename Char = char_t<S>,
-          FMT_ENABLE_IF(detail::is_string<S>::value)>
-inline void vformat_to(
-    detail::buffer<Char>& buf, const S& format_str,
-    basic_format_args<FMT_BUFFER_CONTEXT(type_identity_t<Char>)> args) {
-  return detail::vformat_to(buf, to_string_view(format_str), args);
-}
-
-template <typename S, typename... Args, size_t SIZE = inline_buffer_size,
-          typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
-inline typename buffer_context<Char>::iterator format_to(
-    basic_memory_buffer<Char, SIZE>& buf, const S& format_str, Args&&... args) {
-  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
-  detail::vformat_to(buf, to_string_view(format_str), vargs);
-  return detail::buffer_appender<Char>(buf);
-}
-
-template <typename OutputIt, typename Char = char>
-using format_context_t = basic_format_context<OutputIt, Char>;
-
-template <typename OutputIt, typename Char = char>
-using format_args_t = basic_format_args<format_context_t<OutputIt, Char>>;
-
-template <typename OutputIt, typename Char = typename OutputIt::value_type>
-using format_to_n_context FMT_DEPRECATED_ALIAS = buffer_context<Char>;
-
-template <typename OutputIt, typename Char = typename OutputIt::value_type>
-using format_to_n_args FMT_DEPRECATED_ALIAS =
-    basic_format_args<buffer_context<Char>>;
-
-template <typename OutputIt, typename Char, typename... Args>
-FMT_DEPRECATED format_arg_store<buffer_context<Char>, Args...>
-make_format_to_n_args(const Args&... args) {
-  return format_arg_store<buffer_context<Char>, Args...>(args...);
-}
-
-template <typename Char, enable_if_t<(!std::is_same<Char, char>::value), int>>
-std::basic_string<Char> detail::vformat(
-    basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  basic_memory_buffer<Char> buffer;
-  detail::vformat_to(buffer, format_str, args);
-  return to_string(buffer);
-}
-
-template <typename Char, FMT_ENABLE_IF(std::is_same<Char, wchar_t>::value)>
-void vprint(std::FILE* f, basic_string_view<Char> format_str,
-            wformat_args args) {
-  wmemory_buffer buffer;
-  detail::vformat_to(buffer, format_str, args);
-  buffer.push_back(L'\0');
-  if (std::fputws(buffer.data(), f) == -1)
-    FMT_THROW(system_error(errno, "cannot write to file"));
-}
-
-template <typename Char, FMT_ENABLE_IF(std::is_same<Char, wchar_t>::value)>
-void vprint(basic_string_view<Char> format_str, wformat_args args) {
-  vprint(stdout, format_str, args);
-}
+FMT_END_DETAIL_NAMESPACE
 
 #if FMT_USE_USER_DEFINED_LITERALS
-namespace detail {
-
-#  if FMT_USE_UDL_TEMPLATE
-template <typename Char, Char... CHARS> class udl_formatter {
- public:
-  template <typename... Args>
-  std::basic_string<Char> operator()(Args&&... args) const {
-    static FMT_CONSTEXPR_DECL Char s[] = {CHARS..., '\0'};
-    return format(FMT_STRING(s), std::forward<Args>(args)...);
-  }
-};
-#  else
-template <typename Char> struct udl_formatter {
-  basic_string_view<Char> str;
-
-  template <typename... Args>
-  std::basic_string<Char> operator()(Args&&... args) const {
-    return format(str, std::forward<Args>(args)...);
-  }
-};
-#  endif  // FMT_USE_UDL_TEMPLATE
-
-template <typename Char> struct udl_arg {
-  const Char* str;
-
-  template <typename T> named_arg<Char, T> operator=(T&& value) const {
-    return {str, std::forward<T>(value)};
-  }
-};
-}  // namespace detail
-
 inline namespace literals {
-#  if FMT_USE_UDL_TEMPLATE
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wpedantic"
-#    if FMT_CLANG_VERSION
-#      pragma GCC diagnostic ignored "-Wgnu-string-literal-operator-template"
-#    endif
-template <typename Char, Char... CHARS>
-FMT_CONSTEXPR detail::udl_formatter<Char, CHARS...> operator""_format() {
-  return {};
-}
-#    pragma GCC diagnostic pop
-#  else
-/**
-  \rst
-  User-defined literal equivalent of :func:`fmt::format`.
-
-  **Example**::
-
-    using namespace fmt::literals;
-    std::string message = "The answer is {}"_format(42);
-  \endrst
- */
-FMT_CONSTEXPR detail::udl_formatter<char> operator"" _format(const char* s,
-                                                             size_t n) {
-  return {{s, n}};
-}
-FMT_CONSTEXPR detail::udl_formatter<wchar_t> operator"" _format(
-    const wchar_t* s, size_t n) {
-  return {{s, n}};
-}
-#  endif  // FMT_USE_UDL_TEMPLATE
-
 /**
   \rst
   User-defined literal equivalent of :func:`fmt::arg`.
@@ -3940,16 +3025,75 @@ FMT_CONSTEXPR detail::udl_formatter<wchar_t> operator"" _format(
     fmt::print("Elapsed time: {s:.2f} seconds", "s"_a=1.23);
   \endrst
  */
-FMT_CONSTEXPR detail::udl_arg<char> operator"" _a(const char* s, size_t) {
+#  if FMT_USE_NONTYPE_TEMPLATE_PARAMETERS
+template <detail_exported::fixed_string Str>
+constexpr auto operator""_a()
+    -> detail::udl_arg<remove_cvref_t<decltype(Str.data[0])>,
+                       sizeof(Str.data) / sizeof(decltype(Str.data[0])), Str> {
+  return {};
+}
+#  else
+constexpr auto operator"" _a(const char* s, size_t) -> detail::udl_arg<char> {
   return {s};
 }
-FMT_CONSTEXPR detail::udl_arg<wchar_t> operator"" _a(const wchar_t* s, size_t) {
-  return {s};
+#  endif
+
+// DEPRECATED!
+// User-defined literal equivalent of fmt::format.
+FMT_DEPRECATED constexpr auto operator"" _format(const char* s, size_t n)
+    -> detail::udl_formatter<char> {
+  return {{s, n}};
 }
 }  // namespace literals
 #endif  // FMT_USE_USER_DEFINED_LITERALS
+
+template <typename Locale, FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
+inline auto vformat(const Locale& loc, string_view fmt, format_args args)
+    -> std::string {
+  return detail::vformat(loc, fmt, args);
+}
+
+template <typename Locale, typename... T,
+          FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
+inline auto format(const Locale& loc, format_string<T...> fmt, T&&... args)
+    -> std::string {
+  return vformat(loc, string_view(fmt), fmt::make_format_args(args...));
+}
+
+template <typename... T, size_t SIZE, typename Allocator>
+FMT_DEPRECATED auto format_to(basic_memory_buffer<char, SIZE, Allocator>& buf,
+                              format_string<T...> fmt, T&&... args)
+    -> appender {
+  detail::vformat_to(buf, string_view(fmt), fmt::make_format_args(args...));
+  return appender(buf);
+}
+
+template <typename OutputIt, typename Locale,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value&&
+                            detail::is_locale<Locale>::value)>
+auto vformat_to(OutputIt out, const Locale& loc, string_view fmt,
+                format_args args) -> OutputIt {
+  using detail::get_buffer;
+  auto&& buf = get_buffer<char>(out);
+  detail::vformat_to(buf, fmt, args, detail::locale_ref(loc));
+  return detail::get_iterator(buf);
+}
+
+template <typename OutputIt, typename Locale, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value&&
+                            detail::is_locale<Locale>::value)>
+FMT_INLINE auto format_to(OutputIt out, const Locale& loc,
+                          format_string<T...> fmt, T&&... args) -> OutputIt {
+  return vformat_to(out, loc, fmt, fmt::make_format_args(args...));
+}
+
+FMT_MODULE_EXPORT_END
 FMT_END_NAMESPACE
 
+#ifdef FMT_DEPRECATED_INCLUDE_XCHAR
+#  include "xchar.h"
+#endif
+
 #ifdef FMT_HEADER_ONLY
 #  define FMT_FUNC inline
 #  include "format-inl.h"
diff --git a/src/fmt/locale.h b/src/fmt/locale.h
index 7301bf92a2..7571b5261b 100644
--- a/src/fmt/locale.h
+++ b/src/fmt/locale.h
@@ -1,64 +1,2 @@
-// Formatting library for C++ - std::locale support
-//
-// Copyright (c) 2012 - present, Victor Zverovich
-// All rights reserved.
-//
-// For the license information refer to format.h.
-
-#ifndef FMT_LOCALE_H_
-#define FMT_LOCALE_H_
-
-#include <locale>
-
-#include "format.h"
-
-FMT_BEGIN_NAMESPACE
-
-namespace detail {
-template <typename Char>
-std::basic_string<Char> vformat(
-    const std::locale& loc, basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  basic_memory_buffer<Char> buffer;
-  detail::vformat_to(buffer, format_str, args, detail::locale_ref(loc));
-  return fmt::to_string(buffer);
-}
-}  // namespace detail
-
-template <typename S, typename Char = char_t<S>>
-inline std::basic_string<Char> vformat(
-    const std::locale& loc, const S& format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  return detail::vformat(loc, to_string_view(format_str), args);
-}
-
-template <typename S, typename... Args, typename Char = char_t<S>>
-inline std::basic_string<Char> format(const std::locale& loc,
-                                      const S& format_str, Args&&... args) {
-  return detail::vformat(loc, to_string_view(format_str),
-                         fmt::make_args_checked<Args...>(format_str, args...));
-}
-
-template <typename S, typename OutputIt, typename... Args,
-          typename Char = char_t<S>,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
-inline OutputIt vformat_to(
-    OutputIt out, const std::locale& loc, const S& format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
-  vformat_to(buf, to_string_view(format_str), args, detail::locale_ref(loc));
-  return detail::get_iterator(buf);
-}
-
-template <typename OutputIt, typename S, typename... Args,
-          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value>
-inline auto format_to(OutputIt out, const std::locale& loc,
-                      const S& format_str, Args&&... args) ->
-    typename std::enable_if<enable, OutputIt>::type {
-  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
-  return vformat_to(out, loc, to_string_view(format_str), vargs);
-}
-
-FMT_END_NAMESPACE
-
-#endif  // FMT_LOCALE_H_
+#include "xchar.h"
+#warning fmt/locale.h is deprecated, include fmt/format.h or fmt/xchar.h instead
diff --git a/src/fmt/os.h b/src/fmt/os.h
index e28738041a..b64f8bbfa5 100644
--- a/src/fmt/os.h
+++ b/src/fmt/os.h
@@ -8,16 +8,12 @@
 #ifndef FMT_OS_H_
 #define FMT_OS_H_
 
-#if defined(__MINGW32__) || defined(__CYGWIN__)
-// Workaround MinGW bug https://sourceforge.net/p/mingw/bugs/2024/.
-#  undef __STRICT_ANSI__
-#endif
-
 #include <cerrno>
-#include <clocale>  // for locale_t
+#include <clocale>  // locale_t
 #include <cstddef>
 #include <cstdio>
-#include <cstdlib>  // for strtod_l
+#include <cstdlib>       // strtod_l
+#include <system_error>  // std::system_error
 
 #if defined __APPLE__ || defined(__FreeBSD__)
 #  include <xlocale.h>  // for LC_NUMERIC_MASK on OS X
@@ -25,17 +21,20 @@
 
 #include "format.h"
 
+#ifndef FMT_USE_FCNTL
 // UWP doesn't provide _pipe.
-#if FMT_HAS_INCLUDE("winapifamily.h")
-#  include <winapifamily.h>
-#endif
-#if (FMT_HAS_INCLUDE(<fcntl.h>) || defined(__APPLE__) || \
-     defined(__linux__)) &&                              \
-    (!defined(WINAPI_FAMILY) || (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP))
-#  include <fcntl.h>  // for O_RDONLY
-#  define FMT_USE_FCNTL 1
-#else
-#  define FMT_USE_FCNTL 0
+#  if FMT_HAS_INCLUDE("winapifamily.h")
+#    include <winapifamily.h>
+#  endif
+#  if (FMT_HAS_INCLUDE(<fcntl.h>) || defined(__APPLE__) || \
+       defined(__linux__)) &&                              \
+      (!defined(WINAPI_FAMILY) ||                          \
+       (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP))
+#    include <fcntl.h>  // for O_RDONLY
+#    define FMT_USE_FCNTL 1
+#  else
+#    define FMT_USE_FCNTL 0
+#  endif
 #endif
 
 #ifndef FMT_POSIX
@@ -74,6 +73,7 @@
 #define FMT_RETRY(result, expression) FMT_RETRY_VAL(result, expression, -1)
 
 FMT_BEGIN_NAMESPACE
+FMT_MODULE_EXPORT_BEGIN
 
 /**
   \rst
@@ -122,19 +122,28 @@ template <typename Char> class basic_cstring_view {
 using cstring_view = basic_cstring_view<char>;
 using wcstring_view = basic_cstring_view<wchar_t>;
 
-// An error code.
-class error_code {
- private:
-  int value_;
+template <typename Char> struct formatter<std::error_code, Char> {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return ctx.begin();
+  }
 
- public:
-  explicit error_code(int value = 0) FMT_NOEXCEPT : value_(value) {}
-
-  int get() const FMT_NOEXCEPT { return value_; }
+  template <typename FormatContext>
+  FMT_CONSTEXPR auto format(const std::error_code& ec, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    auto out = ctx.out();
+    out = detail::write_bytes(out, ec.category().name(),
+                              basic_format_specs<Char>());
+    out = detail::write<Char>(out, Char(':'));
+    out = detail::write<Char>(out, ec.value());
+    return out;
+  }
 };
 
 #ifdef _WIN32
-namespace detail {
+FMT_API const std::error_category& system_category() FMT_NOEXCEPT;
+
+FMT_BEGIN_DETAIL_NAMESPACE
 // A converter from UTF-16 to UTF-8.
 // It is only provided for Windows since other systems support UTF-8 natively.
 class utf16_to_utf8 {
@@ -143,7 +152,7 @@ class utf16_to_utf8 {
 
  public:
   utf16_to_utf8() {}
-  FMT_API explicit utf16_to_utf8(wstring_view s);
+  FMT_API explicit utf16_to_utf8(basic_string_view<wchar_t> s);
   operator string_view() const { return string_view(&buffer_[0], size()); }
   size_t size() const { return buffer_.size() - 1; }
   const char* c_str() const { return &buffer_[0]; }
@@ -152,59 +161,68 @@ class utf16_to_utf8 {
   // Performs conversion returning a system error code instead of
   // throwing exception on conversion error. This method may still throw
   // in case of memory allocation error.
-  FMT_API int convert(wstring_view s);
+  FMT_API int convert(basic_string_view<wchar_t> s);
 };
 
 FMT_API void format_windows_error(buffer<char>& out, int error_code,
-                                  string_view message) FMT_NOEXCEPT;
-}  // namespace detail
+                                  const char* message) FMT_NOEXCEPT;
+FMT_END_DETAIL_NAMESPACE
 
-/** A Windows error. */
-class windows_error : public system_error {
- private:
-  FMT_API void init(int error_code, string_view format_str, format_args args);
+FMT_API std::system_error vwindows_error(int error_code, string_view format_str,
+                                         format_args args);
 
- public:
-  /**
-   \rst
-   Constructs a :class:`fmt::windows_error` object with the description
-   of the form
+/**
+ \rst
+ Constructs a :class:`std::system_error` object with the description
+ of the form
 
-   .. parsed-literal::
-     *<message>*: *<system-message>*
+ .. parsed-literal::
+   *<message>*: *<system-message>*
 
-   where *<message>* is the formatted message and *<system-message>* is the
-   system message corresponding to the error code.
-   *error_code* is a Windows error code as given by ``GetLastError``.
-   If *error_code* is not a valid error code such as -1, the system message
-   will look like "error -1".
+ where *<message>* is the formatted message and *<system-message>* is the
+ system message corresponding to the error code.
+ *error_code* is a Windows error code as given by ``GetLastError``.
+ If *error_code* is not a valid error code such as -1, the system message
+ will look like "error -1".
 
-   **Example**::
+ **Example**::
 
-     // This throws a windows_error with the description
-     //   cannot open file 'madeup': The system cannot find the file specified.
-     // or similar (system message may vary).
-     const char *filename = "madeup";
-     LPOFSTRUCT of = LPOFSTRUCT();
-     HFILE file = OpenFile(filename, &of, OF_READ);
-     if (file == HFILE_ERROR) {
-       throw fmt::windows_error(GetLastError(),
-                                "cannot open file '{}'", filename);
-     }
-   \endrst
-  */
-  template <typename... Args>
-  windows_error(int error_code, string_view message, const Args&... args) {
-    init(error_code, message, make_format_args(args...));
-  }
-};
+   // This throws a system_error with the description
+   //   cannot open file 'madeup': The system cannot find the file specified.
+   // or similar (system message may vary).
+   const char *filename = "madeup";
+   LPOFSTRUCT of = LPOFSTRUCT();
+   HFILE file = OpenFile(filename, &of, OF_READ);
+   if (file == HFILE_ERROR) {
+     throw fmt::windows_error(GetLastError(),
+                              "cannot open file '{}'", filename);
+   }
+ \endrst
+*/
+template <typename... Args>
+std::system_error windows_error(int error_code, string_view message,
+                                const Args&... args) {
+  return vwindows_error(error_code, message, fmt::make_format_args(args...));
+}
 
 // Reports a Windows error without throwing an exception.
 // Can be used to report errors from destructors.
 FMT_API void report_windows_error(int error_code,
-                                  string_view message) FMT_NOEXCEPT;
+                                  const char* message) FMT_NOEXCEPT;
+#else
+inline const std::error_category& system_category() FMT_NOEXCEPT {
+  return std::system_category();
+}
 #endif  // _WIN32
 
+// std::system is not available on some platforms such as iOS (#2248).
+#ifdef __OSX__
+template <typename S, typename... Args, typename Char = char_t<S>>
+void say(const S& format_str, Args&&... args) {
+  std::system(format("say \"{}\"", format(format_str, args...)).c_str());
+}
+#endif
+
 // A buffered file.
 class buffered_file {
  private:
@@ -255,7 +273,7 @@ class buffered_file {
 
   template <typename... Args>
   inline void print(string_view format_str, const Args&... args) {
-    vprint(format_str, make_format_args(args...));
+    vprint(format_str, fmt::make_format_args(args...));
   }
 };
 
@@ -280,7 +298,8 @@ class file {
     WRONLY = FMT_POSIX(O_WRONLY),  // Open for writing only.
     RDWR = FMT_POSIX(O_RDWR),      // Open for reading and writing.
     CREATE = FMT_POSIX(O_CREAT),   // Create if the file doesn't exist.
-    APPEND = FMT_POSIX(O_APPEND)   // Open in append mode.
+    APPEND = FMT_POSIX(O_APPEND),  // Open in append mode.
+    TRUNC = FMT_POSIX(O_TRUNC)     // Truncate the content of the file.
   };
 
   // Constructs a file object which doesn't represent any file.
@@ -295,7 +314,8 @@ class file {
 
   file(file&& other) FMT_NOEXCEPT : fd_(other.fd_) { other.fd_ = -1; }
 
-  file& operator=(file&& other) FMT_NOEXCEPT {
+  // Move assignment is not noexcept because close may throw.
+  file& operator=(file&& other) {
     close();
     fd_ = other.fd_;
     other.fd_ = -1;
@@ -331,7 +351,7 @@ class file {
 
   // Makes fd be the copy of this file descriptor, closing fd first if
   // necessary.
-  FMT_API void dup2(int fd, error_code& ec) FMT_NOEXCEPT;
+  FMT_API void dup2(int fd, std::error_code& ec) FMT_NOEXCEPT;
 
   // Creates a pipe setting up read_end and write_end file objects for reading
   // and writing respectively.
@@ -345,9 +365,10 @@ class file {
 // Returns the memory page size.
 long getpagesize();
 
-namespace detail {
+FMT_BEGIN_DETAIL_NAMESPACE
 
 struct buffer_size {
+  buffer_size() = default;
   size_t value = 0;
   buffer_size operator=(size_t val) const {
     auto bs = buffer_size();
@@ -357,14 +378,14 @@ struct buffer_size {
 };
 
 struct ostream_params {
-  int oflag = file::WRONLY | file::CREATE;
+  int oflag = file::WRONLY | file::CREATE | file::TRUNC;
   size_t buffer_size = BUFSIZ > 32768 ? BUFSIZ : 32768;
 
   ostream_params() {}
 
   template <typename... T>
-  ostream_params(T... params, int oflag) : ostream_params(params...) {
-    this->oflag = oflag;
+  ostream_params(T... params, int new_oflag) : ostream_params(params...) {
+    oflag = new_oflag;
   }
 
   template <typename... T>
@@ -372,23 +393,27 @@ struct ostream_params {
       : ostream_params(params...) {
     this->buffer_size = bs.value;
   }
+
+// Intel has a bug that results in failure to deduce a constructor
+// for empty parameter packs.
+#  if defined(__INTEL_COMPILER) && __INTEL_COMPILER < 2000
+  ostream_params(int new_oflag) : oflag(new_oflag) {}
+  ostream_params(detail::buffer_size bs) : buffer_size(bs.value) {}
+#  endif
 };
-}  // namespace detail
 
-static detail::buffer_size buffer_size;
+FMT_END_DETAIL_NAMESPACE
 
-// A fast output stream which is not thread-safe.
-class ostream final : private detail::buffer<char> {
+// Added {} below to work around default constructor error known to
+// occur in Xcode versions 7.2.1 and 8.2.1.
+constexpr detail::buffer_size buffer_size{};
+
+/** A fast output stream which is not thread-safe. */
+class FMT_API ostream final : private detail::buffer<char> {
  private:
   file file_;
 
-  void flush() {
-    if (size() == 0) return;
-    file_.write(data(), size());
-    clear();
-  }
-
-  FMT_API void grow(size_t) override final;
+  void grow(size_t) override;
 
   ostream(cstring_view path, const detail::ostream_params& params)
       : file_(path, params.oflag) {
@@ -399,6 +424,7 @@ class ostream final : private detail::buffer<char> {
   ostream(ostream&& other)
       : detail::buffer<char>(other.data(), other.size(), other.capacity()),
         file_(std::move(other.file_)) {
+    other.clear();
     other.set(nullptr, 0);
   }
   ~ostream() {
@@ -406,6 +432,12 @@ class ostream final : private detail::buffer<char> {
     delete[] data();
   }
 
+  void flush() {
+    if (size() == 0) return;
+    file_.write(data(), size());
+    clear();
+  }
+
   template <typename... T>
   friend ostream output_file(cstring_view path, T... params);
 
@@ -414,16 +446,30 @@ class ostream final : private detail::buffer<char> {
     file_.close();
   }
 
-  template <typename S, typename... Args>
-  void print(const S& format_str, const Args&... args) {
-    format_to(detail::buffer_appender<char>(*this), format_str, args...);
+  /**
+    Formats ``args`` according to specifications in ``fmt`` and writes the
+    output to the file.
+   */
+  template <typename... T> void print(format_string<T...> fmt, T&&... args) {
+    vformat_to(detail::buffer_appender<char>(*this), fmt,
+               fmt::make_format_args(args...));
   }
 };
 
 /**
-  Opens a file for writing. Supported parameters passed in `params`:
-  * ``<integer>``: Output flags (``file::WRONLY | file::CREATE`` by default)
+  \rst
+  Opens a file for writing. Supported parameters passed in *params*:
+
+  * ``<integer>``: Flags passed to `open
+    <https://pubs.opengroup.org/onlinepubs/007904875/functions/open.html>`_
+    (``file::WRONLY | file::CREATE`` by default)
   * ``buffer_size=<integer>``: Output buffer size
+
+  **Example**::
+
+    auto out = fmt::output_file("guide.txt");
+    out.print("Don't {}", "Panic");
+  \endrst
  */
 template <typename... T>
 inline ostream output_file(cstring_view path, T... params) {
@@ -466,7 +512,7 @@ class locale {
 
   // Converts string to floating-point number and advances str past the end
   // of the parsed input.
-  double strtod(const char*& str) const {
+  FMT_DEPRECATED double strtod(const char*& str) const {
     char* end = nullptr;
     double result = strtod_l(str, &end, locale_);
     str = end;
@@ -475,6 +521,7 @@ class locale {
 };
 using Locale FMT_DEPRECATED_ALIAS = locale;
 #endif  // FMT_LOCALE
+FMT_MODULE_EXPORT_END
 FMT_END_NAMESPACE
 
 #endif  // FMT_OS_H_
diff --git a/src/fmt/ostream.h b/src/fmt/ostream.h
index 29c58ec13b..3d716ece84 100644
--- a/src/fmt/ostream.h
+++ b/src/fmt/ostream.h
@@ -14,81 +14,44 @@
 
 FMT_BEGIN_NAMESPACE
 
-template <typename Char> class basic_printf_parse_context;
 template <typename OutputIt, typename Char> class basic_printf_context;
 
 namespace detail {
 
-template <class Char> class formatbuf : public std::basic_streambuf<Char> {
- private:
-  using int_type = typename std::basic_streambuf<Char>::int_type;
-  using traits_type = typename std::basic_streambuf<Char>::traits_type;
-
-  buffer<Char>& buffer_;
-
- public:
-  formatbuf(buffer<Char>& buf) : buffer_(buf) {}
-
- protected:
-  // The put-area is actually always empty. This makes the implementation
-  // simpler and has the advantage that the streambuf and the buffer are always
-  // in sync and sputc never writes into uninitialized memory. The obvious
-  // disadvantage is that each call to sputc always results in a (virtual) call
-  // to overflow. There is no disadvantage here for sputn since this always
-  // results in a call to xsputn.
-
-  int_type overflow(int_type ch = traits_type::eof()) FMT_OVERRIDE {
-    if (!traits_type::eq_int_type(ch, traits_type::eof()))
-      buffer_.push_back(static_cast<Char>(ch));
-    return ch;
-  }
-
-  std::streamsize xsputn(const Char* s, std::streamsize count) FMT_OVERRIDE {
-    buffer_.append(s, s + count);
-    return count;
-  }
-};
-
-struct converter {
-  template <typename T, FMT_ENABLE_IF(is_integral<T>::value)> converter(T);
-};
-
-template <typename Char> struct test_stream : std::basic_ostream<Char> {
- private:
-  void_t<> operator<<(converter);
-};
-
-// Hide insertion operators for built-in types.
-template <typename Char, typename Traits>
-void_t<> operator<<(std::basic_ostream<Char, Traits>&, Char);
-template <typename Char, typename Traits>
-void_t<> operator<<(std::basic_ostream<Char, Traits>&, char);
-template <typename Traits>
-void_t<> operator<<(std::basic_ostream<char, Traits>&, char);
-template <typename Traits>
-void_t<> operator<<(std::basic_ostream<char, Traits>&, signed char);
-template <typename Traits>
-void_t<> operator<<(std::basic_ostream<char, Traits>&, unsigned char);
-
-// Checks if T has a user-defined operator<< (e.g. not a member of
-// std::ostream).
-template <typename T, typename Char> class is_streamable {
+// Checks if T has a user-defined operator<<.
+template <typename T, typename Char, typename Enable = void>
+class is_streamable {
  private:
   template <typename U>
-  static bool_constant<!std::is_same<decltype(std::declval<test_stream<Char>&>()
-                                              << std::declval<U>()),
-                                     void_t<>>::value>
-  test(int);
+  static auto test(int)
+      -> bool_constant<sizeof(std::declval<std::basic_ostream<Char>&>()
+                              << std::declval<U>()) != 0>;
 
-  template <typename> static std::false_type test(...);
+  template <typename> static auto test(...) -> std::false_type;
 
   using result = decltype(test<T>(0));
 
  public:
+  is_streamable() = default;
+
   static const bool value = result::value;
 };
 
+// Formatting of built-in types and arrays is intentionally disabled because
+// it's handled by standard (non-ostream) formatters.
+template <typename T, typename Char>
+struct is_streamable<
+    T, Char,
+    enable_if_t<
+        std::is_arithmetic<T>::value || std::is_array<T>::value ||
+        std::is_pointer<T>::value || std::is_same<T, char8_type>::value ||
+        std::is_same<T, std::basic_string<Char>>::value ||
+        std::is_same<T, std_string_view<Char>>::value ||
+        (std::is_convertible<T, int>::value && !std::is_enum<T>::value)>>
+    : std::false_type {};
+
 // Write the content of buf to os.
+// It is a separate function rather than a part of vprint to simplify testing.
 template <typename Char>
 void write_buffer(std::basic_ostream<Char>& os, buffer<Char>& buf) {
   const Char* buf_data = buf.data();
@@ -106,8 +69,8 @@ void write_buffer(std::basic_ostream<Char>& os, buffer<Char>& buf) {
 template <typename Char, typename T>
 void format_value(buffer<Char>& buf, const T& value,
                   locale_ref loc = locale_ref()) {
-  formatbuf<Char> format_buf(buf);
-  std::basic_ostream<Char> output(&format_buf);
+  auto&& format_buf = formatbuf<std::basic_streambuf<Char>>(buf);
+  auto&& output = std::basic_ostream<Char>(&format_buf);
 #if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
   if (loc) output.imbue(loc.get<std::locale>());
 #endif
@@ -120,39 +83,33 @@ void format_value(buffer<Char>& buf, const T& value,
 template <typename T, typename Char>
 struct fallback_formatter<T, Char, enable_if_t<is_streamable<T, Char>::value>>
     : private formatter<basic_string_view<Char>, Char> {
-  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
-      -> decltype(ctx.begin()) {
-    return formatter<basic_string_view<Char>, Char>::parse(ctx);
-  }
-  template <typename ParseCtx,
-            FMT_ENABLE_IF(std::is_same<
-                          ParseCtx, basic_printf_parse_context<Char>>::value)>
-  auto parse(ParseCtx& ctx) -> decltype(ctx.begin()) {
-    return ctx.begin();
-  }
+  using formatter<basic_string_view<Char>, Char>::parse;
 
   template <typename OutputIt>
   auto format(const T& value, basic_format_context<OutputIt, Char>& ctx)
       -> OutputIt {
-    basic_memory_buffer<Char> buffer;
+    auto buffer = basic_memory_buffer<Char>();
     format_value(buffer, value, ctx.locale());
-    basic_string_view<Char> str(buffer.data(), buffer.size());
-    return formatter<basic_string_view<Char>, Char>::format(str, ctx);
+    return formatter<basic_string_view<Char>, Char>::format(
+        {buffer.data(), buffer.size()}, ctx);
   }
+
+  // DEPRECATED!
   template <typename OutputIt>
   auto format(const T& value, basic_printf_context<OutputIt, Char>& ctx)
       -> OutputIt {
-    basic_memory_buffer<Char> buffer;
+    auto buffer = basic_memory_buffer<Char>();
     format_value(buffer, value, ctx.locale());
     return std::copy(buffer.begin(), buffer.end(), ctx.out());
   }
 };
 }  // namespace detail
 
+FMT_MODULE_EXPORT
 template <typename Char>
 void vprint(std::basic_ostream<Char>& os, basic_string_view<Char> format_str,
             basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  basic_memory_buffer<Char> buffer;
+  auto buffer = basic_memory_buffer<Char>();
   detail::vformat_to(buffer, format_str, args);
   detail::write_buffer(os, buffer);
 }
@@ -166,6 +123,7 @@ void vprint(std::basic_ostream<Char>& os, basic_string_view<Char> format_str,
     fmt::print(cerr, "Don't {}!", "panic");
   \endrst
  */
+FMT_MODULE_EXPORT
 template <typename S, typename... Args,
           typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
 void print(std::basic_ostream<Char>& os, const S& format_str, Args&&... args) {
diff --git a/src/fmt/printf.h b/src/fmt/printf.h
index 8c28ac2327..19d550f6cf 100644
--- a/src/fmt/printf.h
+++ b/src/fmt/printf.h
@@ -10,11 +10,54 @@
 
 #include <algorithm>  // std::max
 #include <limits>     // std::numeric_limits
+#include <ostream>
 
-#include "ostream.h"
+#include "format.h"
 
 FMT_BEGIN_NAMESPACE
-namespace detail {
+FMT_MODULE_EXPORT_BEGIN
+
+template <typename T> struct printf_formatter { printf_formatter() = delete; };
+
+template <typename Char>
+class basic_printf_parse_context : public basic_format_parse_context<Char> {
+  using basic_format_parse_context<Char>::basic_format_parse_context;
+};
+
+template <typename OutputIt, typename Char> class basic_printf_context {
+ private:
+  OutputIt out_;
+  basic_format_args<basic_printf_context> args_;
+
+ public:
+  using char_type = Char;
+  using format_arg = basic_format_arg<basic_printf_context>;
+  using parse_context_type = basic_printf_parse_context<Char>;
+  template <typename T> using formatter_type = printf_formatter<T>;
+
+  /**
+    \rst
+    Constructs a ``printf_context`` object. References to the arguments are
+    stored in the context object so make sure they have appropriate lifetimes.
+    \endrst
+   */
+  basic_printf_context(OutputIt out,
+                       basic_format_args<basic_printf_context> args)
+      : out_(out), args_(args) {}
+
+  OutputIt out() { return out_; }
+  void advance_to(OutputIt it) { out_ = it; }
+
+  detail::locale_ref locale() { return {}; }
+
+  format_arg arg(int id) const { return args_.get(id); }
+
+  FMT_CONSTEXPR void on_error(const char* message) {
+    detail::error_handler().on_error(message);
+  }
+};
+
+FMT_BEGIN_DETAIL_NAMESPACE
 
 // Checks if a value fits in int - used to avoid warnings about comparing
 // signed and unsigned integers.
@@ -178,81 +221,38 @@ template <typename Char> class printf_width_handler {
   }
 };
 
-template <typename Char, typename Context>
-void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
-             basic_format_args<Context> args) {
-  Context(buffer_appender<Char>(buf), format, args).format();
-}
-}  // namespace detail
-
-// For printing into memory_buffer.
-template <typename Char, typename Context>
-FMT_DEPRECATED void printf(detail::buffer<Char>& buf,
-                           basic_string_view<Char> format,
-                           basic_format_args<Context> args) {
-  return detail::vprintf(buf, format, args);
-}
-using detail::vprintf;
-
-template <typename Char>
-class basic_printf_parse_context : public basic_format_parse_context<Char> {
-  using basic_format_parse_context<Char>::basic_format_parse_context;
-};
-template <typename OutputIt, typename Char> class basic_printf_context;
-
-/**
-  \rst
-  The ``printf`` argument formatter.
-  \endrst
- */
+// The ``printf`` argument formatter.
 template <typename OutputIt, typename Char>
-class printf_arg_formatter : public detail::arg_formatter_base<OutputIt, Char> {
- public:
-  using iterator = OutputIt;
-
+class printf_arg_formatter : public arg_formatter<Char> {
  private:
-  using char_type = Char;
-  using base = detail::arg_formatter_base<OutputIt, Char>;
+  using base = arg_formatter<Char>;
   using context_type = basic_printf_context<OutputIt, Char>;
+  using format_specs = basic_format_specs<Char>;
 
   context_type& context_;
 
-  void write_null_pointer(char) {
-    this->specs()->type = 0;
-    this->write("(nil)");
-  }
-
-  void write_null_pointer(wchar_t) {
-    this->specs()->type = 0;
-    this->write(L"(nil)");
+  OutputIt write_null_pointer(bool is_string = false) {
+    auto s = this->specs;
+    s.type = presentation_type::none;
+    return write_bytes(this->out, is_string ? "(null)" : "(nil)", s);
   }
 
  public:
-  using format_specs = typename base::format_specs;
+  printf_arg_formatter(OutputIt iter, format_specs& s, context_type& ctx)
+      : base{iter, s, locale_ref()}, context_(ctx) {}
 
-  /**
-    \rst
-    Constructs an argument formatter object.
-    *buffer* is a reference to the output buffer and *specs* contains format
-    specifier information for standard argument types.
-    \endrst
-   */
-  printf_arg_formatter(iterator iter, format_specs& specs, context_type& ctx)
-      : base(iter, &specs, detail::locale_ref()), context_(ctx) {}
+  OutputIt operator()(monostate value) { return base::operator()(value); }
 
-  template <typename T, FMT_ENABLE_IF(fmt::detail::is_integral<T>::value)>
-  iterator operator()(T value) {
-    // MSVC2013 fails to compile separate overloads for bool and char_type so
-    // use std::is_same instead.
-    if (std::is_same<T, bool>::value) {
-      format_specs& fmt_specs = *this->specs();
-      if (fmt_specs.type != 's') return base::operator()(value ? 1 : 0);
-      fmt_specs.type = 0;
-      this->write(value != 0);
-    } else if (std::is_same<T, char_type>::value) {
-      format_specs& fmt_specs = *this->specs();
-      if (fmt_specs.type && fmt_specs.type != 'c')
+  template <typename T, FMT_ENABLE_IF(detail::is_integral<T>::value)>
+  OutputIt operator()(T value) {
+    // MSVC2013 fails to compile separate overloads for bool and Char so use
+    // std::is_same instead.
+    if (std::is_same<T, Char>::value) {
+      format_specs fmt_specs = this->specs;
+      if (fmt_specs.type != presentation_type::none &&
+          fmt_specs.type != presentation_type::chr) {
         return (*this)(static_cast<int>(value));
+      }
       fmt_specs.sign = sign::none;
       fmt_specs.alt = false;
       fmt_specs.fill[0] = ' ';  // Ignore '0' flag for char types.
@@ -260,138 +260,49 @@ class printf_arg_formatter : public detail::arg_formatter_base<OutputIt, Char> {
       // ignored for non-numeric types
       if (fmt_specs.align == align::none || fmt_specs.align == align::numeric)
         fmt_specs.align = align::right;
-      return base::operator()(value);
-    } else {
-      return base::operator()(value);
+      return write<Char>(this->out, static_cast<Char>(value), fmt_specs);
     }
-    return this->out();
+    return base::operator()(value);
   }
 
   template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
-  iterator operator()(T value) {
+  OutputIt operator()(T value) {
     return base::operator()(value);
   }
 
   /** Formats a null-terminated C string. */
-  iterator operator()(const char* value) {
-    if (value)
-      base::operator()(value);
-    else if (this->specs()->type == 'p')
-      write_null_pointer(char_type());
-    else
-      this->write("(null)");
-    return this->out();
+  OutputIt operator()(const char* value) {
+    if (value) return base::operator()(value);
+    return write_null_pointer(this->specs.type != presentation_type::pointer);
   }
 
   /** Formats a null-terminated wide C string. */
-  iterator operator()(const wchar_t* value) {
-    if (value)
-      base::operator()(value);
-    else if (this->specs()->type == 'p')
-      write_null_pointer(char_type());
-    else
-      this->write(L"(null)");
-    return this->out();
+  OutputIt operator()(const wchar_t* value) {
+    if (value) return base::operator()(value);
+    return write_null_pointer(this->specs.type != presentation_type::pointer);
   }
 
-  iterator operator()(basic_string_view<char_type> value) {
+  OutputIt operator()(basic_string_view<Char> value) {
     return base::operator()(value);
   }
 
-  iterator operator()(monostate value) { return base::operator()(value); }
-
   /** Formats a pointer. */
-  iterator operator()(const void* value) {
-    if (value) return base::operator()(value);
-    this->specs()->type = 0;
-    write_null_pointer(char_type());
-    return this->out();
+  OutputIt operator()(const void* value) {
+    return value ? base::operator()(value) : write_null_pointer();
   }
 
   /** Formats an argument of a custom (user-defined) type. */
-  iterator operator()(typename basic_format_arg<context_type>::handle handle) {
-    handle.format(context_.parse_context(), context_);
-    return this->out();
+  OutputIt operator()(typename basic_format_arg<context_type>::handle handle) {
+    auto parse_ctx =
+        basic_printf_parse_context<Char>(basic_string_view<Char>());
+    handle.format(parse_ctx, context_);
+    return this->out;
   }
 };
 
-template <typename T> struct printf_formatter {
-  printf_formatter() = delete;
-
-  template <typename ParseContext>
-  auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    return ctx.begin();
-  }
-
-  template <typename FormatContext>
-  auto format(const T& value, FormatContext& ctx) -> decltype(ctx.out()) {
-    detail::format_value(detail::get_container(ctx.out()), value);
-    return ctx.out();
-  }
-};
-
-/**
- This template formats data and writes the output through an output iterator.
- */
-template <typename OutputIt, typename Char> class basic_printf_context {
- public:
-  /** The character type for the output. */
-  using char_type = Char;
-  using iterator = OutputIt;
-  using format_arg = basic_format_arg<basic_printf_context>;
-  using parse_context_type = basic_printf_parse_context<Char>;
-  template <typename T> using formatter_type = printf_formatter<T>;
-
- private:
-  using format_specs = basic_format_specs<char_type>;
-
-  OutputIt out_;
-  basic_format_args<basic_printf_context> args_;
-  parse_context_type parse_ctx_;
-
-  static void parse_flags(format_specs& specs, const Char*& it,
-                          const Char* end);
-
-  // Returns the argument with specified index or, if arg_index is -1, the next
-  // argument.
-  format_arg get_arg(int arg_index = -1);
-
-  // Parses argument index, flags and width and returns the argument index.
-  int parse_header(const Char*& it, const Char* end, format_specs& specs);
-
- public:
-  /**
-   \rst
-   Constructs a ``printf_context`` object. References to the arguments are
-   stored in the context object so make sure they have appropriate lifetimes.
-   \endrst
-   */
-  basic_printf_context(OutputIt out, basic_string_view<char_type> format_str,
-                       basic_format_args<basic_printf_context> args)
-      : out_(out), args_(args), parse_ctx_(format_str) {}
-
-  OutputIt out() { return out_; }
-  void advance_to(OutputIt it) { out_ = it; }
-
-  detail::locale_ref locale() { return {}; }
-
-  format_arg arg(int id) const { return args_.get(id); }
-
-  parse_context_type& parse_context() { return parse_ctx_; }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    parse_ctx_.on_error(message);
-  }
-
-  /** Formats stored arguments and writes the output to the range. */
-  template <typename ArgFormatter = printf_arg_formatter<OutputIt, Char>>
-  OutputIt format();
-};
-
-template <typename OutputIt, typename Char>
-void basic_printf_context<OutputIt, Char>::parse_flags(format_specs& specs,
-                                                       const Char*& it,
-                                                       const Char* end) {
+template <typename Char>
+void parse_flags(basic_format_specs<Char>& specs, const Char*& it,
+                 const Char* end) {
   for (; it != end; ++it) {
     switch (*it) {
     case '-':
@@ -417,35 +328,24 @@ void basic_printf_context<OutputIt, Char>::parse_flags(format_specs& specs,
   }
 }
 
-template <typename OutputIt, typename Char>
-typename basic_printf_context<OutputIt, Char>::format_arg
-basic_printf_context<OutputIt, Char>::get_arg(int arg_index) {
-  if (arg_index < 0)
-    arg_index = parse_ctx_.next_arg_id();
-  else
-    parse_ctx_.check_arg_id(--arg_index);
-  return detail::get_arg(*this, arg_index);
-}
-
-template <typename OutputIt, typename Char>
-int basic_printf_context<OutputIt, Char>::parse_header(const Char*& it,
-                                                       const Char* end,
-                                                       format_specs& specs) {
+template <typename Char, typename GetArg>
+int parse_header(const Char*& it, const Char* end,
+                 basic_format_specs<Char>& specs, GetArg get_arg) {
   int arg_index = -1;
-  char_type c = *it;
+  Char c = *it;
   if (c >= '0' && c <= '9') {
     // Parse an argument index (if followed by '$') or a width possibly
     // preceded with '0' flag(s).
-    detail::error_handler eh;
-    int value = parse_nonnegative_int(it, end, eh);
+    int value = parse_nonnegative_int(it, end, -1);
     if (it != end && *it == '$') {  // value is an argument index
       ++it;
-      arg_index = value;
+      arg_index = value != -1 ? value : max_value<int>();
     } else {
       if (c == '0') specs.fill[0] = '0';
       if (value != 0) {
         // Nonzero value means that we parsed width and don't need to
         // parse it or flags again, so return now.
+        if (value == -1) FMT_THROW(format_error("number is too big"));
         specs.width = value;
         return arg_index;
       }
@@ -455,58 +355,76 @@ int basic_printf_context<OutputIt, Char>::parse_header(const Char*& it,
   // Parse width.
   if (it != end) {
     if (*it >= '0' && *it <= '9') {
-      detail::error_handler eh;
-      specs.width = parse_nonnegative_int(it, end, eh);
+      specs.width = parse_nonnegative_int(it, end, -1);
+      if (specs.width == -1) FMT_THROW(format_error("number is too big"));
     } else if (*it == '*') {
       ++it;
       specs.width = static_cast<int>(visit_format_arg(
-          detail::printf_width_handler<char_type>(specs), get_arg()));
+          detail::printf_width_handler<Char>(specs), get_arg(-1)));
     }
   }
   return arg_index;
 }
 
-template <typename OutputIt, typename Char>
-template <typename ArgFormatter>
-OutputIt basic_printf_context<OutputIt, Char>::format() {
-  auto out = this->out();
-  const Char* start = parse_ctx_.begin();
-  const Char* end = parse_ctx_.end();
+template <typename Char, typename Context>
+void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
+             basic_format_args<Context> args) {
+  using OutputIt = buffer_appender<Char>;
+  auto out = OutputIt(buf);
+  auto context = basic_printf_context<OutputIt, Char>(out, args);
+  auto parse_ctx = basic_printf_parse_context<Char>(format);
+
+  // Returns the argument with specified index or, if arg_index is -1, the next
+  // argument.
+  auto get_arg = [&](int arg_index) {
+    if (arg_index < 0)
+      arg_index = parse_ctx.next_arg_id();
+    else
+      parse_ctx.check_arg_id(--arg_index);
+    return detail::get_arg(context, arg_index);
+  };
+
+  const Char* start = parse_ctx.begin();
+  const Char* end = parse_ctx.end();
   auto it = start;
   while (it != end) {
-    char_type c = *it++;
-    if (c != '%') continue;
+    if (!detail::find<false, Char>(it, end, '%', it)) {
+      it = end;  // detail::find leaves it == nullptr if it doesn't find '%'
+      break;
+    }
+    Char c = *it++;
     if (it != end && *it == c) {
-      out = std::copy(start, it, out);
+      out = detail::write(
+          out, basic_string_view<Char>(start, detail::to_unsigned(it - start)));
       start = ++it;
       continue;
     }
-    out = std::copy(start, it - 1, out);
+    out = detail::write(out, basic_string_view<Char>(
+                                 start, detail::to_unsigned(it - 1 - start)));
 
-    format_specs specs;
+    basic_format_specs<Char> specs;
     specs.align = align::right;
 
     // Parse argument index, flags and width.
-    int arg_index = parse_header(it, end, specs);
-    if (arg_index == 0) on_error("argument not found");
+    int arg_index = parse_header(it, end, specs, get_arg);
+    if (arg_index == 0) parse_ctx.on_error("argument not found");
 
     // Parse precision.
     if (it != end && *it == '.') {
       ++it;
       c = it != end ? *it : 0;
       if ('0' <= c && c <= '9') {
-        detail::error_handler eh;
-        specs.precision = parse_nonnegative_int(it, end, eh);
+        specs.precision = parse_nonnegative_int(it, end, 0);
       } else if (c == '*') {
         ++it;
         specs.precision = static_cast<int>(
-            visit_format_arg(detail::printf_precision_handler(), get_arg()));
+            visit_format_arg(detail::printf_precision_handler(), get_arg(-1)));
       } else {
         specs.precision = 0;
       }
     }
 
-    format_arg arg = get_arg(arg_index);
+    auto arg = get_arg(arg_index);
     // For d, i, o, u, x, and X conversion specifiers, if a precision is
     // specified, the '0' flag is ignored
     if (specs.precision >= 0 && arg.is_integral())
@@ -516,9 +434,10 @@ OutputIt basic_printf_context<OutputIt, Char>::format() {
       auto str = visit_format_arg(detail::get_cstring<Char>(), arg);
       auto str_end = str + specs.precision;
       auto nul = std::find(str, str_end, Char());
-      arg = detail::make_arg<basic_printf_context>(basic_string_view<Char>(
-          str,
-          detail::to_unsigned(nul != str_end ? nul - str : specs.precision)));
+      arg = detail::make_arg<basic_printf_context<OutputIt, Char>>(
+          basic_string_view<Char>(
+              str, detail::to_unsigned(nul != str_end ? nul - str
+                                                      : specs.precision)));
     }
     if (specs.alt && visit_format_arg(detail::is_zero_int(), arg))
       specs.alt = false;
@@ -532,7 +451,7 @@ OutputIt basic_printf_context<OutputIt, Char>::format() {
 
     // Parse length and convert the argument to the required type.
     c = it != end ? *it++ : 0;
-    char_type t = it != end ? *it : 0;
+    Char t = it != end ? *it : 0;
     using detail::convert_arg;
     switch (c) {
     case 'h':
@@ -573,28 +492,34 @@ OutputIt basic_printf_context<OutputIt, Char>::format() {
 
     // Parse type.
     if (it == end) FMT_THROW(format_error("invalid format string"));
-    specs.type = static_cast<char>(*it++);
+    char type = static_cast<char>(*it++);
     if (arg.is_integral()) {
       // Normalize type.
-      switch (specs.type) {
+      switch (type) {
       case 'i':
       case 'u':
-        specs.type = 'd';
+        type = 'd';
         break;
       case 'c':
-        visit_format_arg(detail::char_converter<basic_printf_context>(arg),
-                         arg);
+        visit_format_arg(
+            detail::char_converter<basic_printf_context<OutputIt, Char>>(arg),
+            arg);
         break;
       }
     }
+    specs.type = parse_presentation_type(type);
+    if (specs.type == presentation_type::none)
+      parse_ctx.on_error("invalid type specifier");
 
     start = it;
 
     // Format argument.
-    out = visit_format_arg(ArgFormatter(out, specs, *this), arg);
+    out = visit_format_arg(
+        detail::printf_arg_formatter<OutputIt, Char>(out, specs, context), arg);
   }
-  return std::copy(start, it, out);
+  detail::write(out, basic_string_view<Char>(start, to_unsigned(it - start)));
 }
+FMT_END_DETAIL_NAMESPACE
 
 template <typename Char>
 using basic_printf_context_t =
@@ -612,9 +537,9 @@ using wprintf_args = basic_format_args<wprintf_context>;
   arguments and can be implicitly converted to `~fmt::printf_args`.
   \endrst
  */
-template <typename... Args>
-inline format_arg_store<printf_context, Args...> make_printf_args(
-    const Args&... args) {
+template <typename... T>
+inline auto make_printf_args(const T&... args)
+    -> format_arg_store<printf_context, T...> {
   return {args...};
 }
 
@@ -624,18 +549,19 @@ inline format_arg_store<printf_context, Args...> make_printf_args(
   arguments and can be implicitly converted to `~fmt::wprintf_args`.
   \endrst
  */
-template <typename... Args>
-inline format_arg_store<wprintf_context, Args...> make_wprintf_args(
-    const Args&... args) {
+template <typename... T>
+inline auto make_wprintf_args(const T&... args)
+    -> format_arg_store<wprintf_context, T...> {
   return {args...};
 }
 
 template <typename S, typename Char = char_t<S>>
-inline std::basic_string<Char> vsprintf(
-    const S& format,
-    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args) {
+inline auto vsprintf(
+    const S& fmt,
+    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args)
+    -> std::basic_string<Char> {
   basic_memory_buffer<Char> buffer;
-  vprintf(buffer, to_string_view(format), args);
+  vprintf(buffer, to_string_view(fmt), args);
   return to_string(buffer);
 }
 
@@ -648,19 +574,20 @@ inline std::basic_string<Char> vsprintf(
     std::string message = fmt::sprintf("The answer is %d", 42);
   \endrst
 */
-template <typename S, typename... Args,
+template <typename S, typename... T,
           typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
-inline std::basic_string<Char> sprintf(const S& format, const Args&... args) {
+inline auto sprintf(const S& fmt, const T&... args) -> std::basic_string<Char> {
   using context = basic_printf_context_t<Char>;
-  return vsprintf(to_string_view(format), make_format_args<context>(args...));
+  return vsprintf(to_string_view(fmt), fmt::make_format_args<context>(args...));
 }
 
 template <typename S, typename Char = char_t<S>>
-inline int vfprintf(
-    std::FILE* f, const S& format,
-    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args) {
+inline auto vfprintf(
+    std::FILE* f, const S& fmt,
+    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args)
+    -> int {
   basic_memory_buffer<Char> buffer;
-  vprintf(buffer, to_string_view(format), args);
+  vprintf(buffer, to_string_view(fmt), args);
   size_t size = buffer.size();
   return std::fwrite(buffer.data(), sizeof(Char), size, f) < size
              ? -1
@@ -676,19 +603,19 @@ inline int vfprintf(
     fmt::fprintf(stderr, "Don't %s!", "panic");
   \endrst
  */
-template <typename S, typename... Args,
-          typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
-inline int fprintf(std::FILE* f, const S& format, const Args&... args) {
+template <typename S, typename... T, typename Char = char_t<S>>
+inline auto fprintf(std::FILE* f, const S& fmt, const T&... args) -> int {
   using context = basic_printf_context_t<Char>;
-  return vfprintf(f, to_string_view(format),
-                  make_format_args<context>(args...));
+  return vfprintf(f, to_string_view(fmt),
+                  fmt::make_format_args<context>(args...));
 }
 
 template <typename S, typename Char = char_t<S>>
-inline int vprintf(
-    const S& format,
-    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args) {
-  return vfprintf(stdout, to_string_view(format), args);
+inline auto vprintf(
+    const S& fmt,
+    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args)
+    -> int {
+  return vfprintf(stdout, to_string_view(fmt), args);
 }
 
 /**
@@ -700,52 +627,31 @@ inline int vprintf(
     fmt::printf("Elapsed time: %.2f seconds", 1.23);
   \endrst
  */
-template <typename S, typename... Args,
-          FMT_ENABLE_IF(detail::is_string<S>::value)>
-inline int printf(const S& format_str, const Args&... args) {
-  using context = basic_printf_context_t<char_t<S>>;
-  return vprintf(to_string_view(format_str),
-                 make_format_args<context>(args...));
+template <typename S, typename... T, FMT_ENABLE_IF(detail::is_string<S>::value)>
+inline auto printf(const S& fmt, const T&... args) -> int {
+  return vprintf(
+      to_string_view(fmt),
+      fmt::make_format_args<basic_printf_context_t<char_t<S>>>(args...));
 }
 
 template <typename S, typename Char = char_t<S>>
-inline int vfprintf(
-    std::basic_ostream<Char>& os, const S& format,
-    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args) {
+FMT_DEPRECATED auto vfprintf(
+    std::basic_ostream<Char>& os, const S& fmt,
+    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args)
+    -> int {
   basic_memory_buffer<Char> buffer;
-  vprintf(buffer, to_string_view(format), args);
-  detail::write_buffer(os, buffer);
+  vprintf(buffer, to_string_view(fmt), args);
+  os.write(buffer.data(), static_cast<std::streamsize>(buffer.size()));
   return static_cast<int>(buffer.size());
 }
-
-/** Formats arguments and writes the output to the range. */
-template <typename ArgFormatter, typename Char,
-          typename Context =
-              basic_printf_context<typename ArgFormatter::iterator, Char>>
-typename ArgFormatter::iterator vprintf(
-    detail::buffer<Char>& out, basic_string_view<Char> format_str,
-    basic_format_args<type_identity_t<Context>> args) {
-  typename ArgFormatter::iterator iter(out);
-  Context(iter, format_str, args).template format<ArgFormatter>();
-  return iter;
+template <typename S, typename... T, typename Char = char_t<S>>
+FMT_DEPRECATED auto fprintf(std::basic_ostream<Char>& os, const S& fmt,
+                            const T&... args) -> int {
+  return vfprintf(os, to_string_view(fmt),
+                  fmt::make_format_args<basic_printf_context_t<Char>>(args...));
 }
 
-/**
-  \rst
-  Prints formatted data to the stream *os*.
-
-  **Example**::
-
-    fmt::fprintf(cerr, "Don't %s!", "panic");
-  \endrst
- */
-template <typename S, typename... Args, typename Char = char_t<S>>
-inline int fprintf(std::basic_ostream<Char>& os, const S& format_str,
-                   const Args&... args) {
-  using context = basic_printf_context_t<Char>;
-  return vfprintf(os, to_string_view(format_str),
-                  make_format_args<context>(args...));
-}
+FMT_MODULE_EXPORT_END
 FMT_END_NAMESPACE
 
 #endif  // FMT_PRINTF_H_
diff --git a/src/fmt/ranges.h b/src/fmt/ranges.h
index 632f04949c..eb9fb8a92d 100644
--- a/src/fmt/ranges.h
+++ b/src/fmt/ranges.h
@@ -13,47 +13,13 @@
 #define FMT_RANGES_H_
 
 #include <initializer_list>
+#include <tuple>
 #include <type_traits>
 
 #include "format.h"
 
-// output only up to N items from the range.
-#ifndef FMT_RANGE_OUTPUT_LENGTH_LIMIT
-#  define FMT_RANGE_OUTPUT_LENGTH_LIMIT 256
-#endif
-
 FMT_BEGIN_NAMESPACE
 
-template <typename Char> struct formatting_base {
-  template <typename ParseContext>
-  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    return ctx.begin();
-  }
-};
-
-template <typename Char, typename Enable = void>
-struct formatting_range : formatting_base<Char> {
-  static FMT_CONSTEXPR_DECL const size_t range_length_limit =
-      FMT_RANGE_OUTPUT_LENGTH_LIMIT;  // output only up to N items from the
-                                      // range.
-  Char prefix;
-  Char delimiter;
-  Char postfix;
-  formatting_range() : prefix('{'), delimiter(','), postfix('}') {}
-  static FMT_CONSTEXPR_DECL const bool add_delimiter_spaces = true;
-  static FMT_CONSTEXPR_DECL const bool add_prepostfix_space = false;
-};
-
-template <typename Char, typename Enable = void>
-struct formatting_tuple : formatting_base<Char> {
-  Char prefix;
-  Char delimiter;
-  Char postfix;
-  formatting_tuple() : prefix('('), delimiter(','), postfix(')') {}
-  static FMT_CONSTEXPR_DECL const bool add_delimiter_spaces = true;
-  static FMT_CONSTEXPR_DECL const bool add_prepostfix_space = false;
-};
-
 namespace detail {
 
 template <typename RangeT, typename OutputIterator>
@@ -75,8 +41,14 @@ OutputIterator copy(char ch, OutputIterator out) {
   return out;
 }
 
-/// Return true value if T has std::string interface, like std::string_view.
-template <typename T> class is_like_std_string {
+template <typename OutputIterator>
+OutputIterator copy(wchar_t ch, OutputIterator out) {
+  *out++ = ch;
+  return out;
+}
+
+// Returns true if T has a std::string-like interface, like std::string_view.
+template <typename T> class is_std_string_like {
   template <typename U>
   static auto check(U* p)
       -> decltype((void)p->find('a'), p->length(), (void)p->data(), int());
@@ -84,26 +56,118 @@ template <typename T> class is_like_std_string {
 
  public:
   static FMT_CONSTEXPR_DECL const bool value =
-      is_string<T>::value || !std::is_void<decltype(check<T>(nullptr))>::value;
+      is_string<T>::value ||
+      std::is_convertible<T, std_string_view<char>>::value ||
+      !std::is_void<decltype(check<T>(nullptr))>::value;
 };
 
 template <typename Char>
-struct is_like_std_string<fmt::basic_string_view<Char>> : std::true_type {};
+struct is_std_string_like<fmt::basic_string_view<Char>> : std::true_type {};
+
+template <typename T> class is_map {
+  template <typename U> static auto check(U*) -> typename U::mapped_type;
+  template <typename> static void check(...);
+
+ public:
+#ifdef FMT_FORMAT_MAP_AS_LIST
+  static FMT_CONSTEXPR_DECL const bool value = false;
+#else
+  static FMT_CONSTEXPR_DECL const bool value =
+      !std::is_void<decltype(check<T>(nullptr))>::value;
+#endif
+};
+
+template <typename T> class is_set {
+  template <typename U> static auto check(U*) -> typename U::key_type;
+  template <typename> static void check(...);
+
+ public:
+#ifdef FMT_FORMAT_SET_AS_LIST
+  static FMT_CONSTEXPR_DECL const bool value = false;
+#else
+  static FMT_CONSTEXPR_DECL const bool value =
+      !std::is_void<decltype(check<T>(nullptr))>::value && !is_map<T>::value;
+#endif
+};
 
 template <typename... Ts> struct conditional_helper {};
 
 template <typename T, typename _ = void> struct is_range_ : std::false_type {};
 
 #if !FMT_MSC_VER || FMT_MSC_VER > 1800
+
+#  define FMT_DECLTYPE_RETURN(val)  \
+    ->decltype(val) { return val; } \
+    static_assert(                  \
+        true, "")  // This makes it so that a semicolon is required after the
+                   // macro, which helps clang-format handle the formatting.
+
+// C array overload
+template <typename T, std::size_t N>
+auto range_begin(const T (&arr)[N]) -> const T* {
+  return arr;
+}
+template <typename T, std::size_t N>
+auto range_end(const T (&arr)[N]) -> const T* {
+  return arr + N;
+}
+
+template <typename T, typename Enable = void>
+struct has_member_fn_begin_end_t : std::false_type {};
+
 template <typename T>
-struct is_range_<
-    T, conditional_t<false,
-                     conditional_helper<decltype(std::declval<T>().begin()),
-                                        decltype(std::declval<T>().end())>,
-                     void>> : std::true_type {};
+struct has_member_fn_begin_end_t<T, void_t<decltype(std::declval<T>().begin()),
+                                           decltype(std::declval<T>().end())>>
+    : std::true_type {};
+
+// Member function overload
+template <typename T>
+auto range_begin(T&& rng) FMT_DECLTYPE_RETURN(static_cast<T&&>(rng).begin());
+template <typename T>
+auto range_end(T&& rng) FMT_DECLTYPE_RETURN(static_cast<T&&>(rng).end());
+
+// ADL overload. Only participates in overload resolution if member functions
+// are not found.
+template <typename T>
+auto range_begin(T&& rng)
+    -> enable_if_t<!has_member_fn_begin_end_t<T&&>::value,
+                   decltype(begin(static_cast<T&&>(rng)))> {
+  return begin(static_cast<T&&>(rng));
+}
+template <typename T>
+auto range_end(T&& rng) -> enable_if_t<!has_member_fn_begin_end_t<T&&>::value,
+                                       decltype(end(static_cast<T&&>(rng)))> {
+  return end(static_cast<T&&>(rng));
+}
+
+template <typename T, typename Enable = void>
+struct has_const_begin_end : std::false_type {};
+template <typename T, typename Enable = void>
+struct has_mutable_begin_end : std::false_type {};
+
+template <typename T>
+struct has_const_begin_end<
+    T,
+    void_t<
+        decltype(detail::range_begin(std::declval<const remove_cvref_t<T>&>())),
+        decltype(detail::range_end(std::declval<const remove_cvref_t<T>&>()))>>
+    : std::true_type {};
+
+template <typename T>
+struct has_mutable_begin_end<
+    T, void_t<decltype(detail::range_begin(std::declval<T>())),
+              decltype(detail::range_end(std::declval<T>())),
+              enable_if_t<std::is_copy_constructible<T>::value>>>
+    : std::true_type {};
+
+template <typename T>
+struct is_range_<T, void>
+    : std::integral_constant<bool, (has_const_begin_end<T>::value ||
+                                    has_mutable_begin_end<T>::value)> {};
+#  undef FMT_DECLTYPE_RETURN
 #endif
 
-/// tuple_size and tuple_element check.
+// tuple_size and tuple_element check.
 template <typename T> class is_tuple_like_ {
   template <typename U>
   static auto check(U* p) -> decltype(std::tuple_size<U>::value, int());
@@ -158,33 +222,321 @@ template <class Tuple, class F> void for_each(Tuple&& tup, F&& f) {
 }
 
 template <typename Range>
-using value_type = remove_cvref_t<decltype(*std::declval<Range>().begin())>;
+using value_type =
+    remove_cvref_t<decltype(*detail::range_begin(std::declval<Range>()))>;
 
-template <typename Arg, FMT_ENABLE_IF(!is_like_std_string<
-                                      typename std::decay<Arg>::type>::value)>
-FMT_CONSTEXPR const char* format_str_quoted(bool add_space, const Arg&) {
-  return add_space ? " {}" : "{}";
+template <typename OutputIt> OutputIt write_delimiter(OutputIt out) {
+  *out++ = ',';
+  *out++ = ' ';
+  return out;
 }
 
-template <typename Arg, FMT_ENABLE_IF(is_like_std_string<
-                                      typename std::decay<Arg>::type>::value)>
-FMT_CONSTEXPR const char* format_str_quoted(bool add_space, const Arg&) {
-  return add_space ? " \"{}\"" : "\"{}\"";
+struct singleton {
+  unsigned char upper;
+  unsigned char lower_count;
+};
+
+inline auto is_printable(uint16_t x, const singleton* singletons,
+                         size_t singletons_size,
+                         const unsigned char* singleton_lowers,
+                         const unsigned char* normal, size_t normal_size)
+    -> bool {
+  auto upper = x >> 8;
+  auto lower_start = 0;
+  for (size_t i = 0; i < singletons_size; ++i) {
+    auto s = singletons[i];
+    auto lower_end = lower_start + s.lower_count;
+    if (upper < s.upper) break;
+    if (upper == s.upper) {
+      for (auto j = lower_start; j < lower_end; ++j) {
+        if (singleton_lowers[j] == (x & 0xff)) return false;
+      }
+    }
+    lower_start = lower_end;
+  }
+
+  auto xsigned = static_cast<int>(x);
+  auto current = true;
+  for (size_t i = 0; i < normal_size; ++i) {
+    auto v = static_cast<int>(normal[i]);
+    auto len = (v & 0x80) != 0 ? (v & 0x7f) << 8 | normal[++i] : v;
+    xsigned -= len;
+    if (xsigned < 0) break;
+    current = !current;
+  }
+  return current;
 }
 
-FMT_CONSTEXPR const char* format_str_quoted(bool add_space, const char*) {
-  return add_space ? " \"{}\"" : "\"{}\"";
-}
-FMT_CONSTEXPR const wchar_t* format_str_quoted(bool add_space, const wchar_t*) {
-  return add_space ? L" \"{}\"" : L"\"{}\"";
+// Returns true iff the code point cp is printable.
+// This code is generated by support/printable.py.
+inline auto is_printable(uint32_t cp) -> bool {
+  static constexpr singleton singletons0[] = {
+      {0x00, 1},  {0x03, 5},  {0x05, 6},  {0x06, 3},  {0x07, 6},  {0x08, 8},
+      {0x09, 17}, {0x0a, 28}, {0x0b, 25}, {0x0c, 20}, {0x0d, 16}, {0x0e, 13},
+      {0x0f, 4},  {0x10, 3},  {0x12, 18}, {0x13, 9},  {0x16, 1},  {0x17, 5},
+      {0x18, 2},  {0x19, 3},  {0x1a, 7},  {0x1c, 2},  {0x1d, 1},  {0x1f, 22},
+      {0x20, 3},  {0x2b, 3},  {0x2c, 2},  {0x2d, 11}, {0x2e, 1},  {0x30, 3},
+      {0x31, 2},  {0x32, 1},  {0xa7, 2},  {0xa9, 2},  {0xaa, 4},  {0xab, 8},
+      {0xfa, 2},  {0xfb, 5},  {0xfd, 4},  {0xfe, 3},  {0xff, 9},
+  };
+  static constexpr unsigned char singletons0_lower[] = {
+      0xad, 0x78, 0x79, 0x8b, 0x8d, 0xa2, 0x30, 0x57, 0x58, 0x8b, 0x8c, 0x90,
+      0x1c, 0x1d, 0xdd, 0x0e, 0x0f, 0x4b, 0x4c, 0xfb, 0xfc, 0x2e, 0x2f, 0x3f,
+      0x5c, 0x5d, 0x5f, 0xb5, 0xe2, 0x84, 0x8d, 0x8e, 0x91, 0x92, 0xa9, 0xb1,
+      0xba, 0xbb, 0xc5, 0xc6, 0xc9, 0xca, 0xde, 0xe4, 0xe5, 0xff, 0x00, 0x04,
+      0x11, 0x12, 0x29, 0x31, 0x34, 0x37, 0x3a, 0x3b, 0x3d, 0x49, 0x4a, 0x5d,
+      0x84, 0x8e, 0x92, 0xa9, 0xb1, 0xb4, 0xba, 0xbb, 0xc6, 0xca, 0xce, 0xcf,
+      0xe4, 0xe5, 0x00, 0x04, 0x0d, 0x0e, 0x11, 0x12, 0x29, 0x31, 0x34, 0x3a,
+      0x3b, 0x45, 0x46, 0x49, 0x4a, 0x5e, 0x64, 0x65, 0x84, 0x91, 0x9b, 0x9d,
+      0xc9, 0xce, 0xcf, 0x0d, 0x11, 0x29, 0x45, 0x49, 0x57, 0x64, 0x65, 0x8d,
+      0x91, 0xa9, 0xb4, 0xba, 0xbb, 0xc5, 0xc9, 0xdf, 0xe4, 0xe5, 0xf0, 0x0d,
+      0x11, 0x45, 0x49, 0x64, 0x65, 0x80, 0x84, 0xb2, 0xbc, 0xbe, 0xbf, 0xd5,
+      0xd7, 0xf0, 0xf1, 0x83, 0x85, 0x8b, 0xa4, 0xa6, 0xbe, 0xbf, 0xc5, 0xc7,
+      0xce, 0xcf, 0xda, 0xdb, 0x48, 0x98, 0xbd, 0xcd, 0xc6, 0xce, 0xcf, 0x49,
+      0x4e, 0x4f, 0x57, 0x59, 0x5e, 0x5f, 0x89, 0x8e, 0x8f, 0xb1, 0xb6, 0xb7,
+      0xbf, 0xc1, 0xc6, 0xc7, 0xd7, 0x11, 0x16, 0x17, 0x5b, 0x5c, 0xf6, 0xf7,
+      0xfe, 0xff, 0x80, 0x0d, 0x6d, 0x71, 0xde, 0xdf, 0x0e, 0x0f, 0x1f, 0x6e,
+      0x6f, 0x1c, 0x1d, 0x5f, 0x7d, 0x7e, 0xae, 0xaf, 0xbb, 0xbc, 0xfa, 0x16,
+      0x17, 0x1e, 0x1f, 0x46, 0x47, 0x4e, 0x4f, 0x58, 0x5a, 0x5c, 0x5e, 0x7e,
+      0x7f, 0xb5, 0xc5, 0xd4, 0xd5, 0xdc, 0xf0, 0xf1, 0xf5, 0x72, 0x73, 0x8f,
+      0x74, 0x75, 0x96, 0x2f, 0x5f, 0x26, 0x2e, 0x2f, 0xa7, 0xaf, 0xb7, 0xbf,
+      0xc7, 0xcf, 0xd7, 0xdf, 0x9a, 0x40, 0x97, 0x98, 0x30, 0x8f, 0x1f, 0xc0,
+      0xc1, 0xce, 0xff, 0x4e, 0x4f, 0x5a, 0x5b, 0x07, 0x08, 0x0f, 0x10, 0x27,
+      0x2f, 0xee, 0xef, 0x6e, 0x6f, 0x37, 0x3d, 0x3f, 0x42, 0x45, 0x90, 0x91,
+      0xfe, 0xff, 0x53, 0x67, 0x75, 0xc8, 0xc9, 0xd0, 0xd1, 0xd8, 0xd9, 0xe7,
+      0xfe, 0xff,
+  };
+  static constexpr singleton singletons1[] = {
+      {0x00, 6},  {0x01, 1}, {0x03, 1},  {0x04, 2}, {0x08, 8},  {0x09, 2},
+      {0x0a, 5},  {0x0b, 2}, {0x0e, 4},  {0x10, 1}, {0x11, 2},  {0x12, 5},
+      {0x13, 17}, {0x14, 1}, {0x15, 2},  {0x17, 2}, {0x19, 13}, {0x1c, 5},
+      {0x1d, 8},  {0x24, 1}, {0x6a, 3},  {0x6b, 2}, {0xbc, 2},  {0xd1, 2},
+      {0xd4, 12}, {0xd5, 9}, {0xd6, 2},  {0xd7, 2}, {0xda, 1},  {0xe0, 5},
+      {0xe1, 2},  {0xe8, 2}, {0xee, 32}, {0xf0, 4}, {0xf8, 2},  {0xf9, 2},
+      {0xfa, 2},  {0xfb, 1},
+  };
+  static constexpr unsigned char singletons1_lower[] = {
+      0x0c, 0x27, 0x3b, 0x3e, 0x4e, 0x4f, 0x8f, 0x9e, 0x9e, 0x9f, 0x06, 0x07,
+      0x09, 0x36, 0x3d, 0x3e, 0x56, 0xf3, 0xd0, 0xd1, 0x04, 0x14, 0x18, 0x36,
+      0x37, 0x56, 0x57, 0x7f, 0xaa, 0xae, 0xaf, 0xbd, 0x35, 0xe0, 0x12, 0x87,
+      0x89, 0x8e, 0x9e, 0x04, 0x0d, 0x0e, 0x11, 0x12, 0x29, 0x31, 0x34, 0x3a,
+      0x45, 0x46, 0x49, 0x4a, 0x4e, 0x4f, 0x64, 0x65, 0x5c, 0xb6, 0xb7, 0x1b,
+      0x1c, 0x07, 0x08, 0x0a, 0x0b, 0x14, 0x17, 0x36, 0x39, 0x3a, 0xa8, 0xa9,
+      0xd8, 0xd9, 0x09, 0x37, 0x90, 0x91, 0xa8, 0x07, 0x0a, 0x3b, 0x3e, 0x66,
+      0x69, 0x8f, 0x92, 0x6f, 0x5f, 0xee, 0xef, 0x5a, 0x62, 0x9a, 0x9b, 0x27,
+      0x28, 0x55, 0x9d, 0xa0, 0xa1, 0xa3, 0xa4, 0xa7, 0xa8, 0xad, 0xba, 0xbc,
+      0xc4, 0x06, 0x0b, 0x0c, 0x15, 0x1d, 0x3a, 0x3f, 0x45, 0x51, 0xa6, 0xa7,
+      0xcc, 0xcd, 0xa0, 0x07, 0x19, 0x1a, 0x22, 0x25, 0x3e, 0x3f, 0xc5, 0xc6,
+      0x04, 0x20, 0x23, 0x25, 0x26, 0x28, 0x33, 0x38, 0x3a, 0x48, 0x4a, 0x4c,
+      0x50, 0x53, 0x55, 0x56, 0x58, 0x5a, 0x5c, 0x5e, 0x60, 0x63, 0x65, 0x66,
+      0x6b, 0x73, 0x78, 0x7d, 0x7f, 0x8a, 0xa4, 0xaa, 0xaf, 0xb0, 0xc0, 0xd0,
+      0xae, 0xaf, 0x79, 0xcc, 0x6e, 0x6f, 0x93,
+  };
+  static constexpr unsigned char normal0[] = {
+      0x00, 0x20, 0x5f, 0x22, 0x82, 0xdf, 0x04, 0x82, 0x44, 0x08, 0x1b, 0x04,
+      0x06, 0x11, 0x81, 0xac, 0x0e, 0x80, 0xab, 0x35, 0x28, 0x0b, 0x80, 0xe0,
+      0x03, 0x19, 0x08, 0x01, 0x04, 0x2f, 0x04, 0x34, 0x04, 0x07, 0x03, 0x01,
+      0x07, 0x06, 0x07, 0x11, 0x0a, 0x50, 0x0f, 0x12, 0x07, 0x55, 0x07, 0x03,
+      0x04, 0x1c, 0x0a, 0x09, 0x03, 0x08, 0x03, 0x07, 0x03, 0x02, 0x03, 0x03,
+      0x03, 0x0c, 0x04, 0x05, 0x03, 0x0b, 0x06, 0x01, 0x0e, 0x15, 0x05, 0x3a,
+      0x03, 0x11, 0x07, 0x06, 0x05, 0x10, 0x07, 0x57, 0x07, 0x02, 0x07, 0x15,
+      0x0d, 0x50, 0x04, 0x43, 0x03, 0x2d, 0x03, 0x01, 0x04, 0x11, 0x06, 0x0f,
+      0x0c, 0x3a, 0x04, 0x1d, 0x25, 0x5f, 0x20, 0x6d, 0x04, 0x6a, 0x25, 0x80,
+      0xc8, 0x05, 0x82, 0xb0, 0x03, 0x1a, 0x06, 0x82, 0xfd, 0x03, 0x59, 0x07,
+      0x15, 0x0b, 0x17, 0x09, 0x14, 0x0c, 0x14, 0x0c, 0x6a, 0x06, 0x0a, 0x06,
+      0x1a, 0x06, 0x59, 0x07, 0x2b, 0x05, 0x46, 0x0a, 0x2c, 0x04, 0x0c, 0x04,
+      0x01, 0x03, 0x31, 0x0b, 0x2c, 0x04, 0x1a, 0x06, 0x0b, 0x03, 0x80, 0xac,
+      0x06, 0x0a, 0x06, 0x21, 0x3f, 0x4c, 0x04, 0x2d, 0x03, 0x74, 0x08, 0x3c,
+      0x03, 0x0f, 0x03, 0x3c, 0x07, 0x38, 0x08, 0x2b, 0x05, 0x82, 0xff, 0x11,
+      0x18, 0x08, 0x2f, 0x11, 0x2d, 0x03, 0x20, 0x10, 0x21, 0x0f, 0x80, 0x8c,
+      0x04, 0x82, 0x97, 0x19, 0x0b, 0x15, 0x88, 0x94, 0x05, 0x2f, 0x05, 0x3b,
+      0x07, 0x02, 0x0e, 0x18, 0x09, 0x80, 0xb3, 0x2d, 0x74, 0x0c, 0x80, 0xd6,
+      0x1a, 0x0c, 0x05, 0x80, 0xff, 0x05, 0x80, 0xdf, 0x0c, 0xee, 0x0d, 0x03,
+      0x84, 0x8d, 0x03, 0x37, 0x09, 0x81, 0x5c, 0x14, 0x80, 0xb8, 0x08, 0x80,
+      0xcb, 0x2a, 0x38, 0x03, 0x0a, 0x06, 0x38, 0x08, 0x46, 0x08, 0x0c, 0x06,
+      0x74, 0x0b, 0x1e, 0x03, 0x5a, 0x04, 0x59, 0x09, 0x80, 0x83, 0x18, 0x1c,
+      0x0a, 0x16, 0x09, 0x4c, 0x04, 0x80, 0x8a, 0x06, 0xab, 0xa4, 0x0c, 0x17,
+      0x04, 0x31, 0xa1, 0x04, 0x81, 0xda, 0x26, 0x07, 0x0c, 0x05, 0x05, 0x80,
+      0xa5, 0x11, 0x81, 0x6d, 0x10, 0x78, 0x28, 0x2a, 0x06, 0x4c, 0x04, 0x80,
+      0x8d, 0x04, 0x80, 0xbe, 0x03, 0x1b, 0x03, 0x0f, 0x0d,
+  };
+  static constexpr unsigned char normal1[] = {
+      0x5e, 0x22, 0x7b, 0x05, 0x03, 0x04, 0x2d, 0x03, 0x66, 0x03, 0x01, 0x2f,
+      0x2e, 0x80, 0x82, 0x1d, 0x03, 0x31, 0x0f, 0x1c, 0x04, 0x24, 0x09, 0x1e,
+      0x05, 0x2b, 0x05, 0x44, 0x04, 0x0e, 0x2a, 0x80, 0xaa, 0x06, 0x24, 0x04,
+      0x24, 0x04, 0x28, 0x08, 0x34, 0x0b, 0x01, 0x80, 0x90, 0x81, 0x37, 0x09,
+      0x16, 0x0a, 0x08, 0x80, 0x98, 0x39, 0x03, 0x63, 0x08, 0x09, 0x30, 0x16,
+      0x05, 0x21, 0x03, 0x1b, 0x05, 0x01, 0x40, 0x38, 0x04, 0x4b, 0x05, 0x2f,
+      0x04, 0x0a, 0x07, 0x09, 0x07, 0x40, 0x20, 0x27, 0x04, 0x0c, 0x09, 0x36,
+      0x03, 0x3a, 0x05, 0x1a, 0x07, 0x04, 0x0c, 0x07, 0x50, 0x49, 0x37, 0x33,
+      0x0d, 0x33, 0x07, 0x2e, 0x08, 0x0a, 0x81, 0x26, 0x52, 0x4e, 0x28, 0x08,
+      0x2a, 0x56, 0x1c, 0x14, 0x17, 0x09, 0x4e, 0x04, 0x1e, 0x0f, 0x43, 0x0e,
+      0x19, 0x07, 0x0a, 0x06, 0x48, 0x08, 0x27, 0x09, 0x75, 0x0b, 0x3f, 0x41,
+      0x2a, 0x06, 0x3b, 0x05, 0x0a, 0x06, 0x51, 0x06, 0x01, 0x05, 0x10, 0x03,
+      0x05, 0x80, 0x8b, 0x62, 0x1e, 0x48, 0x08, 0x0a, 0x80, 0xa6, 0x5e, 0x22,
+      0x45, 0x0b, 0x0a, 0x06, 0x0d, 0x13, 0x39, 0x07, 0x0a, 0x36, 0x2c, 0x04,
+      0x10, 0x80, 0xc0, 0x3c, 0x64, 0x53, 0x0c, 0x48, 0x09, 0x0a, 0x46, 0x45,
+      0x1b, 0x48, 0x08, 0x53, 0x1d, 0x39, 0x81, 0x07, 0x46, 0x0a, 0x1d, 0x03,
+      0x47, 0x49, 0x37, 0x03, 0x0e, 0x08, 0x0a, 0x06, 0x39, 0x07, 0x0a, 0x81,
+      0x36, 0x19, 0x80, 0xb7, 0x01, 0x0f, 0x32, 0x0d, 0x83, 0x9b, 0x66, 0x75,
+      0x0b, 0x80, 0xc4, 0x8a, 0xbc, 0x84, 0x2f, 0x8f, 0xd1, 0x82, 0x47, 0xa1,
+      0xb9, 0x82, 0x39, 0x07, 0x2a, 0x04, 0x02, 0x60, 0x26, 0x0a, 0x46, 0x0a,
+      0x28, 0x05, 0x13, 0x82, 0xb0, 0x5b, 0x65, 0x4b, 0x04, 0x39, 0x07, 0x11,
+      0x40, 0x05, 0x0b, 0x02, 0x0e, 0x97, 0xf8, 0x08, 0x84, 0xd6, 0x2a, 0x09,
+      0xa2, 0xf7, 0x81, 0x1f, 0x31, 0x03, 0x11, 0x04, 0x08, 0x81, 0x8c, 0x89,
+      0x04, 0x6b, 0x05, 0x0d, 0x03, 0x09, 0x07, 0x10, 0x93, 0x60, 0x80, 0xf6,
+      0x0a, 0x73, 0x08, 0x6e, 0x17, 0x46, 0x80, 0x9a, 0x14, 0x0c, 0x57, 0x09,
+      0x19, 0x80, 0x87, 0x81, 0x47, 0x03, 0x85, 0x42, 0x0f, 0x15, 0x85, 0x50,
+      0x2b, 0x80, 0xd5, 0x2d, 0x03, 0x1a, 0x04, 0x02, 0x81, 0x70, 0x3a, 0x05,
+      0x01, 0x85, 0x00, 0x80, 0xd7, 0x29, 0x4c, 0x04, 0x0a, 0x04, 0x02, 0x83,
+      0x11, 0x44, 0x4c, 0x3d, 0x80, 0xc2, 0x3c, 0x06, 0x01, 0x04, 0x55, 0x05,
+      0x1b, 0x34, 0x02, 0x81, 0x0e, 0x2c, 0x04, 0x64, 0x0c, 0x56, 0x0a, 0x80,
+      0xae, 0x38, 0x1d, 0x0d, 0x2c, 0x04, 0x09, 0x07, 0x02, 0x0e, 0x06, 0x80,
+      0x9a, 0x83, 0xd8, 0x08, 0x0d, 0x03, 0x0d, 0x03, 0x74, 0x0c, 0x59, 0x07,
+      0x0c, 0x14, 0x0c, 0x04, 0x38, 0x08, 0x0a, 0x06, 0x28, 0x08, 0x22, 0x4e,
+      0x81, 0x54, 0x0c, 0x15, 0x03, 0x03, 0x05, 0x07, 0x09, 0x19, 0x07, 0x07,
+      0x09, 0x03, 0x0d, 0x07, 0x29, 0x80, 0xcb, 0x25, 0x0a, 0x84, 0x06,
+  };
+  auto lower = static_cast<uint16_t>(cp);
+  if (cp < 0x10000) {
+    return is_printable(lower, singletons0,
+                        sizeof(singletons0) / sizeof(*singletons0),
+                        singletons0_lower, normal0, sizeof(normal0));
+  }
+  if (cp < 0x20000) {
+    return is_printable(lower, singletons1,
+                        sizeof(singletons1) / sizeof(*singletons1),
+                        singletons1_lower, normal1, sizeof(normal1));
+  }
+  if (0x2a6de <= cp && cp < 0x2a700) return false;
+  if (0x2b735 <= cp && cp < 0x2b740) return false;
+  if (0x2b81e <= cp && cp < 0x2b820) return false;
+  if (0x2cea2 <= cp && cp < 0x2ceb0) return false;
+  if (0x2ebe1 <= cp && cp < 0x2f800) return false;
+  if (0x2fa1e <= cp && cp < 0x30000) return false;
+  if (0x3134b <= cp && cp < 0xe0100) return false;
+  if (0xe01f0 <= cp && cp < 0x110000) return false;
+  return cp < 0x110000;
 }
 
-FMT_CONSTEXPR const char* format_str_quoted(bool add_space, const char) {
-  return add_space ? " '{}'" : "'{}'";
+inline auto needs_escape(uint32_t cp) -> bool {
+  return cp < 0x20 || cp == 0x7f || cp == '"' || cp == '\\' ||
+         !is_printable(cp);
 }
-FMT_CONSTEXPR const wchar_t* format_str_quoted(bool add_space, const wchar_t) {
-  return add_space ? L" '{}'" : L"'{}'";
+
+template <typename Char> struct find_escape_result {
+  const Char* begin;
+  const Char* end;
+  uint32_t cp;
+};
+
+template <typename Char>
+auto find_escape(const Char* begin, const Char* end)
+    -> find_escape_result<Char> {
+  for (; begin != end; ++begin) {
+    auto cp = static_cast<typename std::make_unsigned<Char>::type>(*begin);
+    if (sizeof(Char) == 1 && cp >= 0x80) continue;
+    if (needs_escape(cp)) return {begin, begin + 1, cp};
+  }
+  return {begin, nullptr, 0};
 }
+
+inline auto find_escape(const char* begin, const char* end)
+    -> find_escape_result<char> {
+  if (!is_utf8()) return find_escape<char>(begin, end);
+  auto result = find_escape_result<char>{end, nullptr, 0};
+  for_each_codepoint(string_view(begin, to_unsigned(end - begin)),
+                     [&](uint32_t cp, string_view sv) {
+                       if (needs_escape(cp)) {
+                         result = {sv.begin(), sv.end(), cp};
+                         return false;
+                       }
+                       return true;
+                     });
+  return result;
+}
+
+template <typename Char, typename OutputIt>
+auto write_range_entry(OutputIt out, basic_string_view<Char> str) -> OutputIt {
+  *out++ = '"';
+  auto begin = str.begin(), end = str.end();
+  do {
+    auto escape = find_escape(begin, end);
+    out = copy_str<Char>(begin, escape.begin, out);
+    begin = escape.end;
+    if (!begin) break;
+    auto c = static_cast<Char>(escape.cp);
+    switch (escape.cp) {
+    case '\n':
+      *out++ = '\\';
+      c = 'n';
+      break;
+    case '\r':
+      *out++ = '\\';
+      c = 'r';
+      break;
+    case '\t':
+      *out++ = '\\';
+      c = 't';
+      break;
+    case '"':
+      FMT_FALLTHROUGH;
+    case '\\':
+      *out++ = '\\';
+      break;
+    default:
+      if (is_utf8()) {
+        if (escape.cp < 0x100) {
+          out = format_to(out, "\\x{:02x}", escape.cp);
+          continue;
+        }
+        if (escape.cp < 0x10000) {
+          out = format_to(out, "\\u{:04x}", escape.cp);
+          continue;
+        }
+        if (escape.cp < 0x110000) {
+          out = format_to(out, "\\U{:08x}", escape.cp);
+          continue;
+        }
+      }
+      for (Char escape_char : basic_string_view<Char>(
+               escape.begin, to_unsigned(escape.end - escape.begin))) {
+        out = format_to(
+            out, "\\x{:02x}",
+            static_cast<typename std::make_unsigned<Char>::type>(escape_char));
+      }
+      continue;
+    }
+    *out++ = c;
+  } while (begin != end);
+  *out++ = '"';
+  return out;
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(std::is_convertible<T, std_string_view<char>>::value)>
+inline auto write_range_entry(OutputIt out, const T& str) -> OutputIt {
+  auto sv = std_string_view<Char>(str);
+  return write_range_entry<Char>(out, basic_string_view<Char>(sv));
+}
+
+template <typename Char, typename OutputIt, typename Arg,
+          FMT_ENABLE_IF(std::is_same<Arg, Char>::value)>
+OutputIt write_range_entry(OutputIt out, const Arg v) {
+  *out++ = '\'';
+  *out++ = v;
+  *out++ = '\'';
+  return out;
+}
+
+template <
+    typename Char, typename OutputIt, typename Arg,
+    FMT_ENABLE_IF(!is_std_string_like<typename std::decay<Arg>::type>::value &&
+                  !std::is_same<Arg, Char>::value)>
+OutputIt write_range_entry(OutputIt out, const Arg& v) {
+  return write<Char>(out, v);
+}
+
 }  // namespace detail
 
 template <typename T> struct is_tuple_like {
@@ -195,55 +547,37 @@ template <typename T> struct is_tuple_like {
 template <typename TupleT, typename Char>
 struct formatter<TupleT, Char, enable_if_t<fmt::is_tuple_like<TupleT>::value>> {
  private:
-  // C++11 generic lambda for format()
+  // C++11 generic lambda for format().
   template <typename FormatContext> struct format_each {
     template <typename T> void operator()(const T& v) {
-      if (i > 0) {
-        if (formatting.add_prepostfix_space) {
-          *out++ = ' ';
-        }
-        out = detail::copy(formatting.delimiter, out);
-      }
-      out = format_to(out,
-                      detail::format_str_quoted(
-                          (formatting.add_delimiter_spaces && i > 0), v),
-                      v);
+      if (i > 0) out = detail::write_delimiter(out);
+      out = detail::write_range_entry<Char>(out, v);
       ++i;
     }
-
-    formatting_tuple<Char>& formatting;
-    size_t& i;
-    typename std::add_lvalue_reference<decltype(
-        std::declval<FormatContext>().out())>::type out;
+    int i;
+    typename FormatContext::iterator& out;
   };
 
  public:
-  formatting_tuple<Char> formatting;
-
   template <typename ParseContext>
   FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    return formatting.parse(ctx);
+    return ctx.begin();
   }
 
   template <typename FormatContext = format_context>
   auto format(const TupleT& values, FormatContext& ctx) -> decltype(ctx.out()) {
     auto out = ctx.out();
-    size_t i = 0;
-    detail::copy(formatting.prefix, out);
-
-    detail::for_each(values, format_each<FormatContext>{formatting, i, out});
-    if (formatting.add_prepostfix_space) {
-      *out++ = ' ';
-    }
-    detail::copy(formatting.postfix, out);
-
-    return ctx.out();
+    *out++ = '(';
+    detail::for_each(values, format_each<FormatContext>{0, out});
+    *out++ = ')';
+    return out;
   }
 };
 
 template <typename T, typename Char> struct is_range {
   static FMT_CONSTEXPR_DECL const bool value =
-      detail::is_range_<T>::value && !detail::is_like_std_string<T>::value &&
+      detail::is_range_<T>::value && !detail::is_std_string_like<T>::value &&
+      !detail::is_map<T>::value &&
       !std::is_convertible<T, std::basic_string<Char>>::value &&
       !std::is_constructible<detail::std_string_view<Char>, T>::value;
 };
@@ -251,100 +585,167 @@ template <typename T, typename Char> struct is_range {
 template <typename T, typename Char>
 struct formatter<
     T, Char,
-    enable_if_t<fmt::is_range<T, Char>::value
-// Workaround a bug in MSVC 2017 and earlier.
-#if !FMT_MSC_VER || FMT_MSC_VER >= 1927
-                &&
-                (has_formatter<detail::value_type<T>, format_context>::value ||
-                 detail::has_fallback_formatter<detail::value_type<T>,
-                                                format_context>::value)
+    enable_if_t<
+        fmt::is_range<T, Char>::value
+// Workaround a bug in MSVC 2019 and earlier.
+#if !FMT_MSC_VER
+        && (is_formattable<detail::value_type<T>, Char>::value ||
+            detail::has_fallback_formatter<detail::value_type<T>, Char>::value)
 #endif
-                >> {
-  formatting_range<Char> formatting;
-
-  template <typename ParseContext>
-  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    return formatting.parse(ctx);
-  }
-
-  template <typename FormatContext>
-  typename FormatContext::iterator format(const T& values, FormatContext& ctx) {
-    auto out = detail::copy(formatting.prefix, ctx.out());
-    size_t i = 0;
-    auto it = values.begin();
-    auto end = values.end();
-    for (; it != end; ++it) {
-      if (i > 0) {
-        if (formatting.add_prepostfix_space) *out++ = ' ';
-        out = detail::copy(formatting.delimiter, out);
-      }
-      out = format_to(out,
-                      detail::format_str_quoted(
-                          (formatting.add_delimiter_spaces && i > 0), *it),
-                      *it);
-      if (++i > formatting.range_length_limit) {
-        out = format_to(out, " ... <other elements>");
-        break;
-      }
-    }
-    if (formatting.add_prepostfix_space) *out++ = ' ';
-    return detail::copy(formatting.postfix, out);
-  }
-};
-
-template <typename Char, typename... T> struct tuple_arg_join : detail::view {
-  const std::tuple<T...>& tuple;
-  basic_string_view<Char> sep;
-
-  tuple_arg_join(const std::tuple<T...>& t, basic_string_view<Char> s)
-      : tuple{t}, sep{s} {}
-};
-
-template <typename Char, typename... T>
-struct formatter<tuple_arg_join<Char, T...>, Char> {
+        >> {
   template <typename ParseContext>
   FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     return ctx.begin();
   }
 
+  template <
+      typename FormatContext, typename U,
+      FMT_ENABLE_IF(
+          std::is_same<U, conditional_t<detail::has_const_begin_end<T>::value,
+                                        const T, T>>::value)>
+  auto format(U& range, FormatContext& ctx) -> decltype(ctx.out()) {
+#ifdef FMT_DEPRECATED_BRACED_RANGES
+    Char prefix = '{';
+    Char postfix = '}';
+#else
+    Char prefix = detail::is_set<T>::value ? '{' : '[';
+    Char postfix = detail::is_set<T>::value ? '}' : ']';
+#endif
+    auto out = ctx.out();
+    *out++ = prefix;
+    int i = 0;
+    auto it = std::begin(range);
+    auto end = std::end(range);
+    for (; it != end; ++it) {
+      if (i > 0) out = detail::write_delimiter(out);
+      out = detail::write_range_entry<Char>(out, *it);
+      ++i;
+    }
+    *out++ = postfix;
+    return out;
+  }
+};
+
+template <typename T, typename Char>
+struct formatter<
+    T, Char,
+    enable_if_t<
+        detail::is_map<T>::value
+// Workaround a bug in MSVC 2019 and earlier.
+#if !FMT_MSC_VER
+        && (is_formattable<detail::value_type<T>, Char>::value ||
+            detail::has_fallback_formatter<detail::value_type<T>, Char>::value)
+#endif
+        >> {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return ctx.begin();
+  }
+
+  template <
+      typename FormatContext, typename U,
+      FMT_ENABLE_IF(
+          std::is_same<U, conditional_t<detail::has_const_begin_end<T>::value,
+                                        const T, T>>::value)>
+  auto format(U& map, FormatContext& ctx) -> decltype(ctx.out()) {
+    auto out = ctx.out();
+    *out++ = '{';
+    int i = 0;
+    for (const auto& item : map) {
+      if (i > 0) out = detail::write_delimiter(out);
+      out = detail::write_range_entry<Char>(out, item.first);
+      *out++ = ':';
+      *out++ = ' ';
+      out = detail::write_range_entry<Char>(out, item.second);
+      ++i;
+    }
+    *out++ = '}';
+    return out;
+  }
+};
+
+template <typename Char, typename... T> struct tuple_join_view : detail::view {
+  const std::tuple<T...>& tuple;
+  basic_string_view<Char> sep;
+
+  tuple_join_view(const std::tuple<T...>& t, basic_string_view<Char> s)
+      : tuple(t), sep{s} {}
+};
+
+template <typename Char, typename... T>
+using tuple_arg_join = tuple_join_view<Char, T...>;
+
+// Define FMT_TUPLE_JOIN_SPECIFIERS to enable experimental format specifiers
+// support in tuple_join. It is disabled by default because of issues with
+// the dynamic width and precision.
+#ifndef FMT_TUPLE_JOIN_SPECIFIERS
+#  define FMT_TUPLE_JOIN_SPECIFIERS 0
+#endif
+
+template <typename Char, typename... T>
+struct formatter<tuple_join_view<Char, T...>, Char> {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return do_parse(ctx, std::integral_constant<size_t, sizeof...(T)>());
+  }
+
   template <typename FormatContext>
-  typename FormatContext::iterator format(
-      const tuple_arg_join<Char, T...>& value, FormatContext& ctx) {
-    return format(value, ctx, detail::make_index_sequence<sizeof...(T)>{});
+  auto format(const tuple_join_view<Char, T...>& value,
+              FormatContext& ctx) const -> typename FormatContext::iterator {
+    return do_format(value, ctx,
+                     std::integral_constant<size_t, sizeof...(T)>());
   }
 
  private:
-  template <typename FormatContext, size_t... N>
-  typename FormatContext::iterator format(
-      const tuple_arg_join<Char, T...>& value, FormatContext& ctx,
-      detail::index_sequence<N...>) {
-    return format_args(value, ctx, std::get<N>(value.tuple)...);
+  std::tuple<formatter<typename std::decay<T>::type, Char>...> formatters_;
+
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto do_parse(ParseContext& ctx,
+                              std::integral_constant<size_t, 0>)
+      -> decltype(ctx.begin()) {
+    return ctx.begin();
+  }
+
+  template <typename ParseContext, size_t N>
+  FMT_CONSTEXPR auto do_parse(ParseContext& ctx,
+                              std::integral_constant<size_t, N>)
+      -> decltype(ctx.begin()) {
+    auto end = ctx.begin();
+#if FMT_TUPLE_JOIN_SPECIFIERS
+    end = std::get<sizeof...(T) - N>(formatters_).parse(ctx);
+    if (N > 1) {
+      auto end1 = do_parse(ctx, std::integral_constant<size_t, N - 1>());
+      if (end != end1)
+        FMT_THROW(format_error("incompatible format specs for tuple elements"));
+    }
+#endif
+    return end;
   }
 
   template <typename FormatContext>
-  typename FormatContext::iterator format_args(
-      const tuple_arg_join<Char, T...>&, FormatContext& ctx) {
-    // NOTE: for compilers that support C++17, this empty function instantiation
-    // can be replaced with a constexpr branch in the variadic overload.
+  auto do_format(const tuple_join_view<Char, T...>&, FormatContext& ctx,
+                 std::integral_constant<size_t, 0>) const ->
+      typename FormatContext::iterator {
     return ctx.out();
   }
 
-  template <typename FormatContext, typename Arg, typename... Args>
-  typename FormatContext::iterator format_args(
-      const tuple_arg_join<Char, T...>& value, FormatContext& ctx,
-      const Arg& arg, const Args&... args) {
-    using base = formatter<typename std::decay<Arg>::type, Char>;
-    auto out = ctx.out();
-    out = base{}.format(arg, ctx);
-    if (sizeof...(Args) > 0) {
+  template <typename FormatContext, size_t N>
+  auto do_format(const tuple_join_view<Char, T...>& value, FormatContext& ctx,
+                 std::integral_constant<size_t, N>) const ->
+      typename FormatContext::iterator {
+    auto out = std::get<sizeof...(T) - N>(formatters_)
+                   .format(std::get<sizeof...(T) - N>(value.tuple), ctx);
+    if (N > 1) {
       out = std::copy(value.sep.begin(), value.sep.end(), out);
       ctx.advance_to(out);
-      return format_args(value, ctx, args...);
+      return do_format(value, ctx, std::integral_constant<size_t, N - 1>());
     }
     return out;
   }
 };
 
+FMT_MODULE_EXPORT_BEGIN
+
 /**
   \rst
   Returns an object that formats `tuple` with elements separated by `sep`.
@@ -357,14 +758,15 @@ struct formatter<tuple_arg_join<Char, T...>, Char> {
   \endrst
  */
 template <typename... T>
-FMT_CONSTEXPR tuple_arg_join<char, T...> join(const std::tuple<T...>& tuple,
-                                              string_view sep) {
+FMT_CONSTEXPR auto join(const std::tuple<T...>& tuple, string_view sep)
+    -> tuple_join_view<char, T...> {
   return {tuple, sep};
 }
 
 template <typename... T>
-FMT_CONSTEXPR tuple_arg_join<wchar_t, T...> join(const std::tuple<T...>& tuple,
-                                                 wstring_view sep) {
+FMT_CONSTEXPR auto join(const std::tuple<T...>& tuple,
+                        basic_string_view<wchar_t> sep)
+    -> tuple_join_view<wchar_t, T...> {
   return {tuple, sep};
 }
 
@@ -380,17 +782,12 @@ FMT_CONSTEXPR tuple_arg_join<wchar_t, T...> join(const std::tuple<T...>& tuple,
   \endrst
  */
 template <typename T>
-arg_join<const T*, const T*, char> join(std::initializer_list<T> list,
-                                        string_view sep) {
-  return join(std::begin(list), std::end(list), sep);
-}
-
-template <typename T>
-arg_join<const T*, const T*, wchar_t> join(std::initializer_list<T> list,
-                                           wstring_view sep) {
+auto join(std::initializer_list<T> list, string_view sep)
+    -> join_view<const T*, const T*> {
   return join(std::begin(list), std::end(list), sep);
 }
 
+FMT_MODULE_EXPORT_END
 FMT_END_NAMESPACE
 
 #endif  // FMT_RANGES_H_
diff --git a/src/fmt/xchar.h b/src/fmt/xchar.h
new file mode 100644
index 0000000000..55825077f8
--- /dev/null
+++ b/src/fmt/xchar.h
@@ -0,0 +1,236 @@
+// Formatting library for C++ - optional wchar_t and exotic character support
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_XCHAR_H_
+#define FMT_XCHAR_H_
+
+#include <cwchar>
+#include <tuple>
+
+#include "format.h"
+
+FMT_BEGIN_NAMESPACE
+namespace detail {
+template <typename T>
+using is_exotic_char = bool_constant<!std::is_same<T, char>::value>;
+}
+
+FMT_MODULE_EXPORT_BEGIN
+
+using wstring_view = basic_string_view<wchar_t>;
+using wformat_parse_context = basic_format_parse_context<wchar_t>;
+using wformat_context = buffer_context<wchar_t>;
+using wformat_args = basic_format_args<wformat_context>;
+using wmemory_buffer = basic_memory_buffer<wchar_t>;
+
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+// Workaround broken conversion on older gcc.
+template <typename... Args> using wformat_string = wstring_view;
+#else
+template <typename... Args>
+using wformat_string = basic_format_string<wchar_t, type_identity_t<Args>...>;
+#endif
+
+template <> struct is_char<wchar_t> : std::true_type {};
+template <> struct is_char<detail::char8_type> : std::true_type {};
+template <> struct is_char<char16_t> : std::true_type {};
+template <> struct is_char<char32_t> : std::true_type {};
+
+template <typename... Args>
+constexpr format_arg_store<wformat_context, Args...> make_wformat_args(
+    const Args&... args) {
+  return {args...};
+}
+
+inline namespace literals {
+constexpr auto operator"" _format(const wchar_t* s, size_t n)
+    -> detail::udl_formatter<wchar_t> {
+  return {{s, n}};
+}
+
+#if FMT_USE_USER_DEFINED_LITERALS && !FMT_USE_NONTYPE_TEMPLATE_PARAMETERS
+constexpr detail::udl_arg<wchar_t> operator"" _a(const wchar_t* s, size_t) {
+  return {s};
+}
+#endif
+}  // namespace literals
+
+template <typename It, typename Sentinel>
+auto join(It begin, Sentinel end, wstring_view sep)
+    -> join_view<It, Sentinel, wchar_t> {
+  return {begin, end, sep};
+}
+
+template <typename Range>
+auto join(Range&& range, wstring_view sep)
+    -> join_view<detail::iterator_t<Range>, detail::sentinel_t<Range>,
+                 wchar_t> {
+  return join(std::begin(range), std::end(range), sep);
+}
+
+template <typename T>
+auto join(std::initializer_list<T> list, wstring_view sep)
+    -> join_view<const T*, const T*, wchar_t> {
+  return join(std::begin(list), std::end(list), sep);
+}
+
+template <typename Char, FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+auto vformat(basic_string_view<Char> format_str,
+             basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> std::basic_string<Char> {
+  basic_memory_buffer<Char> buffer;
+  detail::vformat_to(buffer, format_str, args);
+  return to_string(buffer);
+}
+
+// Pass char_t as a default template parameter instead of using
+// std::basic_string<char_t<S>> to reduce the symbol size.
+template <typename S, typename... Args, typename Char = char_t<S>,
+          FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+auto format(const S& format_str, Args&&... args) -> std::basic_string<Char> {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return vformat(to_string_view(format_str), vargs);
+}
+
+template <typename Locale, typename S, typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_locale<Locale>::value&&
+                            detail::is_exotic_char<Char>::value)>
+inline auto vformat(
+    const Locale& loc, const S& format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> std::basic_string<Char> {
+  return detail::vformat(loc, to_string_view(format_str), args);
+}
+
+template <typename Locale, typename S, typename... Args,
+          typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_locale<Locale>::value&&
+                            detail::is_exotic_char<Char>::value)>
+inline auto format(const Locale& loc, const S& format_str, Args&&... args)
+    -> std::basic_string<Char> {
+  return detail::vformat(loc, to_string_view(format_str),
+                         fmt::make_args_checked<Args...>(format_str, args...));
+}
+
+template <typename OutputIt, typename S, typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
+                            detail::is_exotic_char<Char>::value)>
+auto vformat_to(OutputIt out, const S& format_str,
+                basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> OutputIt {
+  auto&& buf = detail::get_buffer<Char>(out);
+  detail::vformat_to(buf, to_string_view(format_str), args);
+  return detail::get_iterator(buf);
+}
+
+template <typename OutputIt, typename S, typename... Args,
+          typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
+                            detail::is_exotic_char<Char>::value)>
+inline auto format_to(OutputIt out, const S& fmt, Args&&... args) -> OutputIt {
+  const auto& vargs = fmt::make_args_checked<Args...>(fmt, args...);
+  return vformat_to(out, to_string_view(fmt), vargs);
+}
+
+template <typename S, typename... Args, typename Char, size_t SIZE,
+          typename Allocator, FMT_ENABLE_IF(detail::is_string<S>::value)>
+FMT_DEPRECATED auto format_to(basic_memory_buffer<Char, SIZE, Allocator>& buf,
+                              const S& format_str, Args&&... args) ->
+    typename buffer_context<Char>::iterator {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  detail::vformat_to(buf, to_string_view(format_str), vargs, {});
+  return detail::buffer_appender<Char>(buf);
+}
+
+template <typename Locale, typename S, typename OutputIt, typename... Args,
+          typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
+                            detail::is_locale<Locale>::value&&
+                                detail::is_exotic_char<Char>::value)>
+inline auto vformat_to(
+    OutputIt out, const Locale& loc, const S& format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) -> OutputIt {
+  auto&& buf = detail::get_buffer<Char>(out);
+  vformat_to(buf, to_string_view(format_str), args, detail::locale_ref(loc));
+  return detail::get_iterator(buf);
+}
+
+template <
+    typename OutputIt, typename Locale, typename S, typename... Args,
+    typename Char = char_t<S>,
+    bool enable = detail::is_output_iterator<OutputIt, Char>::value&&
+        detail::is_locale<Locale>::value&& detail::is_exotic_char<Char>::value>
+inline auto format_to(OutputIt out, const Locale& loc, const S& format_str,
+                      Args&&... args) ->
+    typename std::enable_if<enable, OutputIt>::type {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return vformat_to(out, loc, to_string_view(format_str), vargs);
+}
+
+template <typename OutputIt, typename Char, typename... Args,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
+                            detail::is_exotic_char<Char>::value)>
+inline auto vformat_to_n(
+    OutputIt out, size_t n, basic_string_view<Char> format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> format_to_n_result<OutputIt> {
+  detail::iterator_buffer<OutputIt, Char, detail::fixed_buffer_traits> buf(out,
+                                                                           n);
+  detail::vformat_to(buf, format_str, args);
+  return {buf.out(), buf.count()};
+}
+
+template <typename OutputIt, typename S, typename... Args,
+          typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
+                            detail::is_exotic_char<Char>::value)>
+inline auto format_to_n(OutputIt out, size_t n, const S& fmt,
+                        const Args&... args) -> format_to_n_result<OutputIt> {
+  const auto& vargs = fmt::make_args_checked<Args...>(fmt, args...);
+  return vformat_to_n(out, n, to_string_view(fmt), vargs);
+}
+
+template <typename S, typename... Args, typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_exotic_char<Char>::value)>
+inline auto formatted_size(const S& fmt, Args&&... args) -> size_t {
+  detail::counting_buffer<Char> buf;
+  const auto& vargs = fmt::make_args_checked<Args...>(fmt, args...);
+  detail::vformat_to(buf, to_string_view(fmt), vargs);
+  return buf.count();
+}
+
+inline void vprint(std::FILE* f, wstring_view fmt, wformat_args args) {
+  wmemory_buffer buffer;
+  detail::vformat_to(buffer, fmt, args);
+  buffer.push_back(L'\0');
+  if (std::fputws(buffer.data(), f) == -1)
+    FMT_THROW(system_error(errno, FMT_STRING("cannot write to file")));
+}
+
+inline void vprint(wstring_view fmt, wformat_args args) {
+  vprint(stdout, fmt, args);
+}
+
+template <typename... T>
+void print(std::FILE* f, wformat_string<T...> fmt, T&&... args) {
+  return vprint(f, wstring_view(fmt), fmt::make_wformat_args(args...));
+}
+
+template <typename... T> void print(wformat_string<T...> fmt, T&&... args) {
+  return vprint(wstring_view(fmt), fmt::make_wformat_args(args...));
+}
+
+/**
+  Converts *value* to ``std::wstring`` using the default format for type *T*.
+ */
+template <typename T> inline auto to_wstring(const T& value) -> std::wstring {
+  return format(FMT_STRING(L"{}"), value);
+}
+FMT_MODULE_EXPORT_END
+FMT_END_NAMESPACE
+
+#endif  // FMT_XCHAR_H_
diff --git a/src/fmtlib_format.cpp b/src/fmtlib_format.cpp
index ff95f64d3e..8978dab8bc 100644
--- a/src/fmtlib_format.cpp
+++ b/src/fmtlib_format.cpp
@@ -11,6 +11,52 @@
 FMT_BEGIN_NAMESPACE
 namespace detail {
 
+// DEPRECATED!
+template <typename T = void> struct basic_data {
+  FMT_API static constexpr const char digits[100][2] = {
+      {'0', '0'}, {'0', '1'}, {'0', '2'}, {'0', '3'}, {'0', '4'}, {'0', '5'},
+      {'0', '6'}, {'0', '7'}, {'0', '8'}, {'0', '9'}, {'1', '0'}, {'1', '1'},
+      {'1', '2'}, {'1', '3'}, {'1', '4'}, {'1', '5'}, {'1', '6'}, {'1', '7'},
+      {'1', '8'}, {'1', '9'}, {'2', '0'}, {'2', '1'}, {'2', '2'}, {'2', '3'},
+      {'2', '4'}, {'2', '5'}, {'2', '6'}, {'2', '7'}, {'2', '8'}, {'2', '9'},
+      {'3', '0'}, {'3', '1'}, {'3', '2'}, {'3', '3'}, {'3', '4'}, {'3', '5'},
+      {'3', '6'}, {'3', '7'}, {'3', '8'}, {'3', '9'}, {'4', '0'}, {'4', '1'},
+      {'4', '2'}, {'4', '3'}, {'4', '4'}, {'4', '5'}, {'4', '6'}, {'4', '7'},
+      {'4', '8'}, {'4', '9'}, {'5', '0'}, {'5', '1'}, {'5', '2'}, {'5', '3'},
+      {'5', '4'}, {'5', '5'}, {'5', '6'}, {'5', '7'}, {'5', '8'}, {'5', '9'},
+      {'6', '0'}, {'6', '1'}, {'6', '2'}, {'6', '3'}, {'6', '4'}, {'6', '5'},
+      {'6', '6'}, {'6', '7'}, {'6', '8'}, {'6', '9'}, {'7', '0'}, {'7', '1'},
+      {'7', '2'}, {'7', '3'}, {'7', '4'}, {'7', '5'}, {'7', '6'}, {'7', '7'},
+      {'7', '8'}, {'7', '9'}, {'8', '0'}, {'8', '1'}, {'8', '2'}, {'8', '3'},
+      {'8', '4'}, {'8', '5'}, {'8', '6'}, {'8', '7'}, {'8', '8'}, {'8', '9'},
+      {'9', '0'}, {'9', '1'}, {'9', '2'}, {'9', '3'}, {'9', '4'}, {'9', '5'},
+      {'9', '6'}, {'9', '7'}, {'9', '8'}, {'9', '9'}};
+  FMT_API static constexpr const char hex_digits[] = "0123456789abcdef";
+  FMT_API static constexpr const char signs[4] = {0, '-', '+', ' '};
+  FMT_API static constexpr const char left_padding_shifts[5] = {31, 31, 0, 1,
+                                                                0};
+  FMT_API static constexpr const char right_padding_shifts[5] = {0, 31, 0, 1,
+                                                                 0};
+  FMT_API static constexpr const unsigned prefixes[4] = {0, 0, 0x1000000u | '+',
+                                                         0x1000000u | ' '};
+};
+
+#ifdef FMT_SHARED
+// Required for -flto, -fivisibility=hidden and -shared to work
+extern template struct basic_data<void>;
+#endif
+
+#if __cplusplus < 201703L
+// DEPRECATED! These are here only for ABI compatiblity.
+template <typename T> constexpr const char basic_data<T>::digits[][2];
+template <typename T> constexpr const char basic_data<T>::hex_digits[];
+template <typename T> constexpr const char basic_data<T>::signs[];
+template <typename T> constexpr const char basic_data<T>::left_padding_shifts[];
+template <typename T>
+constexpr const char basic_data<T>::right_padding_shifts[];
+template <typename T> constexpr const unsigned basic_data<T>::prefixes[];
+#endif
+
 template <typename T>
 int format_float(char* buf, std::size_t size, const char* format, int precision,
                  T value) {
@@ -29,35 +75,8 @@ template FMT_API dragonbox::decimal_fp<float> dragonbox::to_decimal(float x)
     FMT_NOEXCEPT;
 template FMT_API dragonbox::decimal_fp<double> dragonbox::to_decimal(double x)
     FMT_NOEXCEPT;
-
-// DEPRECATED! This function exists for ABI compatibility.
-template <typename Char>
-typename basic_format_context<std::back_insert_iterator<buffer<Char>>,
-                              Char>::iterator
-vformat_to(buffer<Char>& buf, basic_string_view<Char> format_str,
-           basic_format_args<basic_format_context<
-               std::back_insert_iterator<buffer<type_identity_t<Char>>>,
-               type_identity_t<Char>>>
-               args) {
-  using iterator = std::back_insert_iterator<buffer<char>>;
-  using context = basic_format_context<
-      std::back_insert_iterator<buffer<type_identity_t<Char>>>,
-      type_identity_t<Char>>;
-  auto out = iterator(buf);
-  format_handler<iterator, Char, context> h(out, format_str, args, {});
-  parse_format_string<false>(format_str, h);
-  return out;
-}
-template basic_format_context<std::back_insert_iterator<buffer<char>>,
-                              char>::iterator
-vformat_to(buffer<char>&, string_view,
-           basic_format_args<basic_format_context<
-               std::back_insert_iterator<buffer<type_identity_t<char>>>,
-               type_identity_t<char>>>);
 }  // namespace detail
 
-template struct FMT_INSTANTIATION_DEF_API detail::basic_data<void>;
-
 // Workaround a bug in MSVC2013 that prevents instantiation of format_float.
 int (*instantiate_format_float)(double, int, detail::float_specs,
                                 detail::buffer<char>&) = detail::format_float;
@@ -69,12 +88,15 @@ template FMT_API std::locale detail::locale_ref::get<std::locale>() const;
 
 // Explicit instantiations for char.
 
-template FMT_API std::string detail::grouping_impl<char>(locale_ref);
-template FMT_API char detail::thousands_sep_impl(locale_ref);
+template FMT_API auto detail::thousands_sep_impl(locale_ref)
+    -> thousands_sep_result<char>;
 template FMT_API char detail::decimal_point_impl(locale_ref);
 
 template FMT_API void detail::buffer<char>::append(const char*, const char*);
 
+// DEPRECATED!
+// There is no correspondent extern template in format.h because of
+// incompatibility between clang and gcc (#2377).
 template FMT_API void detail::vformat_to(
     detail::buffer<char>&, string_view,
     basic_format_args<FMT_BUFFER_CONTEXT(char)>, detail::locale_ref);
@@ -91,10 +113,13 @@ template FMT_API int detail::format_float(long double, int, detail::float_specs,
 
 // Explicit instantiations for wchar_t.
 
-template FMT_API std::string detail::grouping_impl<wchar_t>(locale_ref);
-template FMT_API wchar_t detail::thousands_sep_impl(locale_ref);
+template FMT_API auto detail::thousands_sep_impl(locale_ref)
+    -> thousands_sep_result<wchar_t>;
 template FMT_API wchar_t detail::decimal_point_impl(locale_ref);
 
 template FMT_API void detail::buffer<wchar_t>::append(const wchar_t*,
                                                       const wchar_t*);
+
+template struct detail::basic_data<void>;
+
 FMT_END_NAMESPACE
diff --git a/src/fmtlib_os.cpp b/src/fmtlib_os.cpp
index 8cb2bf5e35..d227ce44ee 100644
--- a/src/fmtlib_os.cpp
+++ b/src/fmtlib_os.cpp
@@ -26,21 +26,18 @@
 #      define WIN32_LEAN_AND_MEAN
 #    endif
 #    include <io.h>
-#    include <windows.h>
-
-#    define O_CREAT _O_CREAT
-#    define O_TRUNC _O_TRUNC
 
 #    ifndef S_IRUSR
 #      define S_IRUSR _S_IREAD
 #    endif
-
 #    ifndef S_IWUSR
 #      define S_IWUSR _S_IWRITE
 #    endif
-
-#    ifdef __MINGW32__
-#      define _SH_DENYNO 0x40
+#    ifndef S_IRGRP
+#      define S_IRGRP 0
+#    endif
+#    ifndef S_IROTH
+#      define S_IROTH 0
 #    endif
 #  endif  // _WIN32
 #endif    // FMT_USE_FCNTL
@@ -56,7 +53,7 @@
 namespace {
 #ifdef _WIN32
 // Return type of read and write functions.
-using RWResult = int;
+using rwresult = int;
 
 // On Windows the count argument to read and write is unsigned, so convert
 // it from size_t preventing integer overflow.
@@ -65,7 +62,7 @@ inline unsigned convert_rwcount(std::size_t count) {
 }
 #elif FMT_USE_FCNTL
 // Return type of read and write functions.
-using RWResult = ssize_t;
+using rwresult = ssize_t;
 
 inline std::size_t convert_rwcount(std::size_t count) { return count; }
 #endif
@@ -74,14 +71,14 @@ inline std::size_t convert_rwcount(std::size_t count) { return count; }
 FMT_BEGIN_NAMESPACE
 
 #ifdef _WIN32
-detail::utf16_to_utf8::utf16_to_utf8(wstring_view s) {
+detail::utf16_to_utf8::utf16_to_utf8(basic_string_view<wchar_t> s) {
   if (int error_code = convert(s)) {
     FMT_THROW(windows_error(error_code,
                             "cannot convert string from UTF-16 to UTF-8"));
   }
 }
 
-int detail::utf16_to_utf8::convert(wstring_view s) {
+int detail::utf16_to_utf8::convert(basic_string_view<wchar_t> s) {
   if (s.size() > INT_MAX) return ERROR_INVALID_PARAMETER;
   int s_size = static_cast<int>(s.size());
   if (s_size == 0) {
@@ -102,46 +99,85 @@ int detail::utf16_to_utf8::convert(wstring_view s) {
   return 0;
 }
 
-void windows_error::init(int err_code, string_view format_str,
-                         format_args args) {
-  error_code_ = err_code;
-  memory_buffer buffer;
-  detail::format_windows_error(buffer, err_code, vformat(format_str, args));
-  std::runtime_error& base = *this;
-  base = std::runtime_error(to_string(buffer));
+namespace detail {
+
+class system_message {
+  system_message(const system_message&) = delete;
+  void operator=(const system_message&) = delete;
+
+  unsigned long result_;
+  wchar_t* message_;
+
+  static bool is_whitespace(wchar_t c) FMT_NOEXCEPT {
+    return c == L' ' || c == L'\n' || c == L'\r' || c == L'\t' || c == L'\0';
+  }
+
+ public:
+  explicit system_message(unsigned long error_code)
+      : result_(0), message_(nullptr) {
+    result_ = FormatMessageW(
+        FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
+            FORMAT_MESSAGE_IGNORE_INSERTS,
+        nullptr, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+        reinterpret_cast<wchar_t*>(&message_), 0, nullptr);
+    if (result_ != 0) {
+      while (result_ != 0 && is_whitespace(message_[result_ - 1])) {
+        --result_;
+      }
+    }
+  }
+  ~system_message() { LocalFree(message_); }
+  explicit operator bool() const FMT_NOEXCEPT { return result_ != 0; }
+  operator basic_string_view<wchar_t>() const FMT_NOEXCEPT {
+    return basic_string_view<wchar_t>(message_, result_);
+  }
+};
+
+class utf8_system_category final : public std::error_category {
+ public:
+  const char* name() const FMT_NOEXCEPT override { return "system"; }
+  std::string message(int error_code) const override {
+    system_message msg(error_code);
+    if (msg) {
+      utf16_to_utf8 utf8_message;
+      if (utf8_message.convert(msg) == ERROR_SUCCESS) {
+        return utf8_message.str();
+      }
+    }
+    return "unknown error";
+  }
+};
+
+}  // namespace detail
+
+FMT_API const std::error_category& system_category() FMT_NOEXCEPT {
+  static const detail::utf8_system_category category;
+  return category;
+}
+
+std::system_error vwindows_error(int err_code, string_view format_str,
+                                 format_args args) {
+  auto ec = std::error_code(err_code, system_category());
+  return std::system_error(ec, vformat(format_str, args));
 }
 
 void detail::format_windows_error(detail::buffer<char>& out, int error_code,
-                                  string_view message) FMT_NOEXCEPT {
+                                  const char* message) FMT_NOEXCEPT {
   FMT_TRY {
-    wmemory_buffer buf;
-    buf.resize(inline_buffer_size);
-    for (;;) {
-      wchar_t* system_message = &buf[0];
-      int result = FormatMessageW(
-          FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr,
-          error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), system_message,
-          static_cast<uint32_t>(buf.size()), nullptr);
-      if (result != 0) {
-        utf16_to_utf8 utf8_message;
-        if (utf8_message.convert(system_message) == ERROR_SUCCESS) {
-          format_to(buffer_appender<char>(out), "{}: {}", message,
-                    utf8_message);
-          return;
-        }
-        break;
+    system_message msg(error_code);
+    if (msg) {
+      utf16_to_utf8 utf8_message;
+      if (utf8_message.convert(msg) == ERROR_SUCCESS) {
+        format_to(buffer_appender<char>(out), "{}: {}", message, utf8_message);
+        return;
       }
-      if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
-        break;  // Can't get error message, report error code instead.
-      buf.resize(buf.size() * 2);
     }
   }
   FMT_CATCH(...) {}
   format_error_code(out, error_code, message);
 }
 
-void report_windows_error(int error_code,
-                          fmt::string_view message) FMT_NOEXCEPT {
+void report_windows_error(int error_code, const char* message) FMT_NOEXCEPT {
   report_error(detail::format_windows_error, error_code, message);
 }
 #endif  // _WIN32
@@ -176,7 +212,10 @@ int buffered_file::fileno() const {
 
 #if FMT_USE_FCNTL
 file::file(cstring_view path, int oflag) {
-  int mode = S_IRUSR | S_IWUSR;
+#  ifdef _WIN32
+  using mode_t = int;
+#  endif
+  mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
 #  if defined(_WIN32) && !defined(__MINGW32__)
   fd_ = -1;
   FMT_POSIX_CALL(sopen_s(&fd_, path.c_str(), oflag, _SH_DENYNO, mode));
@@ -230,14 +269,14 @@ long long file::size() const {
 }
 
 std::size_t file::read(void* buffer, std::size_t count) {
-  RWResult result = 0;
+  rwresult result = 0;
   FMT_RETRY(result, FMT_POSIX_CALL(read(fd_, buffer, convert_rwcount(count))));
   if (result < 0) FMT_THROW(system_error(errno, "cannot read from file"));
   return detail::to_unsigned(result);
 }
 
 std::size_t file::write(const void* buffer, std::size_t count) {
-  RWResult result = 0;
+  rwresult result = 0;
   FMT_RETRY(result, FMT_POSIX_CALL(write(fd_, buffer, convert_rwcount(count))));
   if (result < 0) FMT_THROW(system_error(errno, "cannot write to file"));
   return detail::to_unsigned(result);
@@ -261,10 +300,10 @@ void file::dup2(int fd) {
   }
 }
 
-void file::dup2(int fd, error_code& ec) FMT_NOEXCEPT {
+void file::dup2(int fd, std::error_code& ec) FMT_NOEXCEPT {
   int result = 0;
   FMT_RETRY(result, FMT_POSIX_CALL(dup2(fd_, fd)));
-  if (result == -1) ec = error_code(errno);
+  if (result == -1) ec = std::error_code(errno, std::generic_category());
 }
 
 void file::pipe(file& read_end, file& write_end) {
diff --git a/src/info.cpp b/src/info.cpp
index 7e8dbd37b9..297633cd9d 100644
--- a/src/info.cpp
+++ b/src/info.cpp
@@ -1059,8 +1059,9 @@ static void print_columns(FILE *fp, std::map<std::string, ValueType> *styles)
   for (typename std::map<std::string, ValueType>::iterator it = styles->begin(); it != styles->end(); ++it) {
     const std::string &style_name = it->first;
 
-    // skip "secret" styles
-    if (isupper(style_name[0])) continue;
+    // skip "internal" styles
+    if (isupper(style_name[0]) || utils::strmatch(style_name,"/kk/host$")
+        || utils::strmatch(style_name,"/kk/device$")) continue;
 
     int len = style_name.length();
     if (pos + len > 80) {
diff --git a/src/input.cpp b/src/input.cpp
index 08f62bdb42..30424ad5cb 100644
--- a/src/input.cpp
+++ b/src/input.cpp
@@ -27,9 +27,11 @@
 #include "dihedral.h"
 #include "domain.h"
 #include "error.h"
+#include "fix.h"
 #include "force.h"
 #include "group.h"
 #include "improper.h"
+#include "integrate.h"
 #include "kspace.h"
 #include "memory.h"
 #include "min.h"
@@ -1836,8 +1838,25 @@ void Input::timer_command()
 void Input::timestep()
 {
   if (narg != 1) error->all(FLERR,"Illegal timestep command");
+
+  update->update_time();
   update->dt = utils::numeric(FLERR,arg[0],false,lmp);
   update->dt_default = 0;
+
+  // timestep command can be invoked between runs or by run every
+  // calls to other classes that need to know timestep size changed
+  // similar logic is in FixDtReset::end_of_step()
+  // only need to do this if a run has already occurred
+
+  if (update->first_update == 0) return;
+
+  int respaflag = 0;
+  if (utils::strmatch(update->integrate_style, "^respa")) respaflag = 1;
+  if (respaflag) update->integrate->reset_dt();
+
+  if (force->pair) force->pair->reset_dt();
+  for (int i = 0; i < modify->nfix; i++) modify->fix[i]->reset_dt();
+  output->reset_dt();
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/integrate.cpp b/src/integrate.cpp
index 52d34fb943..256291ed3b 100644
--- a/src/integrate.cpp
+++ b/src/integrate.cpp
@@ -20,6 +20,7 @@
 #include "kspace.h"
 #include "modify.h"
 #include "pair.h"
+#include "output.h"
 #include "update.h"
 
 using namespace LAMMPS_NS;
@@ -115,7 +116,12 @@ void Integrate::ev_setup()
 
 /* ----------------------------------------------------------------------
    set eflag,vflag for current iteration
-   based on computes that need energy/virial info on this timestep
+   based on
+     (1) computes that need energy/virial info on this timestep
+     (2) time dumps that may need per-atom compute info on this timestep
+     NOTE: inefficient to add all per-atom eng/virial computes
+             but don't know which ones the dump needs
+           see NOTE in output.cpp
    invoke matchstep() on all timestep-dependent computes to clear their arrays
    eflag: set any or no bits
      ENERGY_GLOBAL bit for global energy
@@ -133,6 +139,10 @@ void Integrate::ev_set(bigint ntimestep)
 {
   int i,flag;
 
+  int tdflag = 0;
+  if (output->any_time_dumps &&
+      output->next_time_dump_any == ntimestep) tdflag = 1;
+
   flag = 0;
   int eflag_global = 0;
   for (i = 0; i < nelist_global; i++)
@@ -143,7 +153,7 @@ void Integrate::ev_set(bigint ntimestep)
   int eflag_atom = 0;
   for (i = 0; i < nelist_atom; i++)
     if (elist_atom[i]->matchstep(ntimestep)) flag = 1;
-  if (flag) eflag_atom = ENERGY_ATOM;
+  if (flag || (tdflag && nelist_atom)) eflag_atom = ENERGY_ATOM;
 
   if (eflag_global) update->eflag_global = ntimestep;
   if (eflag_atom) update->eflag_atom = ntimestep;
@@ -159,13 +169,13 @@ void Integrate::ev_set(bigint ntimestep)
   int vflag_atom = 0;
   for (i = 0; i < nvlist_atom; i++)
     if (vlist_atom[i]->matchstep(ntimestep)) flag = 1;
-  if (flag) vflag_atom = VIRIAL_ATOM;
+  if (flag || (tdflag && nvlist_atom)) vflag_atom = VIRIAL_ATOM;
 
   flag = 0;
   int cvflag_atom = 0;
   for (i = 0; i < ncvlist_atom; i++)
     if (cvlist_atom[i]->matchstep(ntimestep)) flag = 1;
-  if (flag) cvflag_atom = VIRIAL_CENTROID;
+  if (flag || (tdflag && ncvlist_atom)) cvflag_atom = VIRIAL_CENTROID;
 
   if (vflag_global) update->vflag_global = ntimestep;
   if (vflag_atom || cvflag_atom) update->vflag_atom = ntimestep;
diff --git a/src/lammps.cpp b/src/lammps.cpp
index 0829c4fdbd..55b7755c83 100644
--- a/src/lammps.cpp
+++ b/src/lammps.cpp
@@ -1305,12 +1305,14 @@ void _noopt LAMMPS::help()
 
 /* ----------------------------------------------------------------------
    print style names in columns
-   skip any style that starts with upper-case letter, since internal
+   skip any internal style that starts with an upper-case letter
+   also skip "redundant" KOKKOS styles ending in kk/host or kk/device
 ------------------------------------------------------------------------- */
 
 void print_style(FILE *fp, const char *str, int &pos)
 {
-  if (isupper(str[0])) return;
+  if (isupper(str[0]) || utils::strmatch(str,"/kk/host$")
+      || utils::strmatch(str,"/kk/device$")) return;
 
   int len = strlen(str);
   if (pos+len > 80) {
diff --git a/src/min_linesearch.cpp b/src/min_linesearch.cpp
index c4c10d8a1f..2e7066daed 100644
--- a/src/min_linesearch.cpp
+++ b/src/min_linesearch.cpp
@@ -328,7 +328,7 @@ int MinLineSearch::linemin_quadratic(double eoriginal, double &alpha)
   int i,m,n;
   double fdothall,fdothme,hme,hmax,hmaxall;
   double de_ideal,de;
-  double delfh,engprev,relerr,alphaprev,fhprev,ff,fh,alpha0;
+  double delfh,engprev,relerr,alphaprev,fhprev,fh,alpha0;
   double dot[2],dotall[2];
   double *xatom,*x0atom,*fatom,*hatom;
   double alphamax;
@@ -439,12 +439,8 @@ int MinLineSearch::linemin_quadratic(double eoriginal, double &alpha)
         dotall[1] += fextra[i]*hextra[i];
       }
     }
-    ff = dotall[0];
     fh = dotall[1];
-    if (output->thermo->normflag) {
-      ff /= atom->natoms;
-      fh /= atom->natoms;
-    }
+    if (output->thermo->normflag) fh /= atom->natoms;
 
     delfh = fh - fhprev;
 
diff --git a/src/output.cpp b/src/output.cpp
index 9e29caf118..a8fda4173d 100644
--- a/src/output.cpp
+++ b/src/output.cpp
@@ -12,6 +12,10 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
+/* ----------------------------------------------------------------------
+   Contributing author: Michal Kanski (Jagiellonian U) for simulation time dumps
+------------------------------------------------------------------------- */
+
 #include "output.h"
 #include "style_dump.h"         // IWYU pragma: keep
 
@@ -29,11 +33,13 @@
 #include "variable.h"
 #include "write_restart.h"
 
+#include <cmath>
 #include <cstring>
 
 using namespace LAMMPS_NS;
 
 #define DELTA 1
+#define EPSDT 1.0e-6
 
 /* ----------------------------------------------------------------------
    initialize all output
@@ -59,8 +65,11 @@ Output::Output(LAMMPS *lmp) : Pointers(lmp)
 
   ndump = 0;
   max_dump = 0;
+  mode_dump = nullptr;
   every_dump = nullptr;
+  every_time_dump = nullptr;
   next_dump = nullptr;
+  next_time_dump = nullptr;
   last_dump = nullptr;
   var_dump = nullptr;
   ivar_dump = nullptr;
@@ -92,8 +101,11 @@ Output::~Output()
   if (thermo) delete thermo;
   delete [] var_thermo;
 
+  memory->destroy(mode_dump);
   memory->destroy(every_dump);
+  memory->destroy(every_time_dump);
   memory->destroy(next_dump);
+  memory->destroy(next_time_dump);
   memory->destroy(last_dump);
   for (int i = 0; i < ndump; i++) delete [] var_dump[i];
   memory->sfree(var_dump);
@@ -125,14 +137,18 @@ void Output::init()
   }
 
   for (int i = 0; i < ndump; i++) dump[i]->init();
-  for (int i = 0; i < ndump; i++)
-    if (every_dump[i] == 0) {
+  any_time_dumps = 0;
+  for (int i = 0; i < ndump; i++) {
+    if (mode_dump[i]) any_time_dumps = 1;
+    if ((mode_dump[i] == 0 && every_dump[i] == 0) ||
+        (mode_dump[i] == 1 && every_time_dump[i] == 0.0)) {
       ivar_dump[i] = input->variable->find(var_dump[i]);
       if (ivar_dump[i] < 0)
-        error->all(FLERR,"Variable name for dump every does not exist");
+        error->all(FLERR,"Variable name for dump every or delta does not exist");
       if (!input->variable->equalstyle(ivar_dump[i]))
-        error->all(FLERR,"Variable for dump every is invalid style");
+        error->all(FLERR,"Variable for dump every or delta is invalid style");
     }
+  }
 
   if (restart_flag_single && restart_every_single == 0) {
     ivar_restart_single = input->variable->find(var_restart_single);
@@ -161,368 +177,545 @@ void Output::setup(int memflag)
 {
   bigint ntimestep = update->ntimestep;
 
-  // perform dump at start of run only if:
-  //   current timestep is multiple of every and last dump not >= this step
-  //   this is first run after dump created and firstflag is set
-  //   note that variable freq will not write unless triggered by firstflag
-  // set next_dump to multiple of every or variable value
-  // set next_dump_any to smallest next_dump
-  // wrap dumps that invoke computes and variable eval with clear/add
-  // if dump not written now, use addstep_compute_all() since don't know
-  //   what computes the dump write would invoke
-  // if no dumps, set next_dump_any to last+1 so will not influence next
-
-  int writeflag;
+  // consider all dumps
+  // decide whether to write snapshot and/or calculate next step for dump
 
   if (ndump && update->restrict_output == 0) {
+    next_time_dump_any = MAXBIGINT;
+
     for (int idump = 0; idump < ndump; idump++) {
-      if (dump[idump]->clearstep || every_dump[idump] == 0)
+
+      // wrap step dumps that invoke computes or do variable eval with clear/add
+      // see NOTE in write() about also wrapping time dumps
+
+      if (mode_dump[idump] == 0 &&
+          (dump[idump]->clearstep || var_dump[idump]))
         modify->clearstep_compute();
-      writeflag = 0;
-      if (every_dump[idump] && ntimestep % every_dump[idump] == 0 &&
-          last_dump[idump] != ntimestep) writeflag = 1;
+
+      // write a snapshot at setup only if any of these 3 conditions hold
+      // (1) this is first run since dump was created and its first_flag = 0
+      // (2) mode_dump = 0 and timestep is multiple of every_dump
+      // (3) mode_dump = 1 and time is multiple of every_time_dump (within EPSDT)
+      // (2) and (3) only apply for non-variable dump intervals
+      // finally, do not write if same snapshot written previously,
+      //   i.e. on last timestep of previous run
+
+      int writeflag = 0;
+
       if (last_dump[idump] < 0 && dump[idump]->first_flag == 1) writeflag = 1;
 
+      if (mode_dump[idump] == 0) {
+        if (every_dump[idump] && (ntimestep % every_dump[idump] == 0))
+          writeflag = 1;
+      } else {
+        if (every_time_dump[idump] > 0.0) {
+          double tcurrent = update->atime +
+            (ntimestep - update->atimestep) * update->dt;
+          double remainder = fmod(tcurrent,every_time_dump[idump]);
+          if ((remainder < EPSDT*update->dt) ||
+              (every_time_dump[idump] - remainder < EPSDT*update->dt))
+            writeflag = 1;
+        }
+      }
+
+      if (last_dump[idump] == ntimestep) writeflag = 0;
+
+      // perform dump
+
       if (writeflag) {
         dump[idump]->write();
         last_dump[idump] = ntimestep;
       }
-      if (every_dump[idump])
-        next_dump[idump] =
-          (ntimestep/every_dump[idump])*every_dump[idump] + every_dump[idump];
-      else {
-        bigint nextdump = static_cast<bigint>
-          (input->variable->compute_equal(ivar_dump[idump]));
-        if (nextdump <= ntimestep)
-          error->all(FLERR,"Dump every variable returned a bad timestep");
-        next_dump[idump] = nextdump;
-      }
-      if (dump[idump]->clearstep || every_dump[idump] == 0) {
+
+      // calculate timestep and/or time for next dump
+      // set next_dump and next_time_dump, 0 arg for setup()
+      // only do this if dump written or dump has not been written yet
+
+      if (writeflag || last_dump[idump] < 0)
+        calculate_next_dump(0,idump,ntimestep);
+
+      // if dump not written now, use addstep_compute_all()
+      // since don't know what computes the dump will invoke
+
+      if (mode_dump[idump] == 0 &&
+          (dump[idump]->clearstep || var_dump[idump])) {
         if (writeflag) modify->addstep_compute(next_dump[idump]);
         else modify->addstep_compute_all(next_dump[idump]);
       }
+
+      if (mode_dump[idump] && (dump[idump]->clearstep || var_dump[idump]))
+        next_time_dump_any = MIN(next_time_dump_any,next_dump[idump]);
       if (idump) next_dump_any = MIN(next_dump_any,next_dump[idump]);
       else next_dump_any = next_dump[0];
     }
+
+  // if no dumps, set next_dump_any to last+1 so will not influence next
+
   } else next_dump_any = update->laststep + 1;
 
-  // do not write restart files at start of run
-  // set next_restart values to multiple of every or variable value
-  // wrap variable eval with clear/add
-  // if no restarts, set next_restart to last+1 so will not influence next
+   // do not write restart files at start of run
+   // set next_restart values to multiple of every or variable value
+   // wrap variable eval with clear/add
+   // if no restarts, set next_restart to last+1 so will not influence next
 
-  if (restart_flag && update->restrict_output == 0) {
-    if (restart_flag_single) {
-      if (restart_every_single)
-        next_restart_single =
-          (ntimestep/restart_every_single)*restart_every_single +
-          restart_every_single;
-      else {
-        bigint nextrestart = static_cast<bigint>
-          (input->variable->compute_equal(ivar_restart_single));
-        if (nextrestart <= ntimestep)
-          error->all(FLERR,"Restart variable returned a bad timestep");
-        next_restart_single = nextrestart;
-      }
-    } else next_restart_single = update->laststep + 1;
-    if (restart_flag_double) {
-      if (restart_every_double)
-        next_restart_double =
-          (ntimestep/restart_every_double)*restart_every_double +
-          restart_every_double;
-      else {
-        bigint nextrestart = static_cast<bigint>
-          (input->variable->compute_equal(ivar_restart_double));
-        if (nextrestart <= ntimestep)
-          error->all(FLERR,"Restart variable returned a bad timestep");
-        next_restart_double = nextrestart;
-      }
-    } else next_restart_double = update->laststep + 1;
-    next_restart = MIN(next_restart_single,next_restart_double);
-  } else next_restart = update->laststep + 1;
+   if (restart_flag && update->restrict_output == 0) {
+     if (restart_flag_single) {
+       if (restart_every_single)
+         next_restart_single =
+           (ntimestep/restart_every_single)*restart_every_single +
+           restart_every_single;
+       else {
+         bigint nextrestart = static_cast<bigint>
+           (input->variable->compute_equal(ivar_restart_single));
+         if (nextrestart <= ntimestep)
+           error->all(FLERR,"Restart variable returned a bad timestep");
+         next_restart_single = nextrestart;
+       }
+     } else next_restart_single = update->laststep + 1;
+     if (restart_flag_double) {
+       if (restart_every_double)
+         next_restart_double =
+           (ntimestep/restart_every_double)*restart_every_double +
+           restart_every_double;
+       else {
+         bigint nextrestart = static_cast<bigint>
+           (input->variable->compute_equal(ivar_restart_double));
+         if (nextrestart <= ntimestep)
+           error->all(FLERR,"Restart variable returned a bad timestep");
+         next_restart_double = nextrestart;
+       }
+     } else next_restart_double = update->laststep + 1;
+     next_restart = MIN(next_restart_single,next_restart_double);
+   } else next_restart = update->laststep + 1;
 
-  // print memory usage unless being called between multiple runs
+   // print memory usage unless being called between multiple runs
 
-  if (memflag) memory_usage();
+   if (memflag) memory_usage();
 
-  // set next_thermo to multiple of every or variable eval if var defined
-  // insure thermo output on last step of run
-  // thermo may invoke computes so wrap with clear/add
+   // set next_thermo to multiple of every or variable eval if var defined
+   // insure thermo output on last step of run
+   // thermo may invoke computes so wrap with clear/add
 
-  modify->clearstep_compute();
+   modify->clearstep_compute();
 
-  thermo->header();
-  thermo->compute(0);
-  last_thermo = ntimestep;
+   thermo->header();
+   thermo->compute(0);
+   last_thermo = ntimestep;
 
-  if (var_thermo) {
-    next_thermo = static_cast<bigint>
-      (input->variable->compute_equal(ivar_thermo));
-    if (next_thermo <= ntimestep)
-      error->all(FLERR,"Thermo every variable returned a bad timestep");
-  } else if (thermo_every) {
-    next_thermo = (ntimestep/thermo_every)*thermo_every + thermo_every;
-    next_thermo = MIN(next_thermo,update->laststep);
-  } else next_thermo = update->laststep;
+   if (var_thermo) {
+     next_thermo = static_cast<bigint>
+       (input->variable->compute_equal(ivar_thermo));
+     if (next_thermo <= ntimestep)
+       error->all(FLERR,"Thermo every variable returned a bad timestep");
+   } else if (thermo_every) {
+     next_thermo = (ntimestep/thermo_every)*thermo_every + thermo_every;
+     next_thermo = MIN(next_thermo,update->laststep);
+   } else next_thermo = update->laststep;
 
-  modify->addstep_compute(next_thermo);
+   modify->addstep_compute(next_thermo);
 
-  // next = next timestep any output will be done
+   // next = next timestep any output will be done
 
-  next = MIN(next_dump_any,next_restart);
-  next = MIN(next,next_thermo);
+   next = MIN(next_dump_any,next_restart);
+   next = MIN(next,next_thermo);
+ }
+
+ /* ----------------------------------------------------------------------
+    perform all output for this timestep
+    only perform output if next matches current step and last output doesn't
+    do dump/restart before thermo so thermo CPU time will include them
+ ------------------------------------------------------------------------- */
+
+ void Output::write(bigint ntimestep)
+ {
+   // perform dump if its next_dump = current ntimestep
+   //   but not if it was already written on this step
+   // set next_dump and also next_time_dump for mode_dump = 1
+   // set next_dump_any to smallest next_dump
+   // wrap step dumps that invoke computes or do variable eval with clear/add
+   // NOTE:
+   //   not wrapping time dumps means that Integrate::ev_set()
+   //     needs to trigger all per-atom eng/virial computes
+   //     on a timestep where any time dump will be output
+   //   could wrap time dumps as well, if timestep size did not vary
+   //   if wrap when timestep size varies frequently,
+   //     then can do many unneeded addstep() --> inefficient
+   //   hard to know if timestep varies, since run every could change it
+   //   can't remove an uneeded addstep from a compute, b/c don't know
+   //     what other command may have added it
+
+   if (next_dump_any == ntimestep) {
+
+     for (int idump = 0; idump < ndump; idump++) {
+       next_time_dump_any = MAXBIGINT;
+
+       if (next_dump[idump] == ntimestep) {
+         if (last_dump[idump] == ntimestep) continue;
+
+         if (mode_dump[idump] == 0 &&
+             (dump[idump]->clearstep || var_dump[idump]))
+           modify->clearstep_compute();
+
+         // perform dump
+         // reset next_dump and next_time_dump, 1 arg for write()
+
+         dump[idump]->write();
+         last_dump[idump] = ntimestep;
+         calculate_next_dump(1,idump,ntimestep);
+
+         if (mode_dump[idump] == 0 &&
+             (dump[idump]->clearstep || var_dump[idump]))
+           modify->addstep_compute(next_dump[idump]);
+       }
+
+       if (mode_dump[idump] && (dump[idump]->clearstep || var_dump[idump]))
+         next_time_dump_any = MIN(next_time_dump_any,next_dump[idump]);
+       if (idump) next_dump_any = MIN(next_dump_any,next_dump[idump]);
+       else next_dump_any = next_dump[0];
+     }
+   }
+
+   // next_restart does not force output on last step of run
+   // for toggle = 0, replace "*" with current timestep in restart filename
+   // next restart variable may invoke computes so wrap with clear/add
+
+   if (next_restart == ntimestep) {
+     if (next_restart_single == ntimestep) {
+
+       std::string file = restart1;
+       std::size_t found = file.find('*');
+       if (found != std::string::npos)
+         file.replace(found,1,fmt::format("{}",update->ntimestep));
+
+       if (last_restart != ntimestep) restart->write(file);
+
+       if (restart_every_single) next_restart_single += restart_every_single;
+       else {
+         modify->clearstep_compute();
+         bigint nextrestart = static_cast<bigint>
+           (input->variable->compute_equal(ivar_restart_single));
+         if (nextrestart <= ntimestep)
+           error->all(FLERR,"Restart variable returned a bad timestep");
+         next_restart_single = nextrestart;
+         modify->addstep_compute(next_restart_single);
+       }
+     }
+     if (next_restart_double == ntimestep) {
+       if (last_restart != ntimestep) {
+         if (restart_toggle == 0) {
+           restart->write(restart2a);
+           restart_toggle = 1;
+         } else {
+           restart->write(restart2b);
+           restart_toggle = 0;
+         }
+       }
+       if (restart_every_double) next_restart_double += restart_every_double;
+       else {
+         modify->clearstep_compute();
+         bigint nextrestart = static_cast<bigint>
+           (input->variable->compute_equal(ivar_restart_double));
+         if (nextrestart <= ntimestep)
+           error->all(FLERR,"Restart variable returned a bad timestep");
+         next_restart_double = nextrestart;
+         modify->addstep_compute(next_restart_double);
+       }
+     }
+     last_restart = ntimestep;
+     next_restart = MIN(next_restart_single,next_restart_double);
+   }
+
+   // insure next_thermo forces output on last step of run
+   // thermo may invoke computes so wrap with clear/add
+
+   if (next_thermo == ntimestep) {
+     modify->clearstep_compute();
+     if (last_thermo != ntimestep) thermo->compute(1);
+     last_thermo = ntimestep;
+     if (var_thermo) {
+       next_thermo = static_cast<bigint>
+         (input->variable->compute_equal(ivar_thermo));
+       if (next_thermo <= ntimestep)
+         error->all(FLERR,"Thermo every variable returned a bad timestep");
+     } else if (thermo_every) next_thermo += thermo_every;
+     else next_thermo = update->laststep;
+     next_thermo = MIN(next_thermo,update->laststep);
+     modify->addstep_compute(next_thermo);
+   }
+
+   // next = next timestep any output will be done
+
+   next = MIN(next_dump_any,next_restart);
+   next = MIN(next,next_thermo);
+ }
+
+ /* ----------------------------------------------------------------------
+    force a snapshot to be written for all dumps
+    called from PRD and TAD
+ ------------------------------------------------------------------------- */
+
+ void Output::write_dump(bigint ntimestep)
+ {
+   for (int idump = 0; idump < ndump; idump++) {
+     dump[idump]->write();
+     last_dump[idump] = ntimestep;
+   }
+ }
+
+ /* ----------------------------------------------------------------------
+    calculate when next dump occurs for Dump instance idump
+    operates in one of two modes, based on mode_dump flag
+    for timestep mode, set next_dump
+    for simulation time mode, set next_time_dump and next_dump
+    which flag depends on caller
+      0 = from setup() at start of run
+      1 = from write() during run each time a dump file is written
+      2 = from reset_dt() called from fix dt/reset when it changes timestep size
+ ------------------------------------------------------------------------- */
+
+  void Output::calculate_next_dump(int which, int idump, bigint ntimestep)
+ {
+   // dump mode is by timestep
+   // just set next_dump
+
+   if (mode_dump[idump] == 0) {
+
+     if (every_dump[idump]) {
+
+       // which = 0: nextdump = next multiple of every_dump
+       // which = 1: increment nextdump by every_dump
+
+       if (which == 0)
+         next_dump[idump] =
+           (ntimestep/every_dump[idump])*every_dump[idump] + every_dump[idump];
+       else if (which == 1)
+         next_dump[idump] += every_dump[idump];
+
+     } else {
+       next_dump[idump] = static_cast<bigint>
+         (input->variable->compute_equal(ivar_dump[idump]));
+       if (next_dump[idump] <= ntimestep)
+         error->all(FLERR,"Dump every variable returned a bad timestep");
+     }
+
+   // dump mode is by simulation time
+   // set next_time_dump and next_dump
+
+   } else {
+
+     bigint nextdump;
+     double nexttime;
+     double tcurrent = update->atime +
+       (ntimestep - update->atimestep) * update->dt;
+
+     if (every_time_dump[idump] > 0.0) {
+
+       // which = 0: nexttime = next multiple of every_time_dump
+       // which = 1: increment nexttime by every_time_dump
+       // which = 2: no change to previous nexttime (only timestep has changed)
+
+       if (which == 0)
+         nexttime = static_cast<bigint> (tcurrent/every_time_dump[idump]) *
+           every_time_dump[idump] + every_time_dump[idump];
+       else if (which == 1)
+         nexttime = next_time_dump[idump] + every_time_dump[idump];
+       else if (which == 2)
+         nexttime = next_time_dump[idump];
+
+       nextdump = ntimestep +
+         static_cast<bigint> ((nexttime - tcurrent - EPSDT*update->dt) /
+                              update->dt) + 1;
+
+       // if delta is too small to reach next timestep, use multiple of delta
+
+       if (nextdump == ntimestep) {
+         double tnext = update->atime +
+           (ntimestep+1 - update->atimestep) * update->dt;
+         int multiple = static_cast<int>
+           ((tnext - nexttime) / every_time_dump[idump]);
+         nexttime = nexttime + (multiple+1)*every_time_dump[idump];
+         nextdump = ntimestep +
+           static_cast<bigint> ((nexttime - tcurrent - EPSDT*update->dt) /
+                                update->dt) + 1;
+       }
+
+     } else {
+
+       // do not re-evaulate variable for which = 2, leave nexttime as-is
+       // unless next_time_dump < 0.0, which means variable never yet evaluated
+
+       if (which < 2 || next_time_dump[idump] < 0.0) {
+         nexttime = input->variable->compute_equal(ivar_dump[idump]);
+       } else
+         nexttime = next_time_dump[idump];
+
+       if (nexttime <= tcurrent)
+         error->all(FLERR,"Dump every/time variable returned a bad time");
+
+       nextdump = ntimestep +
+         static_cast<bigint> ((nexttime - tcurrent - EPSDT*update->dt) /
+                              update->dt) + 1;
+       if (nextdump <= ntimestep)
+         error->all(FLERR,"Dump every/time variable too small for next timestep");
+     }
+
+     next_time_dump[idump] = nexttime;
+     next_dump[idump] = nextdump;
+   }
+ }
+
+/* ---------------------------------------------------------------------- */
+
+int Output::check_time_dumps(bigint ntimestep)
+{
+  int nowflag = 0;
+  for (int i = 0; i < ndump; i++)
+    if (mode_dump[i] && next_dump[i] == ntimestep) nowflag = 1;
+
+  return nowflag;
 }
 
-/* ----------------------------------------------------------------------
-   perform all output for this timestep
-   only perform output if next matches current step and last output doesn't
-   do dump/restart before thermo so thermo CPU time will include them
-------------------------------------------------------------------------- */
+ /* ----------------------------------------------------------------------
+    force restart file(s) to be written
+    called from PRD and TAD
+ ------------------------------------------------------------------------- */
 
-void Output::write(bigint ntimestep)
-{
-  // next_dump does not force output on last step of run
-  // wrap dumps that invoke computes or eval of variable with clear/add
+ void Output::write_restart(bigint ntimestep)
+ {
+   if (restart_flag_single) {
+     std::string file = restart1;
+     std::size_t found = file.find('*');
+     if (found != std::string::npos)
+       file.replace(found,1,fmt::format("{}",update->ntimestep));
+     restart->write(file);
+   }
 
-  if (next_dump_any == ntimestep) {
-    for (int idump = 0; idump < ndump; idump++) {
-      if (next_dump[idump] == ntimestep) {
-        if (dump[idump]->clearstep || every_dump[idump] == 0)
-          modify->clearstep_compute();
-        if (last_dump[idump] != ntimestep) {
-          dump[idump]->write();
-          last_dump[idump] = ntimestep;
-        }
-        if (every_dump[idump]) next_dump[idump] += every_dump[idump];
-        else {
-          bigint nextdump = static_cast<bigint>
-            (input->variable->compute_equal(ivar_dump[idump]));
-          if (nextdump <= ntimestep)
-            error->all(FLERR,"Dump every variable returned a bad timestep");
-          next_dump[idump] = nextdump;
-        }
-        if (dump[idump]->clearstep || every_dump[idump] == 0)
-          modify->addstep_compute(next_dump[idump]);
-      }
-      if (idump) next_dump_any = MIN(next_dump_any,next_dump[idump]);
-      else next_dump_any = next_dump[0];
-    }
-  }
+   if (restart_flag_double) {
+     if (restart_toggle == 0) {
+       restart->write(restart2a);
+       restart_toggle = 1;
+     } else {
+       restart->write(restart2b);
+       restart_toggle = 0;
+     }
+   }
 
-  // next_restart does not force output on last step of run
-  // for toggle = 0, replace "*" with current timestep in restart filename
-  // eval of variable may invoke computes so wrap with clear/add
+   last_restart = ntimestep;
+ }
 
-  if (next_restart == ntimestep) {
-    if (next_restart_single == ntimestep) {
+ /* ----------------------------------------------------------------------
+    timestep is being changed, called by update->reset_timestep()
+    for dumps, require that no dump is "active"
+      meaning that a snapshot has already been output
+    reset next output values for restart and thermo
+    reset to smallest value >= new timestep
+    if next timestep set by variable evaluation,
+      eval for ntimestep-1, so current ntimestep can be returned if needed
+      no guarantee that variable can be evaluated for ntimestep-1
+      e.g. if it depends on computes, but live with that rare case for now
+ ------------------------------------------------------------------------- */
 
-      std::string file = restart1;
-      std::size_t found = file.find('*');
-      if (found != std::string::npos)
-        file.replace(found,1,fmt::format("{}",update->ntimestep));
+ void Output::reset_timestep(bigint ntimestep)
+ {
+   next_dump_any = MAXBIGINT;
+   for (int idump = 0; idump < ndump; idump++)
+     if (last_dump[idump] >= 0)
+       error->all(FLERR,
+                  "Cannot reset timestep with active dump - must undump first");
 
-      if (last_restart != ntimestep) restart->write(file);
+   if (restart_flag_single) {
+     if (restart_every_single) {
+       next_restart_single =
+         (ntimestep/restart_every_single)*restart_every_single;
+       if (next_restart_single < ntimestep)
+         next_restart_single += restart_every_single;
+     } else {
+       modify->clearstep_compute();
+       update->ntimestep--;
+       bigint nextrestart = static_cast<bigint>
+         (input->variable->compute_equal(ivar_restart_single));
+       if (nextrestart < ntimestep)
+         error->all(FLERR,"Restart variable returned a bad timestep");
+       update->ntimestep++;
+       next_restart_single = nextrestart;
+       modify->addstep_compute(next_restart_single);
+     }
+   } else next_restart_single = update->laststep + 1;
 
-      if (restart_every_single) next_restart_single += restart_every_single;
-      else {
-        modify->clearstep_compute();
-        bigint nextrestart = static_cast<bigint>
-          (input->variable->compute_equal(ivar_restart_single));
-        if (nextrestart <= ntimestep)
-          error->all(FLERR,"Restart variable returned a bad timestep");
-        next_restart_single = nextrestart;
-        modify->addstep_compute(next_restart_single);
-      }
-    }
-    if (next_restart_double == ntimestep) {
-      if (last_restart != ntimestep) {
-        if (restart_toggle == 0) {
-          restart->write(restart2a);
-          restart_toggle = 1;
-        } else {
-          restart->write(restart2b);
-          restart_toggle = 0;
-        }
-      }
-      if (restart_every_double) next_restart_double += restart_every_double;
-      else {
-        modify->clearstep_compute();
-        bigint nextrestart = static_cast<bigint>
-          (input->variable->compute_equal(ivar_restart_double));
-        if (nextrestart <= ntimestep)
-          error->all(FLERR,"Restart variable returned a bad timestep");
-        next_restart_double = nextrestart;
-        modify->addstep_compute(next_restart_double);
-      }
-    }
-    last_restart = ntimestep;
-    next_restart = MIN(next_restart_single,next_restart_double);
-  }
+   if (restart_flag_double) {
+     if (restart_every_double) {
+       next_restart_double =
+         (ntimestep/restart_every_double)*restart_every_double;
+       if (next_restart_double < ntimestep)
+         next_restart_double += restart_every_double;
+     } else {
+       modify->clearstep_compute();
+       update->ntimestep--;
+       bigint nextrestart = static_cast<bigint>
+         (input->variable->compute_equal(ivar_restart_double));
+       if (nextrestart < ntimestep)
+         error->all(FLERR,"Restart variable returned a bad timestep");
+       update->ntimestep++;
+       next_restart_double = nextrestart;
+       modify->addstep_compute(next_restart_double);
+     }
+   } else next_restart_double = update->laststep + 1;
 
-  // insure next_thermo forces output on last step of run
-  // thermo may invoke computes so wrap with clear/add
+   next_restart = MIN(next_restart_single,next_restart_double);
 
-  if (next_thermo == ntimestep) {
-    modify->clearstep_compute();
-    if (last_thermo != ntimestep) thermo->compute(1);
-    last_thermo = ntimestep;
-    if (var_thermo) {
-      next_thermo = static_cast<bigint>
-        (input->variable->compute_equal(ivar_thermo));
-      if (next_thermo <= ntimestep)
-        error->all(FLERR,"Thermo every variable returned a bad timestep");
-    } else if (thermo_every) next_thermo += thermo_every;
-    else next_thermo = update->laststep;
-    next_thermo = MIN(next_thermo,update->laststep);
-    modify->addstep_compute(next_thermo);
-  }
+   if (var_thermo) {
+     modify->clearstep_compute();
+     update->ntimestep--;
+     next_thermo = static_cast<bigint>
+       (input->variable->compute_equal(ivar_thermo));
+     if (next_thermo < ntimestep)
+       error->all(FLERR,"Thermo_modify every variable returned a bad timestep");
+     update->ntimestep++;
+     next_thermo = MIN(next_thermo,update->laststep);
+     modify->addstep_compute(next_thermo);
+   } else if (thermo_every) {
+     next_thermo = (ntimestep/thermo_every)*thermo_every;
+     if (next_thermo < ntimestep) next_thermo += thermo_every;
+     next_thermo = MIN(next_thermo,update->laststep);
+   } else next_thermo = update->laststep;
 
-  // next = next timestep any output will be done
-
-  next = MIN(next_dump_any,next_restart);
-  next = MIN(next,next_thermo);
-}
+   next = MIN(next_dump_any,next_restart);
+   next = MIN(next,next_thermo);
+ }
 
 /* ----------------------------------------------------------------------
-   force a snapshot to be written for all dumps
-   called from PRD and TAD
+   timestep size is being changed
+   reset next output values for dumps which have mode_dump=1
+   called by fix dt/reset (at end of step)
+   or called by timestep command via run every (also at end of step)
 ------------------------------------------------------------------------- */
 
-void Output::write_dump(bigint ntimestep)
+void Output::reset_dt()
 {
+  bigint ntimestep = update->ntimestep;
+
+  next_time_dump_any = MAXBIGINT;
+
   for (int idump = 0; idump < ndump; idump++) {
-    dump[idump]->write();
-    last_dump[idump] = ntimestep;
-  }
-}
+    if (mode_dump[idump] == 0) continue;
 
-/* ----------------------------------------------------------------------
-   force restart file(s) to be written
-   called from PRD and TAD
-------------------------------------------------------------------------- */
+    // reset next_dump but do not change next_time_dump, 2 arg for reset_dt()
+    // do not invoke for a dump already scheduled for this step
+    //   since timestep change affects next step
 
-void Output::write_restart(bigint ntimestep)
-{
-  if (restart_flag_single) {
-    std::string file = restart1;
-    std::size_t found = file.find('*');
-    if (found != std::string::npos)
-      file.replace(found,1,fmt::format("{}",update->ntimestep));
-    restart->write(file);
+    if (next_dump[idump] != ntimestep)
+      calculate_next_dump(2,idump,update->ntimestep);
+
+    if (dump[idump]->clearstep || var_dump[idump])
+      next_time_dump_any = MIN(next_time_dump_any,next_dump[idump]);
   }
 
-  if (restart_flag_double) {
-    if (restart_toggle == 0) {
-      restart->write(restart2a);
-      restart_toggle = 1;
-    } else {
-      restart->write(restart2b);
-      restart_toggle = 0;
-    }
-  }
-
-  last_restart = ntimestep;
-}
-
-/* ----------------------------------------------------------------------
-   timestep is being changed, called by update->reset_timestep()
-   reset next timestep values for dumps, restart, thermo output
-   reset to smallest value >= new timestep
-   if next timestep set by variable evaluation,
-     eval for ntimestep-1, so current ntimestep can be returned if needed
-     no guarantee that variable can be evaluated for ntimestep-1
-       if it depends on computes, but live with that rare case for now
-------------------------------------------------------------------------- */
-
-void Output::reset_timestep(bigint ntimestep)
-{
-  next_dump_any = MAXBIGINT;
-  for (int idump = 0; idump < ndump; idump++) {
-    if (every_dump[idump]) {
-      next_dump[idump] = (ntimestep/every_dump[idump])*every_dump[idump];
-      if (next_dump[idump] < ntimestep) next_dump[idump] += every_dump[idump];
-    } else {
-      // ivar_dump may not be initialized
-      if (ivar_dump[idump] < 0) {
-        ivar_dump[idump] = input->variable->find(var_dump[idump]);
-        if (ivar_dump[idump] < 0)
-          error->all(FLERR,"Variable name for dump every does not exist");
-        if (!input->variable->equalstyle(ivar_dump[idump]))
-          error->all(FLERR,"Variable for dump every is invalid style");
-      }
-      modify->clearstep_compute();
-      update->ntimestep--;
-      bigint nextdump = static_cast<bigint>
-        (input->variable->compute_equal(ivar_dump[idump]));
-      if (nextdump < ntimestep)
-        error->all(FLERR,"Dump every variable returned a bad timestep");
-      update->ntimestep++;
-      next_dump[idump] = nextdump;
-      modify->addstep_compute(next_dump[idump]);
-    }
-    next_dump_any = MIN(next_dump_any,next_dump[idump]);
-  }
-
-  if (restart_flag_single) {
-    if (restart_every_single) {
-      next_restart_single =
-        (ntimestep/restart_every_single)*restart_every_single;
-      if (next_restart_single < ntimestep)
-        next_restart_single += restart_every_single;
-    } else {
-      modify->clearstep_compute();
-      update->ntimestep--;
-      bigint nextrestart = static_cast<bigint>
-        (input->variable->compute_equal(ivar_restart_single));
-      if (nextrestart < ntimestep)
-        error->all(FLERR,"Restart variable returned a bad timestep");
-      update->ntimestep++;
-      next_restart_single = nextrestart;
-      modify->addstep_compute(next_restart_single);
-    }
-  } else next_restart_single = update->laststep + 1;
-
-  if (restart_flag_double) {
-    if (restart_every_double) {
-      next_restart_double =
-        (ntimestep/restart_every_double)*restart_every_double;
-      if (next_restart_double < ntimestep)
-        next_restart_double += restart_every_double;
-    } else {
-      modify->clearstep_compute();
-      update->ntimestep--;
-      bigint nextrestart = static_cast<bigint>
-        (input->variable->compute_equal(ivar_restart_double));
-      if (nextrestart < ntimestep)
-        error->all(FLERR,"Restart variable returned a bad timestep");
-      update->ntimestep++;
-      next_restart_double = nextrestart;
-      modify->addstep_compute(next_restart_double);
-    }
-  } else next_restart_double = update->laststep + 1;
-
-  next_restart = MIN(next_restart_single,next_restart_double);
-
-  if (var_thermo) {
-    modify->clearstep_compute();
-    update->ntimestep--;
-    next_thermo = static_cast<bigint>
-      (input->variable->compute_equal(ivar_thermo));
-    if (next_thermo < ntimestep)
-      error->all(FLERR,"Thermo_modify every variable returned a bad timestep");
-    update->ntimestep++;
-    next_thermo = MIN(next_thermo,update->laststep);
-    modify->addstep_compute(next_thermo);
-  } else if (thermo_every) {
-    next_thermo = (ntimestep/thermo_every)*thermo_every;
-    if (next_thermo < ntimestep) next_thermo += thermo_every;
-    next_thermo = MIN(next_thermo,update->laststep);
-  } else next_thermo = update->laststep;
-
+  next_dump_any = MIN(next_dump_any,next_time_dump_any);
   next = MIN(next_dump_any,next_restart);
   next = MIN(next,next_thermo);
 }
 
+
 /* ----------------------------------------------------------------------
    add a Dump to list of Dumps
 ------------------------------------------------------------------------- */
@@ -547,21 +740,17 @@ void Output::add_dump(int narg, char **arg)
     max_dump += DELTA;
     dump = (Dump **)
       memory->srealloc(dump,max_dump*sizeof(Dump *),"output:dump");
+    memory->grow(mode_dump,max_dump,"output:mode_dump");
     memory->grow(every_dump,max_dump,"output:every_dump");
+    memory->grow(every_time_dump,max_dump,"output:every_time_dump");
     memory->grow(next_dump,max_dump,"output:next_dump");
+    memory->grow(next_time_dump,max_dump,"output:next_time_dump");
     memory->grow(last_dump,max_dump,"output:last_dump");
     var_dump = (char **)
       memory->srealloc(var_dump,max_dump*sizeof(char *),"output:var_dump");
     memory->grow(ivar_dump,max_dump,"output:ivar_dump");
   }
 
-  // initialize per-dump data to suitable default values
-
-  every_dump[ndump] = 0;
-  last_dump[ndump] = -1;
-  var_dump[ndump] = nullptr;
-  ivar_dump[ndump] = -1;
-
   // create the Dump
 
   if (dump_map->find(arg[2]) != dump_map->end()) {
@@ -569,10 +758,17 @@ void Output::add_dump(int narg, char **arg)
     dump[ndump] = dump_creator(lmp, narg, arg);
   } else error->all(FLERR,utils::check_packages_for_style("dump",arg[2],lmp));
 
+  // initialize per-dump data to suitable default values
+
+  mode_dump[ndump] = 0;
   every_dump[ndump] = utils::inumeric(FLERR,arg[3],false,lmp);
   if (every_dump[ndump] <= 0) error->all(FLERR,"Illegal dump command");
+  every_time_dump[ndump] = 0.0;
+  next_time_dump[ndump] = -1.0;
   last_dump[ndump] = -1;
   var_dump[ndump] = nullptr;
+  ivar_dump[ndump] = -1;
+
   ndump++;
 }
 
@@ -624,8 +820,11 @@ void Output::delete_dump(char *id)
 
   for (int i = idump+1; i < ndump; i++) {
     dump[i-1] = dump[i];
+    mode_dump[i-1] = mode_dump[i];
     every_dump[i-1] = every_dump[i];
+    every_time_dump[i-1] = every_time_dump[i];
     next_dump[i-1] = next_dump[i];
+    next_time_dump[i-1] = next_time_dump[i];
     last_dump[i-1] = last_dump[i];
     var_dump[i-1] = var_dump[i];
     ivar_dump[i-1] = ivar_dump[i];
@@ -782,12 +981,12 @@ void Output::create_restart(int narg, char **arg)
   }
 
   int mpiioflag;
-  if (strstr(arg[1],".mpi")) mpiioflag = 1;
+  if (utils::strmatch(arg[1],"\\.mpiio$")) mpiioflag = 1;
   else mpiioflag = 0;
   if (nfile == 2) {
-    if (mpiioflag && !strstr(arg[2],".mpi"))
+    if (mpiioflag && !utils::strmatch(arg[2],"\\.mpiio$"))
       error->all(FLERR,"Both restart files must use MPI-IO or neither");
-    if (!mpiioflag && strstr(arg[2],".mpi"))
+    if (!mpiioflag && utils::strmatch(arg[2],"\\.mpiio$"))
       error->all(FLERR,"Both restart files must use MPI-IO or neither");
   }
 
diff --git a/src/output.h b/src/output.h
index 9ae8b7fc3d..3f557bdef5 100644
--- a/src/output.h
+++ b/src/output.h
@@ -35,12 +35,17 @@ class Output : protected Pointers {
 
   int ndump;               // # of Dumps defined
   int max_dump;            // max size of Dump list
-  bigint next_dump_any;    // next timestep for any Dump
-  int *every_dump;         // write freq for each Dump, 0 if var
-  bigint *next_dump;       // next timestep to do each Dump
+  bigint next_dump_any;    // next timestep for any dump
+  bigint next_time_dump_any; // next timestep for any time dump with computes
+  int any_time_dumps;      // 1 if any time dump defined
+  int *mode_dump;          // 0/1 if write every N timesteps or Delta in sim time
+  int *every_dump;         // dump every N timesteps, 0 if variable
+  double *every_time_dump; // dump every Delta of sim time, 0.0 if variable
+  bigint *next_dump;       // next timestep to perform dump
+  double *next_time_dump;  // next simulation time to perform dump (mode = 1)
   bigint *last_dump;       // last timestep each snapshot was output
-  char **var_dump;         // variable name for dump frequency
-  int *ivar_dump;          // variable index for dump frequency
+  char **var_dump;         // variable name for next dump (steps or sim time)
+  int *ivar_dump;          // variable index of var_dump name
   Dump **dump;             // list of defined Dumps
 
   int restart_flag;               // 1 if any restart files are written
@@ -72,12 +77,14 @@ class Output : protected Pointers {
   void write(bigint);             // output for current timestep
   void write_dump(bigint);        // force output of dump snapshots
   void write_restart(bigint);     // force output of a restart file
-  void reset_timestep(bigint);    // reset next timestep for all output
+  void reset_timestep(bigint);    // reset output which depends on timestep
+  void reset_dt();                // reset output which depends on timestep size
 
   void add_dump(int, char **);       // add a Dump to Dump list
   void modify_dump(int, char **);    // modify a Dump
   void delete_dump(char *);          // delete a Dump from Dump list
   int find_dump(const char *);       // find a Dump ID
+  int check_time_dumps(bigint);      // check if any time dump is output now
 
   void set_thermo(int, char **);        // set thermo output freqquency
   void create_thermo(int, char **);     // create a thermo style
@@ -87,6 +94,7 @@ class Output : protected Pointers {
 
  private:
   template <typename T> static Dump *dump_creator(LAMMPS *, int, char **);
+  void calculate_next_dump(int, int, bigint);
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/pair.cpp b/src/pair.cpp
index 1039875718..f88c4e0972 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -41,6 +41,7 @@ using namespace LAMMPS_NS;
 using namespace MathConst;
 
 enum{NONE,RLINEAR,RSQ,BMP};
+static const std::string mixing_rule_names[Pair::SIXTHPOWER+1] = {"geometric", "arithmetic", "sixthpower" };
 
 // allocate space for static class instance variable and initialize it
 
@@ -66,6 +67,7 @@ Pair::Pair(LAMMPS *lmp) : Pointers(lmp)
   finitecutflag = 0;
   ghostneigh = 0;
   unit_convert_flag = utils::NOCONVERT;
+  did_mix = false;
 
   nextra = 0;
   pvector = nullptr;
@@ -217,11 +219,9 @@ void Pair::init()
   if (tail_flag && domain->nonperiodic && comm->me == 0)
     error->warning(FLERR,"Using pair tail corrections with non-periodic system");
   if (!compute_flag && tail_flag && comm->me == 0)
-    error->warning(FLERR,"Using pair tail corrections with "
-                   "pair_modify compute no");
+    error->warning(FLERR,"Using pair tail corrections with pair_modify compute no");
   if (!compute_flag && offset_flag && comm->me == 0)
-    error->warning(FLERR,"Using pair potential shift with "
-                   "pair_modify compute no");
+    error->warning(FLERR,"Using pair potential shift with pair_modify compute no");
 
   // for manybody potentials
   // check if bonded exclusions could invalidate the neighbor list
@@ -259,13 +259,18 @@ void Pair::init()
   etail = ptail = 0.0;
   mixed_flag = 1;
   double cut;
+  int mixed_count = 0;
 
   for (i = 1; i <= atom->ntypes; i++)
     for (j = i; j <= atom->ntypes; j++) {
-      if ((i != j) && setflag[i][j]) mixed_flag = 0;
+      did_mix = false;
       cut = init_one(i,j);
       cutsq[i][j] = cutsq[j][i] = cut*cut;
       cutforce = MAX(cutforce,cut);
+      if (i != j) {
+        if (setflag[i][j]) mixed_flag = 0;
+        if (did_mix) ++mixed_count;
+      }
       if (tail_flag) {
         etail += etail_ij;
         ptail += ptail_ij;
@@ -275,6 +280,12 @@ void Pair::init()
         }
       }
     }
+
+  if (!manybody_flag && (comm->me == 0)) {
+    const int num_mixed_pairs = atom->ntypes * (atom->ntypes - 1) / 2;
+    utils::logmesg(lmp,"  generated {} of {} mixed pair_coeff terms from {} mixing rule\n",
+                   mixed_count, num_mixed_pairs, mixing_rule_names[mix_flag]);
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -681,6 +692,7 @@ void Pair::free_disp_tables()
 
 double Pair::mix_energy(double eps1, double eps2, double sig1, double sig2)
 {
+  did_mix = true;
   if (mix_flag == GEOMETRIC)
     return sqrt(eps1*eps2);
   else if (mix_flag == ARITHMETIC)
@@ -688,7 +700,8 @@ double Pair::mix_energy(double eps1, double eps2, double sig1, double sig2)
   else if (mix_flag == SIXTHPOWER)
     return (2.0 * sqrt(eps1*eps2) *
       pow(sig1,3.0) * pow(sig2,3.0) / (pow(sig1,6.0) + pow(sig2,6.0)));
-  else return 0.0;
+  else did_mix = false;
+  return 0.0;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/pair.h b/src/pair.h
index f37c0732ed..00e6734773 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -110,6 +110,7 @@ class Pair : protected Pointers {
                        //       public so external driver can check
   int compute_flag;    // 0 if skip compute()
   int mixed_flag;      // 1 if all itype != jtype coeffs are from mixing
+  bool did_mix;        // set to true by mix_energy() to indicate that mixing was performed
 
   enum { GEOMETRIC, ARITHMETIC, SIXTHPOWER };    // mixing options
 
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index b5daa111da..e962e02c9e 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -706,11 +706,10 @@ double PairHybrid::init_one(int i, int j)
   for (int k = 0; k < nmap[i][j]; k++) {
     map[j][i][k] = map[i][j][k];
     double cut = styles[map[i][j][k]]->init_one(i,j);
-    styles[map[i][j][k]]->cutsq[i][j] =
-      styles[map[i][j][k]]->cutsq[j][i] = cut*cut;
+    if (styles[map[i][j][k]]->did_mix) did_mix = true;
+    styles[map[i][j][k]]->cutsq[i][j] = styles[map[i][j][k]]->cutsq[j][i] = cut*cut;
     if (styles[map[i][j][k]]->ghostneigh)
-      cutghost[i][j] = cutghost[j][i] =
-        MAX(cutghost[i][j],styles[map[i][j][k]]->cutghost[i][j]);
+      cutghost[i][j] = cutghost[j][i] = MAX(cutghost[i][j],styles[map[i][j][k]]->cutghost[i][j]);
     if (tail_flag) {
       etail_ij += styles[map[i][j][k]]->etail_ij;
       ptail_ij += styles[map[i][j][k]]->ptail_ij;
diff --git a/src/platform.cpp b/src/platform.cpp
index c701c37a80..708a42be8a 100644
--- a/src/platform.cpp
+++ b/src/platform.cpp
@@ -446,11 +446,11 @@ int platform::putenv(const std::string &vardef)
 
   auto found = vardef.find_first_of('=');
 #ifdef _WIN32
-  // must assign a value to variable with _putenv()
+  // must assign a value to variable with _putenv_s()
   if (found == std::string::npos)
-    return _putenv(utils::strdup(vardef + "=1"));
+    return _putenv_s(vardef.c_str(), "1");
   else
-    return _putenv(utils::strdup(vardef));
+    return _putenv_s(vardef.substr(0, found).c_str(), vardef.substr(found+1).c_str());
 #else
   if (found == std::string::npos)
     return setenv(vardef.c_str(), "", 1);
@@ -460,6 +460,24 @@ int platform::putenv(const std::string &vardef)
   return -1;
 }
 
+/* ----------------------------------------------------------------------
+   unset environment variable
+------------------------------------------------------------------------- */
+
+int platform::unsetenv(const std::string &variable)
+{
+  if (variable.size() == 0) return -1;
+#ifdef _WIN32
+  // emulate POSIX semantics by returning -1 on trying to unset non-existing variable
+  const char *ptr = getenv(variable.c_str());
+  if (!ptr) return -1;
+  // empty _putenv_s() definition deletes variable
+  return _putenv_s(variable.c_str(),"");
+#else
+  return ::unsetenv(variable.c_str());
+#endif
+}
+
 /* ----------------------------------------------------------------------
    split a "path" environment variable into a list
 ------------------------------------------------------------------------- */
diff --git a/src/platform.h b/src/platform.h
index 95a0c3cc35..c079fd2cd1 100644
--- a/src/platform.h
+++ b/src/platform.h
@@ -125,6 +125,13 @@ namespace platform {
 
   int putenv(const std::string &vardef);
 
+  /*! Delete variable from the environment
+   *
+   * \param  variable  variable name
+   * \return -1 if failure otherwise 0 */
+
+  int unsetenv(const std::string &variable);
+
   /*! Get list of entries in a path environment variable
    *
    * This provides a list of strings of the entries in an environment
diff --git a/src/read_data.cpp b/src/read_data.cpp
index 7e5d49e65c..89b3ecadec 100644
--- a/src/read_data.cpp
+++ b/src/read_data.cpp
@@ -35,6 +35,7 @@
 #include "molecule.h"
 #include "pair.h"
 #include "special.h"
+#include "tokenizer.h"
 #include "update.h"
 
 #include <cctype>
@@ -1088,37 +1089,32 @@ void ReadData::header(int firstpass)
     } else if (utils::strmatch(line,"^\\s*\\d+\\s+atom\\s+types\\s")) {
       rv = sscanf(line,"%d",&ntypes);
       if (rv != 1)
-        error->all(FLERR,"Could not parse 'atom types' line "
-                   "in data file header");
+        error->all(FLERR,"Could not parse 'atom types' line in data file header");
       if (addflag == NONE) atom->ntypes = ntypes + extra_atom_types;
 
     } else if (utils::strmatch(line,"\\s*\\d+\\s+bond\\s+types\\s")) {
       rv = sscanf(line,"%d",&nbondtypes);
       if (rv != 1)
-        error->all(FLERR,"Could not parse 'bond types' line "
-                   "in data file header");
+        error->all(FLERR,"Could not parse 'bond types' line in data file header");
       if (addflag == NONE) atom->nbondtypes = nbondtypes + extra_bond_types;
 
     } else if (utils::strmatch(line,"^\\s*\\d+\\s+angle\\s+types\\s")) {
       rv = sscanf(line,"%d",&nangletypes);
       if (rv != 1)
-        error->all(FLERR,"Could not parse 'angle types' line "
-                   "in data file header");
+        error->all(FLERR,"Could not parse 'angle types' line in data file header");
       if (addflag == NONE) atom->nangletypes = nangletypes + extra_angle_types;
 
     } else if (utils::strmatch(line,"^\\s*\\d+\\s+dihedral\\s+types\\s")) {
       rv = sscanf(line,"%d",&ndihedraltypes);
       if (rv != 1)
-        error->all(FLERR,"Could not parse 'dihedral types' line "
-                   "in data file header");
+        error->all(FLERR,"Could not parse 'dihedral types' line in data file header");
       if (addflag == NONE)
         atom->ndihedraltypes = ndihedraltypes + extra_dihedral_types;
 
     } else if (utils::strmatch(line,"^\\s*\\d+\\s+improper\\s+types\\s")) {
       rv = sscanf(line,"%d",&nimpropertypes);
       if (rv != 1)
-        error->all(FLERR,"Could not parse 'improper types' line "
-                   "in data file header");
+        error->all(FLERR,"Could not parse 'improper types' line in data file header");
       if (addflag == NONE)
         atom->nimpropertypes = nimpropertypes + extra_improper_types;
 
@@ -1658,12 +1654,12 @@ void ReadData::bonus(bigint nbonus, AtomVec *ptr, const char *type)
    read all body data
    variable amount of info per body, described by ninteger and ndouble
    to find atoms, must build atom map if not a molecular system
-   if not firstpass, just read past data, but no processing of data
+   if not firstpass, just read past body data and only process body header
 ------------------------------------------------------------------------- */
 
 void ReadData::bodies(int firstpass, AtomVec *ptr)
 {
-  int m,nchunk,nline,nmax,ninteger,ndouble,nword,ncount,onebody,tmp,rv;
+  int m,nchunk,nline,nmax,ninteger,ndouble,nword,ncount,onebody;
   char *eof;
 
   int mapflag = 0;
@@ -1677,11 +1673,11 @@ void ReadData::bodies(int firstpass, AtomVec *ptr)
   // nchunk = actual # read
 
   bigint nread = 0;
-  bigint natoms = nbodies;
+  bigint nblocks = nbodies;
 
-  while (nread < natoms) {
-    if (natoms-nread > CHUNK) nmax = CHUNK;
-    else nmax = natoms-nread;
+  while (nread < nblocks) {
+    if (nblocks-nread > CHUNK) nmax = CHUNK;
+    else nmax = nblocks-nread;
 
     if (me == 0) {
       nchunk = 0;
@@ -1690,11 +1686,25 @@ void ReadData::bodies(int firstpass, AtomVec *ptr)
 
       while (nchunk < nmax && nline <= CHUNK-MAXBODY) {
         eof = utils::fgets_trunc(&buffer[m],MAXLINE,fp);
+        const char *buf = &buffer[m];
         if (eof == nullptr) error->one(FLERR,"Unexpected end of data file");
-        rv = sscanf(&buffer[m],"%d %d %d",&tmp,&ninteger,&ndouble);
-        if (rv != 3)
-          error->one(FLERR,"Incorrect format in Bodies section of data file");
-        m += strlen(&buffer[m]);
+        try {
+          auto values = ValueTokenizer(utils::trim_comment(buf));
+          tagint tagdata = values.next_tagint() + id_offset;
+          ninteger = values.next_int();
+          ndouble = values.next_double();
+          if (tagdata <= 0 || tagdata > atom->map_tag_max)
+            throw TokenizerException("Invalid atom ID in body header", utils::trim(buf));
+          if (ninteger < 0)
+            throw TokenizerException("Invalid number of integers", utils::trim(buf));
+          if (ndouble < 0)
+            throw TokenizerException("Invalid number of doubles", utils::trim(buf));
+          if (values.has_next())
+            throw TokenizerException("Too many tokens in body header", utils::trim(buf));
+        } catch (TokenizerException &e) {
+          error->one(FLERR,std::string(e.what()) + " while reading Bodies section of data file");
+        }
+        m += strlen(buf);
 
         // read lines one at a time into buffer and count words
         // count to ninteger and ndouble until have enough lines
@@ -1754,7 +1764,7 @@ void ReadData::bodies(int firstpass, AtomVec *ptr)
   }
 
   if (me == 0 && firstpass)
-    utils::logmesg(lmp,"  {} bodies\n",natoms);
+    utils::logmesg(lmp,"  {} bodies\n",nblocks);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/reader.cpp b/src/reader.cpp
index f2652d868d..eb8cd9ffb6 100644
--- a/src/reader.cpp
+++ b/src/reader.cpp
@@ -25,11 +25,13 @@ using namespace LAMMPS_NS;
 Reader::Reader(LAMMPS *lmp) : Pointers(lmp)
 {
   fp = nullptr;
+  binary = false;
+  compressed = false;
 }
 
 /* ----------------------------------------------------------------------
    try to open given file
-   generic version for ASCII files that may be compressed
+   generic version for ASCII files with optional compression or for native binary dumps
 ------------------------------------------------------------------------- */
 
 void Reader::open_file(const std::string &file)
@@ -37,12 +39,18 @@ void Reader::open_file(const std::string &file)
   if (fp != nullptr) close_file();
 
   if (platform::has_compress_extension(file)) {
-    compressed = 1;
+    compressed = true;
     fp = platform::compressed_read(file);
     if (!fp) error->one(FLERR,"Cannot open compressed file for reading");
   } else {
-    compressed = 0;
-    fp = fopen(file.c_str(),"r");
+    compressed = false;
+    if (utils::strmatch(file, "\\.bin$")) {
+      binary = true;
+      fp = fopen(file.c_str(),"rb");
+    } else {
+      fp = fopen(file.c_str(),"r");
+      binary = false;
+    }
   }
 
   if (!fp) error->one(FLERR,"Cannot open file {}: {}", file, utils::getsyserror());
diff --git a/src/reader.h b/src/reader.h
index 18977790cd..097fa2526b 100644
--- a/src/reader.h
+++ b/src/reader.h
@@ -38,7 +38,8 @@ class Reader : protected Pointers {
 
  protected:
   FILE *fp;          // pointer to opened file or pipe
-  int compressed;    // flag for dump file compression
+  bool compressed;   // flag for dump file compression
+  bool binary;       // flag for (native) binary files
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/reader_native.cpp b/src/reader_native.cpp
index b490410d02..ba7a576a50 100644
--- a/src/reader_native.cpp
+++ b/src/reader_native.cpp
@@ -36,14 +36,17 @@ ReaderNative::ReaderNative(LAMMPS *lmp) : Reader(lmp)
 {
   line = new char[MAXLINE];
   fieldindex = nullptr;
+  maxbuf = 0;
+  databuf = nullptr;
 }
 
 /* ---------------------------------------------------------------------- */
 
 ReaderNative::~ReaderNative()
 {
-  delete [] line;
+  delete[] line;
   memory->destroy(fieldindex);
+  memory->destroy(databuf);
 }
 
 /* ----------------------------------------------------------------------
@@ -54,25 +57,54 @@ ReaderNative::~ReaderNative()
 
 int ReaderNative::read_time(bigint &ntimestep)
 {
-  char *eof = fgets(line,MAXLINE,fp);
-  if (eof == nullptr) return 1;
+  if (binary) {
+    int endian = 0x0001;
+    revision = 0x0001;
+    magic_string = "";
+    unit_style = "";
 
-  // skip over unit and time information, if present.
+    auto ret = fread(&ntimestep, sizeof(bigint), 1, fp);
 
-  if (utils::strmatch(line,"^\\s*ITEM: UNITS\\s*$"))
-    read_lines(2);
+    // detect end-of-file
+    if (ret != 1 || feof(fp)) return 1;
 
-  if (utils::strmatch(line,"^\\s*ITEM: TIME\\s*$"))
-    read_lines(2);
+    // detect newer format
+    if (ntimestep < 0) {
+      // first bigint encodes negative format name length
+      bigint magic_string_len = -ntimestep;
 
-  if (!utils::strmatch(line,"^\\s*ITEM: TIMESTEP\\s*$"))
-    error->one(FLERR,"Dump file is incorrectly formatted");
+      magic_string = read_binary_str(magic_string_len);
 
-  read_lines(1);
-  int rv = sscanf(line,BIGINT_FORMAT,&ntimestep);
-  if (rv != 1)
-    error->one(FLERR,"Dump file is incorrectly formatted");
+      // read endian flag
+      read_buf(&endian, sizeof(int), 1);
 
+      // read revision number
+      read_buf(&revision, sizeof(int), 1);
+
+      // read the real ntimestep
+      read_buf(&ntimestep, sizeof(bigint), 1);
+    }
+
+  } else {
+    char *eof = fgets(line,MAXLINE,fp);
+    if (eof == nullptr) return 1;
+
+    // skip over unit and time information, if present.
+
+    if (utils::strmatch(line,"^\\s*ITEM: UNITS\\s*$"))
+      read_lines(2);
+
+    if (utils::strmatch(line,"^\\s*ITEM: TIME\\s*$"))
+      read_lines(2);
+
+    if (!utils::strmatch(line,"^\\s*ITEM: TIMESTEP\\s*$"))
+      error->one(FLERR,"Dump file is incorrectly formatted");
+
+    read_lines(1);
+    int rv = sscanf(line,BIGINT_FORMAT,&ntimestep);
+    if (rv != 1)
+      error->one(FLERR,"Dump file is incorrectly formatted");
+  }
   return 0;
 }
 
@@ -83,22 +115,69 @@ int ReaderNative::read_time(bigint &ntimestep)
 
 void ReaderNative::skip()
 {
-  read_lines(2);
-  bigint natoms;
-  int rv = sscanf(line,BIGINT_FORMAT,&natoms);
-  if (rv != 1)
-    error->one(FLERR,"Dump file is incorrectly formatted");
+  if (binary) {
+    int triclinic;
+    skip_buf(sizeof(bigint));
+    read_buf(&triclinic, sizeof(int), 1);
+    skip_buf((sizeof(int)+sizeof(double))*6);
+    if (triclinic) {
+      skip_buf(sizeof(double)*3);
+    }
+    skip_buf(sizeof(int));
 
-  read_lines(5);
+    skip_reading_magic_str();
 
-  // invoke read_lines() in chunks no larger than MAXSMALLINT
+    // read chunk and skip them
 
-  int nchunk;
-  bigint nremain = natoms;
-  while (nremain) {
-    nchunk = MIN(nremain,MAXSMALLINT);
-    read_lines(nchunk);
-    nremain -= nchunk;
+    read_buf(&nchunk, sizeof(int), 1);
+
+    int n;
+    for (int i = 0; i < nchunk; i++) {
+      read_buf(&n, sizeof(int), 1);
+      skip_buf(n*sizeof(double));
+    }
+
+  } else {
+    read_lines(2);
+    bigint natoms;
+    int rv = sscanf(line,BIGINT_FORMAT,&natoms);
+    if (rv != 1)
+      error->one(FLERR,"Dump file is incorrectly formatted");
+
+    read_lines(5);
+
+    // invoke read_lines() in chunks no larger than MAXSMALLINT
+
+    int nchunk;
+    bigint nremain = natoms;
+    while (nremain) {
+      nchunk = MIN(nremain,MAXSMALLINT);
+      read_lines(nchunk);
+      nremain -= nchunk;
+    }
+  }
+}
+
+void ReaderNative::skip_reading_magic_str()
+{
+  if (is_known_magic_str() && revision > 0x0001) {
+    int len;
+    read_buf(&len, sizeof(int), 1);
+
+    if (len > 0) {
+      // has units
+      skip_buf(sizeof(char)*len);
+    }
+
+    char flag = 0;
+    read_buf(&flag, sizeof(char), 1);
+
+    if (flag) {
+      skip_buf(sizeof(double));
+    }
+
+    read_buf(&len, sizeof(int), 1);
+    skip_buf(sizeof(char)*len);
   }
 }
 
@@ -123,53 +202,114 @@ bigint ReaderNative::read_header(double box[3][3], int &boxinfo, int &triclinic,
                                  int scaleflag, int wrapflag, int &fieldflag,
                                  int &xflag, int &yflag, int &zflag)
 {
-  bigint natoms;
-  int rv;
+  bigint natoms = 0;
+  int len = 0;
+  std::string labelline;
 
-  read_lines(2);
-  rv = sscanf(line,BIGINT_FORMAT,&natoms);
-  if (rv != 1)
-    error->one(FLERR,"Dump file is incorrectly formatted");
+  if (binary) {
+    read_buf(&natoms, sizeof(bigint), 1);
 
-  boxinfo = 1;
-  triclinic = 0;
-  box[0][2] = box[1][2] = box[2][2] = 0.0;
-  read_lines(1);
-  if (line[strlen("ITEM: BOX BOUNDS ")] == 'x') triclinic = 1;
+    boxinfo = 1;
+    triclinic = 0;
+    box[0][2] = box[1][2] = box[2][2] = 0.0;
 
-  read_lines(1);
-  if (!triclinic) rv = 2 - sscanf(line,"%lg %lg",&box[0][0],&box[0][1]);
-  else rv = 3 - sscanf(line,"%lg %lg %lg",&box[0][0],&box[0][1],&box[0][2]);
-  if (rv != 0) error->one(FLERR,"Dump file is incorrectly formatted");
+    int boundary[3][2];
+    read_buf(&triclinic, sizeof(int), 1);
+    read_buf(&boundary[0][0], sizeof(int), 6);
+    read_buf(box[0], sizeof(double), 2);
+    read_buf(box[1], sizeof(double), 2);
+    read_buf(box[2], sizeof(double), 2);
+    if (triclinic) {
+      read_buf(&box[0][2], sizeof(double), 1);
+      read_buf(&box[1][2], sizeof(double), 1);
+      read_buf(&box[2][2], sizeof(double), 1);
+    }
 
-  read_lines(1);
-  if (!triclinic) rv = 2 - sscanf(line,"%lg %lg",&box[1][0],&box[1][1]);
-  else rv = 3 - sscanf(line,"%lg %lg %lg",&box[1][0],&box[1][1],&box[1][2]);
-  if (rv != 0) error->one(FLERR,"Dump file is incorrectly formatted");
+    // extract column labels and match to requested fields
+    read_buf(&size_one, sizeof(int), 1);
 
-  read_lines(1);
-  if (!triclinic) rv = 2 - sscanf(line,"%lg %lg",&box[2][0],&box[2][1]);
-  else rv = 3 - sscanf(line,"%lg %lg %lg",&box[2][0],&box[2][1],&box[2][2]);
-  if (rv != 0) error->one(FLERR,"Dump file is incorrectly formatted");
+    if (!fieldinfo) {
+      skip_reading_magic_str();
+      return natoms;
+    }
 
-  read_lines(1);
 
-  // if no field info requested, just return
+    if (is_known_magic_str() && revision > 0x0001) {
+      // newer format includes units string, columns string
+      // and time
+      read_buf(&len, sizeof(int), 1);
 
-  if (!fieldinfo) return natoms;
+      if (len > 0) {
+        // has units
+        unit_style = read_binary_str(len);
+      }
 
-  // exatract column labels and match to requested fields
+      char flag = 0;
+      read_buf(&flag, sizeof(char), 1);
 
-  char *labelline = &line[strlen("ITEM: ATOMS ")];
+      if (flag) {
+        double time;
+        read_buf(&time, sizeof(double), 1);
+      }
+
+      read_buf(&len, sizeof(int), 1);
+      labelline = read_binary_str(len);
+    } else {
+      error->one(FLERR, "Unsupported old binary dump format");
+    }
+
+    read_buf(&nchunk, sizeof(int), 1);
+    ichunk = 0;
+    iatom_chunk = 0;
+  } else {
+    int rv;
+
+    read_lines(2);
+    rv = sscanf(line,BIGINT_FORMAT,&natoms);
+    if (rv != 1)
+      error->one(FLERR,"Dump file is incorrectly formatted");
+
+    boxinfo = 1;
+    triclinic = 0;
+    box[0][2] = box[1][2] = box[2][2] = 0.0;
+    read_lines(1);
+    if (line[strlen("ITEM: BOX BOUNDS ")] == 'x') triclinic = 1;
+
+    read_lines(1);
+    if (!triclinic) rv = 2 - sscanf(line,"%lg %lg",&box[0][0],&box[0][1]);
+    else rv = 3 - sscanf(line,"%lg %lg %lg",&box[0][0],&box[0][1],&box[0][2]);
+    if (rv != 0) error->one(FLERR,"Dump file is incorrectly formatted");
+
+    read_lines(1);
+    if (!triclinic) rv = 2 - sscanf(line,"%lg %lg",&box[1][0],&box[1][1]);
+    else rv = 3 - sscanf(line,"%lg %lg %lg",&box[1][0],&box[1][1],&box[1][2]);
+    if (rv != 0) error->one(FLERR,"Dump file is incorrectly formatted");
+
+    read_lines(1);
+    if (!triclinic) rv = 2 - sscanf(line,"%lg %lg",&box[2][0],&box[2][1]);
+    else rv = 3 - sscanf(line,"%lg %lg %lg",&box[2][0],&box[2][1],&box[2][2]);
+    if (rv != 0) error->one(FLERR,"Dump file is incorrectly formatted");
+
+    read_lines(1);
+
+    // if no field info requested, just return
+
+    if (!fieldinfo) return natoms;
+
+    // extract column labels and match to requested fields
+
+    labelline = &line[strlen("ITEM: ATOMS ")];
+  }
 
-  std::map<std::string, int> labels;
   Tokenizer tokens(labelline);
+  std::map<std::string, int> labels;
   nwords = 0;
 
   while (tokens.has_next()) {
     labels[tokens.next()] = nwords++;
   }
 
+
   if (nwords == 0) {
     return 1;
   }
@@ -311,22 +451,53 @@ bigint ReaderNative::read_header(double box[3][3], int &boxinfo, int &triclinic,
 
 void ReaderNative::read_atoms(int n, int nfield, double **fields)
 {
-  int i,m;
-  char *eof;
+  if (binary) {
+    if (feof(fp)) {
+      error->one(FLERR,"Unexpected end of dump file");
+    }
 
-  for (i = 0; i < n; i++) {
-    eof = fgets(line,MAXLINE,fp);
-    if (eof == nullptr) error->one(FLERR,"Unexpected end of dump file");
+    // read chunks until n atoms have been read
+    int m = size_one*iatom_chunk;
 
-    // tokenize the line
-    std::vector<std::string> words = Tokenizer(line).as_vector();
+    for (int i = 0; i < n; i++) {
+      // if the last chunk has finished
+      if (iatom_chunk == 0) {
+          read_buf(&natom_chunk, sizeof(int), 1);
+          read_double_chunk(natom_chunk);
+          natom_chunk /= size_one;
+          m = 0;
+      }
 
-    if ((int)words.size() < nwords) error->one(FLERR,"Insufficient columns in dump file");
+      // read one line of atom
+      double *words = &databuf[m];
 
-    // convert selected fields to floats
+      for (int k = 0; k < nfield; k++)
+        fields[i][k] = words[fieldindex[k]];
 
-    for (m = 0; m < nfield; m++)
-      fields[i][m] = atof(words[fieldindex[m]].c_str());
+      m += size_one;
+
+      iatom_chunk++;
+
+      // hit the end of current chunk
+      if (iatom_chunk == natom_chunk) {
+        iatom_chunk = 0;
+        ichunk++;
+      }
+    }
+  } else {
+    for (int i = 0; i < n; i++) {
+      utils::sfgets(FLERR, line, MAXLINE, fp, nullptr, error);
+
+      // tokenize the line
+      std::vector<std::string> words = Tokenizer(line).as_vector();
+
+      if ((int)words.size() < nwords) error->one(FLERR,"Insufficient columns in dump file");
+
+      // convert selected fields to floats
+
+      for (int m = 0; m < nfield; m++)
+        fields[i][m] = atof(words[fieldindex[m]].c_str());
+    }
   }
 }
 
@@ -352,8 +523,41 @@ int ReaderNative::find_label(const std::string &label, const std::map<std::strin
 
 void ReaderNative::read_lines(int n)
 {
-  char *eof = nullptr;
-  if (n <= 0) return;
-  for (int i = 0; i < n; i++) eof = fgets(line,MAXLINE,fp);
-  if (eof == nullptr) error->one(FLERR,"Unexpected end of dump file");
+  for (int i = 0; i < n; i++) {
+    utils::sfgets(FLERR, line, MAXLINE, fp, nullptr, error);
+  }
+}
+
+void ReaderNative::read_buf(void * ptr, size_t size, size_t count)
+{
+  utils::sfread(FLERR, ptr, size, count, fp, nullptr, error);
+}
+
+std::string ReaderNative::read_binary_str(size_t size)
+{
+  std::string str(size, '\0');
+  read_buf(&str[0], sizeof(char), size);
+  return str;
+}
+
+void ReaderNative::read_double_chunk(size_t count)
+{
+  // extend buffer to fit chunk size
+  if (count > maxbuf) {
+    memory->grow(databuf,count,"reader:databuf");
+    maxbuf = count;
+  }
+  read_buf(databuf, sizeof(double), count);
+}
+
+void ReaderNative::skip_buf(size_t size)
+{
+  bigint pos = platform::ftell(fp);
+  pos += size;
+  platform::fseek(fp,pos);
+}
+
+bool ReaderNative::is_known_magic_str() const
+{
+  return magic_string == "DUMPATOM" || magic_string == "DUMPCUSTOM";
 }
diff --git a/src/reader_native.h b/src/reader_native.h
index 6b171ce77a..f888509dfb 100644
--- a/src/reader_native.h
+++ b/src/reader_native.h
@@ -24,6 +24,7 @@ ReaderStyle(native,ReaderNative);
 
 #include "reader.h"
 
+#include <string>
 #include <map>
 
 namespace LAMMPS_NS {
@@ -40,13 +41,32 @@ class ReaderNative : public Reader {
   void read_atoms(int, int, double **);
 
  private:
-  char *line;    // line read from dump file
+  int revision;
 
+  std::string magic_string;
+  std::string unit_style;
+  int *fieldindex;
+
+  char *line;         // line read from dump file
+  double *databuf;    // buffer for binary data
   int nwords;         // # of per-atom columns in dump file
-  int *fieldindex;    //
+
+  int size_one;       // number of double for one atom
+  size_t maxbuf;      // maximum buffer size
+  int nchunk;         // number of chunks in the binary file
+  int ichunk;         // index of current reading chunk
+  int natom_chunk;    // number of atoms in the current chunks
+  int iatom_chunk;    // index of current atom in the current chunk
 
   int find_label(const std::string &label, const std::map<std::string, int> &labels);
   void read_lines(int);
+
+  void read_buf(void *, size_t, size_t);
+  void read_double_chunk(size_t);
+  void skip_buf(size_t);
+  void skip_reading_magic_str();
+  bool is_known_magic_str() const;
+  std::string read_binary_str(size_t);
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/update.cpp b/src/update.cpp
index 95dc47573e..da412ee89e 100644
--- a/src/update.cpp
+++ b/src/update.cpp
@@ -458,7 +458,7 @@ Min *Update::minimize_creator(LAMMPS *lmp)
 }
 
 /* ----------------------------------------------------------------------
-   reset timestep as called from input script
+   reset timestep called from input script
 ------------------------------------------------------------------------- */
 
 void Update::reset_timestep(int narg, char **arg)
@@ -470,24 +470,33 @@ void Update::reset_timestep(int narg, char **arg)
 
 /* ----------------------------------------------------------------------
    reset timestep
-   called from rerun command and input script (indirectly)
+   called from input script (indirectly) or rerun command
 ------------------------------------------------------------------------- */
 
 void Update::reset_timestep(bigint newstep)
 {
+  if (newstep < 0) error->all(FLERR,"Timestep must be >= 0");
+
+  bigint oldstep = ntimestep;
   ntimestep = newstep;
-  if (ntimestep < 0) error->all(FLERR,"Timestep must be >= 0");
 
-  // set atimestep to new timestep
-  // so future update_time() calls will be correct
+  // if newstep >= oldstep, update simulation time accordingly
+  // if newstep < oldstep, zero simulation time
 
-  atimestep = ntimestep;
+  if (newstep >= oldstep) update_time();
 
-  // trigger reset of timestep for output
-  // do not allow any timestep-dependent fixes to be already defined
+  if (newstep < oldstep) {
+    atime = 0.0;
+    atimestep = newstep;
+  }
+
+  // changes to output that depend on timestep
+  // no active dumps allowed
 
   output->reset_timestep(ntimestep);
 
+  // do not allow timestep-dependent fixes to be defined
+
   for (const auto &ifix : modify->get_fix_list())
     if (ifix->time_depend)
       error->all(FLERR, "Cannot reset timestep with time-dependent fix {} defined",ifix->style);
@@ -508,7 +517,7 @@ void Update::reset_timestep(bigint newstep)
     if (icompute->timeflag) icompute->clearstep();
   }
 
-  // Neighbor Bin/Stencil/Pair classes store timestamps that need to be cleared
+  // neighbor Bin/Stencil/Pair classes store timestamps that need to be cleared
 
   neighbor->reset_timestep(ntimestep);
 }
diff --git a/src/utils.cpp b/src/utils.cpp
index f70a60da7c..eabe86adbc 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -298,12 +298,9 @@ std::string utils::check_packages_for_style(const std::string &style, const std:
    called by various commands to check validity of their arguments
 ------------------------------------------------------------------------- */
 
-int utils::logical(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+int utils::logical(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp)
 {
-  int n = 0;
-
-  if (str) n = strlen(str);
-  if (n == 0) {
+  if (str.empty()) {
     const char msg[] = "Expected boolean parameter instead of NULL or empty string "
                        "in input script or data file";
     if (do_abort)
@@ -332,18 +329,28 @@ int utils::logical(const char *file, int line, const char *str, bool do_abort, L
   return rv;
 }
 
+/* ----------------------------------------------------------------------
+   wrapper for logical() that accepts a char pointer instead of a string
+------------------------------------------------------------------------- */
+
+int utils::logical(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+{
+  if (str)
+    return logical(file, line, std::string(str), do_abort, lmp);
+  else
+    return logical(file, line, std::string(""), do_abort, lmp);
+}
+
 /* ----------------------------------------------------------------------
    read a floating point value from a string
    generate an error if not a legitimate floating point value
    called by various commands to check validity of their arguments
 ------------------------------------------------------------------------- */
 
-double utils::numeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+double utils::numeric(const char *file, int line, const std::string &str, bool do_abort,
+                      LAMMPS *lmp)
 {
-  int n = 0;
-
-  if (str) n = strlen(str);
-  if (n == 0) {
+  if (str.empty()) {
     const char msg[] = "Expected floating point parameter instead of"
                        " NULL or empty string in input script or data file";
     if (do_abort)
@@ -367,18 +374,27 @@ double utils::numeric(const char *file, int line, const char *str, bool do_abort
   return atof(buf.c_str());
 }
 
+/* ----------------------------------------------------------------------
+   wrapper for numeric() that accepts a char pointer instead of a string
+------------------------------------------------------------------------- */
+
+double utils::numeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+{
+  if (str)
+    return numeric(file, line, std::string(str), do_abort, lmp);
+  else
+    return numeric(file, line, std::string(""), do_abort, lmp);
+}
+
 /* ----------------------------------------------------------------------
    read an integer value from a string
    generate an error if not a legitimate integer value
    called by various commands to check validity of their arguments
 ------------------------------------------------------------------------- */
 
-int utils::inumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+int utils::inumeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp)
 {
-  int n = 0;
-
-  if (str) n = strlen(str);
-  if (n == 0) {
+  if (str.empty()) {
     const char msg[] = "Expected integer parameter instead of"
                        " NULL or empty string in input script or data file";
     if (do_abort)
@@ -402,18 +418,28 @@ int utils::inumeric(const char *file, int line, const char *str, bool do_abort,
   return atoi(buf.c_str());
 }
 
+/* ----------------------------------------------------------------------
+   wrapper for inumeric() that accepts a char pointer instead of a string
+------------------------------------------------------------------------- */
+
+int utils::inumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+{
+  if (str)
+    return inumeric(file, line, std::string(str), do_abort, lmp);
+  else
+    return inumeric(file, line, std::string(""), do_abort, lmp);
+}
+
 /* ----------------------------------------------------------------------
    read a big integer value from a string
    generate an error if not a legitimate integer value
    called by various commands to check validity of their arguments
 ------------------------------------------------------------------------- */
 
-bigint utils::bnumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+bigint utils::bnumeric(const char *file, int line, const std::string &str, bool do_abort,
+                       LAMMPS *lmp)
 {
-  int n = 0;
-
-  if (str) n = strlen(str);
-  if (n == 0) {
+  if (str.empty()) {
     const char msg[] = "Expected integer parameter instead of"
                        " NULL or empty string in input script or data file";
     if (do_abort)
@@ -437,18 +463,28 @@ bigint utils::bnumeric(const char *file, int line, const char *str, bool do_abor
   return ATOBIGINT(buf.c_str());
 }
 
+/* ----------------------------------------------------------------------
+   wrapper for bnumeric() that accepts a char pointer instead of a string
+------------------------------------------------------------------------- */
+
+bigint utils::bnumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+{
+  if (str)
+    return bnumeric(file, line, std::string(str), do_abort, lmp);
+  else
+    return bnumeric(file, line, std::string(""), do_abort, lmp);
+}
+
 /* ----------------------------------------------------------------------
    read a tag integer value from a string
    generate an error if not a legitimate integer value
    called by various commands to check validity of their arguments
 ------------------------------------------------------------------------- */
 
-tagint utils::tnumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+tagint utils::tnumeric(const char *file, int line, const std::string &str, bool do_abort,
+                       LAMMPS *lmp)
 {
-  int n = 0;
-
-  if (str) n = strlen(str);
-  if (n == 0) {
+  if (str.empty()) {
     const char msg[] = "Expected integer parameter instead of"
                        " NULL or empty string in input script or data file";
     if (do_abort)
@@ -472,6 +508,18 @@ tagint utils::tnumeric(const char *file, int line, const char *str, bool do_abor
   return ATOTAGINT(buf.c_str());
 }
 
+/* ----------------------------------------------------------------------
+   wrapper for tnumeric() that accepts a char pointer instead of a string
+------------------------------------------------------------------------- */
+
+tagint utils::tnumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp)
+{
+  if (str)
+    return tnumeric(file, line, std::string(str), do_abort, lmp);
+  else
+    return tnumeric(file, line, std::string(""), do_abort, lmp);
+}
+
 /* ----------------------------------------------------------------------
    compute bounds implied by numeric str with a possible wildcard asterisk
 ------------------------------------------------------------------------- */
diff --git a/src/utils.h b/src/utils.h
index 1feee26f27..47a4ace5f9 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -169,6 +169,17 @@ namespace utils {
    *  \param lmp      pointer to top-level LAMMPS class instance
    *  \return         1 if string resolves to "true", otherwise 0 */
 
+  int logical(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp);
+
+  /*! \overload
+   *
+   *  \param file     name of source file for error message
+   *  \param line     line number in source file for error message
+   *  \param str      string to be converted to logical
+   *  \param do_abort determines whether to call Error::one() or Error::all()
+   *  \param lmp      pointer to top-level LAMMPS class instance
+   *  \return         1 if string resolves to "true", otherwise 0 */
+
   int logical(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp);
 
   /*! Convert a string to a floating point number while checking
@@ -181,6 +192,17 @@ namespace utils {
    *  \param lmp      pointer to top-level LAMMPS class instance
    *  \return         double precision floating point number */
 
+  double numeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp);
+
+  /*! \overload
+   *
+   *  \param file     name of source file for error message
+   *  \param line     line number in source file for error message
+   *  \param str      string to be converted to number
+   *  \param do_abort determines whether to call Error::one() or Error::all()
+   *  \param lmp      pointer to top-level LAMMPS class instance
+   *  \return         double precision floating point number */
+
   double numeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp);
 
   /*! Convert a string to an integer number while checking
@@ -193,6 +215,17 @@ namespace utils {
    *  \param lmp      pointer to top-level LAMMPS class instance
    *  \return         integer number (regular int)  */
 
+  int inumeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp);
+
+  /*! \overload
+   *
+   *  \param file     name of source file for error message
+   *  \param line     line number in source file for error message
+   *  \param str      string to be converted to number
+   *  \param do_abort determines whether to call Error::one() or Error::all()
+   *  \param lmp      pointer to top-level LAMMPS class instance
+   *  \return         double precision floating point number */
+
   int inumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp);
 
   /*! Convert a string to an integer number while checking
@@ -205,6 +238,17 @@ namespace utils {
    *  \param lmp      pointer to top-level LAMMPS class instance
    *  \return         integer number (bigint) */
 
+  bigint bnumeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp);
+
+  /*! \overload
+   *
+   *  \param file     name of source file for error message
+   *  \param line     line number in source file for error message
+   *  \param str      string to be converted to number
+   *  \param do_abort determines whether to call Error::one() or Error::all()
+   *  \param lmp      pointer to top-level LAMMPS class instance
+   *  \return         double precision floating point number */
+
   bigint bnumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp);
 
   /*! Convert a string to an integer number while checking
@@ -217,6 +261,17 @@ namespace utils {
    * \param lmp      pointer to top-level LAMMPS class instance
    * \return         integer number (tagint) */
 
+  tagint tnumeric(const char *file, int line, const std::string &str, bool do_abort, LAMMPS *lmp);
+
+  /*! \overload
+   *
+   *  \param file     name of source file for error message
+   *  \param line     line number in source file for error message
+   *  \param str      string to be converted to number
+   *  \param do_abort determines whether to call Error::one() or Error::all()
+   *  \param lmp      pointer to top-level LAMMPS class instance
+   *  \return         double precision floating point number */
+
   tagint tnumeric(const char *file, int line, const char *str, bool do_abort, LAMMPS *lmp);
 
   /*! Compute index bounds derived from a string with a possible wildcard
diff --git a/src/verlet.cpp b/src/verlet.cpp
index aa180f5644..342dc3d951 100644
--- a/src/verlet.cpp
+++ b/src/verlet.cpp
@@ -54,7 +54,7 @@ void Verlet::init()
 
   bool do_time_integrate = false;
   for (const auto &fix : modify->get_fix_list())
-    if (fix->time_integrate) do_time_integrate;
+    if (fix->time_integrate) do_time_integrate = true;
 
   if (!do_time_integrate && (comm->me == 0))
     error->warning(FLERR,"No fixes with time integration, atoms won't move");
diff --git a/src/version.h b/src/version.h
index 21f49398e0..c33ca6ee15 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1 +1 @@
-#define LAMMPS_VERSION "27 Oct 2021"
+#define LAMMPS_VERSION "7 Jan 2022"
diff --git a/src/write_restart.cpp b/src/write_restart.cpp
index 661585605d..41532feaf3 100644
--- a/src/write_restart.cpp
+++ b/src/write_restart.cpp
@@ -73,9 +73,12 @@ void WriteRestart::command(int narg, char **arg)
 
   if (strchr(arg[0],'%')) multiproc = nprocs;
   else multiproc = 0;
-  if (strstr(arg[0],".mpiio")) mpiioflag = 1;
+  if (utils::strmatch(arg[0],"\\.mpiio$")) mpiioflag = 1;
   else mpiioflag = 0;
 
+  if ((comm->me == 0) && mpiioflag)
+    error->warning(FLERR,"MPI-IO output is unmaintained and unreliable. Use with caution.");
+
   // setup output style and process optional args
   // also called by Output class for periodic restart files
 
diff --git a/tools/lammps-shell/.clang-format b/tools/lammps-shell/.clang-format
new file mode 120000
index 0000000000..a20dd6aac4
--- /dev/null
+++ b/tools/lammps-shell/.clang-format
@@ -0,0 +1 @@
+../../unittest/.clang-format
\ No newline at end of file
diff --git a/tools/lammps-shell/lammps-shell.cpp b/tools/lammps-shell/lammps-shell.cpp
index d03e1da70b..19730b8552 100644
--- a/tools/lammps-shell/lammps-shell.cpp
+++ b/tools/lammps-shell/lammps-shell.cpp
@@ -10,8 +10,8 @@
 #include "utils.h"
 
 #include <cstring>
-#include <iostream>
 #include <fstream>
+#include <iostream>
 #include <string>
 #include <vector>
 
@@ -33,9 +33,9 @@
 
 using namespace LAMMPS_NS;
 
-void *lmp = nullptr;
-char *omp_threads = nullptr;
-constexpr int BUFLEN  = 512;
+void *lmp            = nullptr;
+char *omp_threads    = nullptr;
+constexpr int BUFLEN = 512;
 char buf[BUFLEN];
 
 enum {
@@ -342,14 +342,13 @@ static char *plugin_generator(const char *text, int state)
 {
     const char *subcmd[] = {"load", "unload", "list", "clear", nullptr};
     const char *sub;
-    static std::size_t idx=0, len;
+    static std::size_t idx = 0, len;
     if (!state) idx = 0;
     len = strlen(text);
 
     while ((sub = subcmd[idx]) != nullptr) {
         ++idx;
-        if (strncmp(text,sub,len) == 0)
-            return dupstring(sub);
+        if (strncmp(text, sub, len) == 0) return dupstring(sub);
     }
     return nullptr;
 }
@@ -358,13 +357,12 @@ static char *plugin_style_generator(const char *text, int state)
 {
     const char *styles[] = {"pair", "fix", "command", nullptr};
     const char *s;
-    static std::size_t idx=0, len;
+    static std::size_t idx = 0, len;
     if (!state) idx = 0;
     len = strlen(text);
     while ((s = styles[idx]) != nullptr) {
         ++idx;
-        if (strncmp(text,s,len) == 0)
-            return dupstring(s);
+        if (strncmp(text, s, len) == 0) return dupstring(s);
     }
     return nullptr;
 }
@@ -376,7 +374,7 @@ static char *plugin_name_generator(const char *text, int state)
 
     static std::size_t idx, len, nmax;
     if (!state) idx = 0;
-    len = words[3].size();
+    len  = words[3].size();
     nmax = lammps_plugin_count();
 
     while (idx < nmax) {
@@ -384,8 +382,7 @@ static char *plugin_name_generator(const char *text, int state)
         lammps_plugin_name(idx, style, name, BUFLEN);
         ++idx;
         if (words[2] == style) {
-            if (strncmp(name, words[3].c_str(), len) == 0)
-                return dupstring(name);
+            if (strncmp(name, words[3].c_str(), len) == 0) return dupstring(name);
         }
     }
     return nullptr;
@@ -527,13 +524,11 @@ static char **cmd_completion(const char *text, int start, int)
         } else if (words.size() == 2) { // expand third word
 
             // these commands have a group name as 3rd word
-            if ((words[0] == "fix")
-                || (words[0] == "compute")
-                || (words[0] == "dump")) {
+            if ((words[0] == "fix") || (words[0] == "compute") || (words[0] == "dump")) {
                 matches = rl_completion_matches(text, group_generator);
             } else if (words[0] == "region") {
                 matches = rl_completion_matches(text, region_generator);
-            // plugin style is the third word
+                // plugin style is the third word
             } else if ((words[0] == "plugin") && (words[1] == "unload")) {
                 matches = rl_completion_matches(text, plugin_style_generator);
             }
@@ -546,7 +541,7 @@ static char **cmd_completion(const char *text, int start, int)
                 matches = rl_completion_matches(text, compute_generator);
             } else if (words[0] == "dump") {
                 matches = rl_completion_matches(text, dump_generator);
-            // plugin name is the fourth word
+                // plugin name is the fourth word
             } else if ((words[0] == "plugin") && (words[1] == "unload")) {
                 matches = rl_completion_matches(rl_line_buffer, plugin_name_generator);
             }
@@ -599,7 +594,7 @@ static void init_commands()
     // read saved history, but not in test mode.
     if (!test_mode) read_history(".lammps_history");
 
-    // intercept CTRL-C
+        // intercept CTRL-C
 #if defined(_WIN32)
     SetConsoleCtrlHandler(ctrl_c_handler, TRUE);
 #else
@@ -736,7 +731,7 @@ int main(int argc, char **argv)
     // switch to the user's documents directory.
 
     auto curdir = platform::current_directory();
-    if (utils::strmatch(curdir,"[Ss]ystem32")) {
+    if (utils::strmatch(curdir, "[Ss]ystem32")) {
         std::string docdir = getenv("HOMEDRIVE");
         docdir += getenv("HOMEPATH");
         docdir += "\\Documents";
diff --git a/tools/offline/scripts/use_git_cache.sh b/tools/offline/scripts/use_git_cache.sh
index 46e42ca7bd..e65c06c8ed 100644
--- a/tools/offline/scripts/use_git_cache.sh
+++ b/tools/offline/scripts/use_git_cache.sh
@@ -8,12 +8,12 @@ fi
 
 export GIT_CONFIG_COUNT=1
 export GIT_CONFIG_KEY_0="url.$GITHUB_PROXY_DIR/.insteadOf"
-export GIT_CONFIG_VALUE_0=git://github.com/
+export GIT_CONFIG_VALUE_0=https://github.com/
 
-echo "Redirecting git://github.com urls to local cache..."
+echo "Redirecting https://github.com urls to local cache..."
 
 function deactivate_git_cache {
-    echo "Removing git://github.com redirect..."
+    echo "Removing https://github.com redirect..."
     unset GIT_CONFIG_COUNT
     unset GIT_CONFIG_KEY_0
     unset GIT_CONFIG_VALUE_0
diff --git a/tools/offline/use_caches.sh b/tools/offline/use_caches.sh
index 71d955fe4c..5db78fd765 100644
--- a/tools/offline/use_caches.sh
+++ b/tools/offline/use_caches.sh
@@ -63,7 +63,7 @@ echo "or"
 echo
 echo "-D LAMMPS_DOWNLOADS_URL=${HTTP_CACHE_URL} -C \"${LAMMPS_HTTP_CACHE_CONFIG}\""
 echo
-echo "pip installations and git clones (from git://) are automatically redirected"
+echo "pip installations and git clones (from https://) are automatically redirected"
 echo
 echo Use 'deactivate_caches' to revert changes
 echo
diff --git a/tools/replica/reorder_remd_traj.py b/tools/replica/reorder_remd_traj.py
index 5033ae1e53..205e5a34f9 100644
--- a/tools/replica/reorder_remd_traj.py
+++ b/tools/replica/reorder_remd_traj.py
@@ -206,7 +206,7 @@ def get_byte_index(rep_inds, byteindfns, intrajfns):
         # close the trajfile object
         fobj.close()
 
-        return
+    return
 
 
 def write_reordered_traj(temp_inds, byte_inds, outtemps, temps,
@@ -459,6 +459,8 @@ if __name__ == "__main__":
 
     # get (unordered) trajectories
     temps = np.loadtxt(tempfn)
+    if not out_temps:
+        out_temps = temps
     ntemps = len(temps)
     intrajfns = ["%s.%d.lammpstrj" % (traj_prefix, k) for k in range(ntemps)]
     # check if the trajs. (or their zipped versions are present)
diff --git a/tools/singularity/README.md b/tools/singularity/README.md
index bd13d75ad7..db7aa9e3b0 100644
--- a/tools/singularity/README.md
+++ b/tools/singularity/README.md
@@ -15,7 +15,7 @@ built CentOS 7.x singularity container.
 
 ```
 cd some/work/directory
-git clone --depth 500  git://github.com/lammps/lammps.git lammps
+git clone --depth 500  https://github.com/lammps/lammps.git lammps
 mkdir build-centos7
 cd build-centos7
 sudo singularity build centos7.sif ../tools/singularity/centos7.def
diff --git a/tools/singularity/centos7.def b/tools/singularity/centos7.def
index 8a3235b58f..f418dc8bf0 100644
--- a/tools/singularity/centos7.def
+++ b/tools/singularity/centos7.def
@@ -11,7 +11,8 @@ From: centos:7
             hdf5-devel python36-virtualenv python36-pip python-pip readline-devel \
             netcdf-devel netcdf-cxx-devel netcdf-mpich-devel netcdf-openmpi-devel \
             python-virtualenv fftw-devel voro++-devel eigen3-devel gsl-devel openblas-devel enchant \
-            blas-devel lapack-devel libyaml-devel openkim-models kim-api-devel zstd libzstd-devel
+            blas-devel lapack-devel libyaml-devel openkim-models kim-api-devel zstd libzstd-devel \
+            yaml-cpp-devel
         yum clean all
 
         # we need to reset any module variables
@@ -36,7 +37,7 @@ From: centos:7
         # manually install Plumed
         mkdir plumed
         cd plumed
-        version=2.7.2
+        version=2.7.3
         curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
         tar -xzf plumed.tar.gz
         cd plumed-${version}
diff --git a/tools/singularity/centos8.def b/tools/singularity/centos8.def
index e35f97f453..aa6d998d4e 100644
--- a/tools/singularity/centos8.def
+++ b/tools/singularity/centos8.def
@@ -17,7 +17,7 @@ From: centos:8
                texlive-framed texlive-wrapfig texlive-upquote texlive-capt-of \
                texlive-needspace texlive-titlesec texlive-anysize texlive-dvipng \
                blas-devel lapack-devel libyaml-devel openkim-models kim-api-devel \
-               zstd libzstd-devel
+               zstd libzstd-devel yaml-cpp-devel
         dnf clean all
 
         # we need to reset any module variables
@@ -42,7 +42,7 @@ From: centos:8
         # manually install Plumed
         mkdir plumed
         cd plumed
-        version=2.7.2
+        version=2.7.3
         curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
         tar -xzf plumed.tar.gz
         cd plumed-${version}
diff --git a/tools/singularity/fedora34_mingw.def b/tools/singularity/fedora34_mingw.def
index 40e6f72861..8a1f108aec 100644
--- a/tools/singularity/fedora34_mingw.def
+++ b/tools/singularity/fedora34_mingw.def
@@ -38,7 +38,7 @@ From: fedora:34
                texlive-framed texlive-wrapfig texlive-upquote texlive-capt-of \
                texlive-needspace texlive-titlesec texlive-anysize texlive-dvipng texlive-xindy \
                blas-devel lapack-devel libyaml-devel openkim-models kim-api-devel \
-               zstd libzstd-devel
+               zstd libzstd-devel yaml-cpp-devel
         dnf clean all
 
         # enable Lmod and load MPI
@@ -49,7 +49,7 @@ From: fedora:34
         # manually install Plumed
         mkdir plumed
         cd plumed
-        version=2.7.1
+        version=2.7.3
         curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
         tar -xzf plumed.tar.gz
         cd plumed-${version}
diff --git a/tools/singularity/rocky8.def b/tools/singularity/rocky8.def
index 0827b1d548..ea74a50339 100644
--- a/tools/singularity/rocky8.def
+++ b/tools/singularity/rocky8.def
@@ -17,6 +17,7 @@ From: rockylinux/rockylinux:8
                texlive-framed texlive-wrapfig texlive-upquote texlive-capt-of \
                texlive-needspace texlive-titlesec texlive-anysize texlive-dvipng \
                blas-devel lapack-devel libyaml-devel openkim-models kim-api-devel \
+               yaml-cpp-devel \
                zstd libzstd-devel
         dnf clean all
 
@@ -42,7 +43,7 @@ From: rockylinux/rockylinux:8
         # manually install Plumed
         mkdir plumed
         cd plumed
-        version=2.7.2
+        version=2.7.3
         curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
         tar -xzf plumed.tar.gz
         cd plumed-${version}
diff --git a/tools/singularity/ubuntu18.04.def b/tools/singularity/ubuntu18.04.def
index 35247d8e2a..1364b8cc4b 100644
--- a/tools/singularity/ubuntu18.04.def
+++ b/tools/singularity/ubuntu18.04.def
@@ -75,6 +75,7 @@ From: ubuntu:18.04
         valgrind \
         gdb \
         zstd \
+        libyaml-cpp-dev \
         libkim-api-dev \
         openkim-models
 
@@ -105,7 +106,7 @@ From: ubuntu:18.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.7.2
+    export PLUMED_PKG_VERSION=2.7.3
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu18.04_amd_rocm.def b/tools/singularity/ubuntu18.04_amd_rocm.def
index 407cda1250..a99f4931ef 100644
--- a/tools/singularity/ubuntu18.04_amd_rocm.def
+++ b/tools/singularity/ubuntu18.04_amd_rocm.def
@@ -3,16 +3,29 @@ From: ubuntu:18.04
 
 %environment
     export PATH=/usr/lib/ccache:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.5.0/llvm/lib
 %post
     export DEBIAN_FRONTEND=noninteractive
     apt-get update
     apt-get upgrade --no-install-recommends -y
 
-    apt-get install -y --no-install-recommends curl libnuma-dev gnupg
+    apt-get install -y --no-install-recommends curl wget libnuma-dev gnupg ca-certificates
+    apt-get install --no-install-recommends -y software-properties-common
 
-    curl -sL http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | apt-key add -
-    printf "deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main" > /etc/apt/sources.list.d/rocm.list
+    ###########################################################################
+    # Latest CMake
+    ###########################################################################
+
+    wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
+    apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
+    apt update
+    apt install -y cmake
+
+    ###########################################################################
+    # ROCm 4.5
+    ###########################################################################
+    wget https://repo.radeon.com/amdgpu-install/21.40/ubuntu/focal/amdgpu-install-21.40.40500-1_all.deb
+    apt-get install -y ./amdgpu-install-21.40.40500-1_all.deb
 
     apt-get update
     apt-get install --no-install-recommends -y \
@@ -20,11 +33,18 @@ From: ubuntu:18.04
         file \
         sudo \
         libelf1 \
-        rocm-dev \
-        rocm-libs \
         build-essential
 
-    apt-get install --no-install-recommends -y software-properties-common
+    amdgpu-install --usecase=rocm --no-dkms -y
+
+    ###########################################################################
+    # ROCm hipCUB
+    ###########################################################################
+    apt-get install -y hipcub-dev
+
+    ###########################################################################
+    # Common Software
+    ###########################################################################
     add-apt-repository ppa:openkim/latest
     apt-get update
     apt-get install --no-install-recommends -y \
@@ -85,24 +105,11 @@ From: ubuntu:18.04
         valgrind \
         gdb \
         zstd \
+        libyaml-cpp-dev \
         libkim-api-dev \
         openkim-models
 
 
-    ###########################################################################
-    # ROCm hipCUB
-    ###########################################################################
-
-    export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
-    mkdir hipCUB/build
-    cd hipCUB/build
-    CXX=hipcc cmake -D BUILD_TEST=off ..
-    make
-    make package
-    make install
-
-
     ###########################################################################
     # KIM-API
     ###########################################################################
@@ -129,7 +136,7 @@ From: ubuntu:18.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.7.2
+    export PLUMED_PKG_VERSION=2.7.3
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu18.04_gpu.def b/tools/singularity/ubuntu18.04_gpu.def
index 9a1d37792e..cc02280d8a 100644
--- a/tools/singularity/ubuntu18.04_gpu.def
+++ b/tools/singularity/ubuntu18.04_gpu.def
@@ -2,20 +2,33 @@ BootStrap: docker
 From: ubuntu:18.04
 
 %environment
-    export PATH=/usr/lib/ccache:/usr/local/cuda-11.4/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    export CUDADIR=/usr/local/cuda-11.4
-    export CUDA_PATH=/usr/local/cuda-11.4
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.4/lib64:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
-    export LIBRARY_PATH=/usr/local/cuda-11.4/lib64/stubs
+    export PATH=/usr/lib/ccache:/usr/local/cuda-11.5/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
+    export CUDADIR=/usr/local/cuda-11.5
+    export CUDA_PATH=/usr/local/cuda-11.5
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.5/lib64:/opt/rocm/lib:/opt/rocm-4.5.0/llvm/lib
+    export LIBRARY_PATH=/usr/local/cuda-11.5/lib64/stubs
 %post
     export DEBIAN_FRONTEND=noninteractive
     apt-get update
     apt-get upgrade --no-install-recommends -y
 
-    apt-get install -y --no-install-recommends curl libnuma-dev gnupg
+    apt-get install -y --no-install-recommends curl wget libnuma-dev gnupg ca-certificates
+    apt-get install --no-install-recommends -y software-properties-common
 
-    curl -sL http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | apt-key add -
-    printf "deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main" > /etc/apt/sources.list.d/rocm.list
+    ###########################################################################
+    # Latest CMake
+    ###########################################################################
+
+    wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
+    apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
+    apt update
+    apt install -y cmake
+
+    ###########################################################################
+    # ROCm 4.5
+    ###########################################################################
+    wget https://repo.radeon.com/amdgpu-install/21.40/ubuntu/focal/amdgpu-install-21.40.40500-1_all.deb
+    apt-get install -y ./amdgpu-install-21.40.40500-1_all.deb
 
     apt-get update
     apt-get install --no-install-recommends -y \
@@ -23,11 +36,18 @@ From: ubuntu:18.04
         file \
         sudo \
         libelf1 \
-        rocm-dev \
-        rocm-libs \
         build-essential
 
-    apt-get install --no-install-recommends -y software-properties-common
+    amdgpu-install --usecase=rocm --no-dkms -y
+
+    ###########################################################################
+    # ROCm hipCUB
+    ###########################################################################
+    apt-get install -y hipcub-dev
+
+    ###########################################################################
+    # Common Software
+    ###########################################################################
     add-apt-repository ppa:openkim/latest
     apt-get update
     apt-get install --no-install-recommends -y \
@@ -90,6 +110,7 @@ From: ubuntu:18.04
         valgrind \
         gdb \
         zstd \
+        libyaml-cpp-dev \
         libkim-api-dev \
         openkim-models
 
@@ -104,7 +125,7 @@ From: ubuntu:18.04
     add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
     apt-get update
 
-    export CUDA_PKG_VERSION=11.4
+    export CUDA_PKG_VERSION=11.5
 
     apt-get install -y --no-install-recommends \
         cuda-libraries-${CUDA_PKG_VERSION} \
@@ -125,19 +146,6 @@ From: ubuntu:18.04
     mkdir -p /etc/OpenCL/vendors
     echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
 
-    ###########################################################################
-    # ROCm hipCUB
-    ###########################################################################
-
-    export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
-    mkdir hipCUB/build
-    cd hipCUB/build
-    CXX=hipcc cmake -D BUILD_TEST=off ..
-    make
-    make package
-    make install
-
 
     ###########################################################################
     # KIM-API
@@ -165,7 +173,7 @@ From: ubuntu:18.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.7.2
+    export PLUMED_PKG_VERSION=2.7.3
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu18.04_intel_opencl.def b/tools/singularity/ubuntu18.04_intel_opencl.def
index 95c744c67d..3bb8b990d1 100644
--- a/tools/singularity/ubuntu18.04_intel_opencl.def
+++ b/tools/singularity/ubuntu18.04_intel_opencl.def
@@ -69,6 +69,7 @@ From: ubuntu:18.04
         valgrind \
         gdb \
         zstd \
+        libyaml-cpp-dev \
         libkim-api-dev \
         openkim-models
 
@@ -106,7 +107,7 @@ From: ubuntu:18.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.7.2
+    export PLUMED_PKG_VERSION=2.7.3
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu18.04_nvidia.def b/tools/singularity/ubuntu18.04_nvidia.def
index 359e1d1c4d..a97e4a52ec 100644
--- a/tools/singularity/ubuntu18.04_nvidia.def
+++ b/tools/singularity/ubuntu18.04_nvidia.def
@@ -1,5 +1,5 @@
 BootStrap: docker
-From: nvidia/cuda:11.4.1-devel-ubuntu18.04
+From: nvidia/cuda:11.4.2-devel-ubuntu18.04
 
 %post
     export DEBIAN_FRONTEND=noninteractive
@@ -69,6 +69,7 @@ From: nvidia/cuda:11.4.1-devel-ubuntu18.04
         valgrind \
         gdb \
         zstd \
+        libyaml-cpp-dev \
         libkim-api-dev \
         openkim-models
 
@@ -105,7 +106,7 @@ From: nvidia/cuda:11.4.1-devel-ubuntu18.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.7.2
+    export PLUMED_PKG_VERSION=2.7.3
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu20.04.def b/tools/singularity/ubuntu20.04.def
index f85d3ca614..53690fd1b9 100644
--- a/tools/singularity/ubuntu20.04.def
+++ b/tools/singularity/ubuntu20.04.def
@@ -71,6 +71,7 @@ From: ubuntu:20.04
         valgrind \
         gdb \
         zstd \
+        libyaml-cpp-dev \
         libkim-api-dev \
         openkim-models
 
@@ -100,7 +101,7 @@ From: ubuntu:20.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.7.2
+    export PLUMED_PKG_VERSION=2.7.3
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu20.04_amd_rocm.def b/tools/singularity/ubuntu20.04_amd_rocm.def
index 2b4176f183..e52ff02798 100644
--- a/tools/singularity/ubuntu20.04_amd_rocm.def
+++ b/tools/singularity/ubuntu20.04_amd_rocm.def
@@ -3,17 +3,19 @@ From: ubuntu:20.04
 
 %environment
     export PATH=/usr/lib/ccache:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.5.0/llvm/lib
 %post
     export DEBIAN_FRONTEND=noninteractive
     apt-get update
     apt-get upgrade --no-install-recommends -y
 
-    apt-get install -y --no-install-recommends curl libnuma-dev gnupg
+    apt-get install -y --no-install-recommends curl wget libnuma-dev gnupg ca-certificates
 
-    curl -sL http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | apt-key add -
-    # AMD is using xenial folder also for focal
-    printf "deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main" > /etc/apt/sources.list.d/rocm.list
+    ###########################################################################
+    # ROCm 4.5
+    ###########################################################################
+    wget https://repo.radeon.com/amdgpu-install/21.40/ubuntu/focal/amdgpu-install-21.40.40500-1_all.deb
+    apt-get install -y ./amdgpu-install-21.40.40500-1_all.deb
 
     apt-get update
     apt-get install --no-install-recommends -y \
@@ -21,10 +23,18 @@ From: ubuntu:20.04
         file \
         sudo \
         libelf1 \
-        rocm-dev \
-        rocm-libs \
         build-essential
 
+    amdgpu-install --usecase=rocm --no-dkms -y
+
+    ###########################################################################
+    # ROCm hipCUB
+    ###########################################################################
+    apt-get install -y hipcub-dev
+
+    ###########################################################################
+    # Common Software
+    ###########################################################################
     apt-get install --no-install-recommends -y software-properties-common
     add-apt-repository ppa:openkim/latest
     apt-get update
@@ -82,24 +92,10 @@ From: ubuntu:20.04
         valgrind \
         gdb \
         zstd \
+        libyaml-cpp-dev \
         libkim-api-dev \
         openkim-models
 
-
-    ###########################################################################
-    # ROCm hipCUB
-    ###########################################################################
-
-    export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
-    mkdir hipCUB/build
-    cd hipCUB/build
-    CXX=hipcc cmake -D BUILD_TEST=off ..
-    make
-    make package
-    make install
-
-
     ###########################################################################
     # KIM-API
     ###########################################################################
@@ -126,7 +122,7 @@ From: ubuntu:20.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.7.2
+    export PLUMED_PKG_VERSION=2.7.3
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu20.04_gpu.def b/tools/singularity/ubuntu20.04_gpu.def
index 3ea759078b..abb43f46df 100644
--- a/tools/singularity/ubuntu20.04_gpu.def
+++ b/tools/singularity/ubuntu20.04_gpu.def
@@ -2,21 +2,23 @@ BootStrap: docker
 From: ubuntu:20.04
 
 %environment
-    export PATH=/usr/lib/ccache:/usr/local/cuda-11.4/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    export CUDADIR=/usr/local/cuda-11.4
-    export CUDA_PATH=/usr/local/cuda-11.4
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.4/lib64:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
-    export LIBRARY_PATH=/usr/local/cuda-11.4/lib64/stubs
+    export PATH=/usr/lib/ccache:/usr/local/cuda-11.5/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
+    export CUDADIR=/usr/local/cuda-11.5
+    export CUDA_PATH=/usr/local/cuda-11.5
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.5/lib64:/opt/rocm/lib:/opt/rocm-4.5.0/llvm/lib
+    export LIBRARY_PATH=/usr/local/cuda-11.5/lib64/stubs
 %post
     export DEBIAN_FRONTEND=noninteractive
     apt-get update
     apt-get upgrade --no-install-recommends -y
 
-    apt-get install -y --no-install-recommends curl libnuma-dev gnupg
+    apt-get install -y --no-install-recommends curl wget libnuma-dev gnupg ca-certificates
 
-    curl -sL http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | apt-key add -
-    # AMD is using xenial folder also for focal
-    printf "deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main" > /etc/apt/sources.list.d/rocm.list
+    ###########################################################################
+    # ROCm 4.5
+    ###########################################################################
+    wget https://repo.radeon.com/amdgpu-install/21.40/ubuntu/focal/amdgpu-install-21.40.40500-1_all.deb
+    apt-get install -y ./amdgpu-install-21.40.40500-1_all.deb
 
     apt-get update
     apt-get install --no-install-recommends -y \
@@ -24,10 +26,18 @@ From: ubuntu:20.04
         file \
         sudo \
         libelf1 \
-        rocm-dev \
-        rocm-libs \
         build-essential
 
+    amdgpu-install --usecase=rocm --no-dkms -y
+
+    ###########################################################################
+    # ROCm hipCUB
+    ###########################################################################
+    apt-get install -y hipcub-dev
+
+    ###########################################################################
+    # Common Software
+    ###########################################################################
     apt-get install --no-install-recommends -y software-properties-common
     add-apt-repository ppa:openkim/latest
     apt-get update
@@ -87,6 +97,7 @@ From: ubuntu:20.04
         valgrind \
         gdb \
         zstd \
+        libyaml-cpp-dev \
         libkim-api-dev \
         openkim-models
 
@@ -101,7 +112,7 @@ From: ubuntu:20.04
     add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
     apt-get update
 
-    export CUDA_PKG_VERSION=11.4
+    export CUDA_PKG_VERSION=11.5
 
     apt-get install -y --no-install-recommends \
         cuda-libraries-${CUDA_PKG_VERSION} \
@@ -122,19 +133,6 @@ From: ubuntu:20.04
     mkdir -p /etc/OpenCL/vendors
     echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
 
-    ###########################################################################
-    # ROCm hipCUB
-    ###########################################################################
-
-    export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
-    mkdir hipCUB/build
-    cd hipCUB/build
-    CXX=hipcc cmake -D BUILD_TEST=off ..
-    make
-    make package
-    make install
-
 
     ###########################################################################
     # KIM-API
@@ -162,7 +160,7 @@ From: ubuntu:20.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.7.2
+    export PLUMED_PKG_VERSION=2.7.3
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu20.04_intel_opencl.def b/tools/singularity/ubuntu20.04_intel_opencl.def
index 7c83ecb5b1..0e3cae67d7 100644
--- a/tools/singularity/ubuntu20.04_intel_opencl.def
+++ b/tools/singularity/ubuntu20.04_intel_opencl.def
@@ -62,6 +62,7 @@ From: ubuntu:20.04
         valgrind \
         gdb \
         zstd \
+        libyaml-cpp-dev \
         libkim-api-dev \
         openkim-models
 
@@ -99,7 +100,7 @@ From: ubuntu:20.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.7.2
+    export PLUMED_PKG_VERSION=2.7.3
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu20.04_nvidia.def b/tools/singularity/ubuntu20.04_nvidia.def
index ddcbd34db9..274efecadc 100644
--- a/tools/singularity/ubuntu20.04_nvidia.def
+++ b/tools/singularity/ubuntu20.04_nvidia.def
@@ -1,5 +1,5 @@
 BootStrap: docker
-From: nvidia/cuda:11.4.1-devel-ubuntu20.04
+From: nvidia/cuda:11.4.2-devel-ubuntu20.04
 
 %post
     export DEBIAN_FRONTEND=noninteractive
@@ -34,6 +34,7 @@ From: nvidia/cuda:11.4.1-devel-ubuntu20.04
         libhwloc-dev \
         libjpeg-dev \
         liblapack-dev \
+        libnetcdf-dev \
         libomp-dev \
         libopenblas-dev \
         libnuma-dev \
@@ -64,10 +65,10 @@ From: nvidia/cuda:11.4.1-devel-ubuntu20.04
         valgrind \
         gdb \
         zstd \
+        libyaml-cpp-dev \
         libkim-api-dev \
         openkim-models
 
-
     ###########################################################################
     # NVIDIA OpenCL
     ###########################################################################
@@ -75,7 +76,6 @@ From: nvidia/cuda:11.4.1-devel-ubuntu20.04
     mkdir -p /etc/OpenCL/vendors
     echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
 
-
     ###########################################################################
     # KIM-API
     ###########################################################################
@@ -102,7 +102,7 @@ From: nvidia/cuda:11.4.1-devel-ubuntu20.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.7.2
+    export PLUMED_PKG_VERSION=2.7.3
 
     mkdir plumed
     cd plumed
@@ -115,7 +115,6 @@ From: nvidia/cuda:11.4.1-devel-ubuntu20.04
     cd ../../
     rm -rvf plumed
 
-
     ###########################################################################
     # Customizations
     ###########################################################################
diff --git a/tools/singularity/ubuntu20.04_oneapi.def b/tools/singularity/ubuntu20.04_oneapi.def
new file mode 100644
index 0000000000..e5077fe184
--- /dev/null
+++ b/tools/singularity/ubuntu20.04_oneapi.def
@@ -0,0 +1,185 @@
+BootStrap: docker
+From: ubuntu:20.04
+
+%post
+    export DEBIAN_FRONTEND=noninteractive
+    apt-get update
+    apt-get install --no-install-recommends -y software-properties-common
+    add-apt-repository ppa:openkim/latest
+    apt-get update
+    apt-get upgrade --no-install-recommends -y
+    apt-get install --no-install-recommends -y \
+        gpg-agent \
+        bc \
+        build-essential \
+        ccache \
+        clang \
+        curl \
+        doxygen \
+        enchant \
+        g++ \
+        gcc \
+        gfortran \
+        git \
+        hdf5-tools \
+        less \
+        libblas-dev \
+        libeigen3-dev \
+        libenchant-dev \
+        libfftw3-dev \
+        libgsl-dev \
+        libhdf5-serial-dev \
+        libhwloc-dev \
+        libjpeg-dev \
+        liblapack-dev \
+        libnetcdf-dev \
+        libomp-dev \
+        libopenblas-dev \
+        libnuma-dev \
+        libpng-dev \
+        libproj-dev \
+        libreadline-dev \
+        libvtk6-dev \
+        libyaml-dev \
+        libzstd-dev \
+        make \
+        mpi-default-bin \
+        mpi-default-dev \
+        ninja-build \
+        python3-dev \
+        python3-pip \
+        python3-pkg-resources \
+        python3-setuptools \
+        python3-virtualenv \
+        rsync \
+        ssh \
+        texlive \
+        texlive-latex-recommended \
+        texlive-formats-extra \
+        texlive-pictures \
+        texlive-publishers \
+        texlive-science \
+        dvipng \
+        latexmk \
+        xindy \
+        vim-nox \
+        virtualenv \
+        voro++-dev \
+        wget \
+        xxd \
+        valgrind \
+        gdb \
+        zstd \
+        libyaml-cpp-dev \
+        libkim-api-dev \
+        openkim-models
+
+    ###########################################################################
+    # Latest CMake (needed for OneAPI)
+    ###########################################################################
+
+    wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
+    apt-add-repository 'deb https://apt.kitware.com/ubuntu/ focal main'
+    apt update
+    apt install -y cmake
+
+    ###########################################################################
+    # OneAPI
+    ###########################################################################
+
+    cd /tmp
+    wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+    # add to your apt sources keyring so that archives signed with this key will be trusted.
+    apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+    # remove the public key
+    rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+    echo "deb https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
+    add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+    apt update
+    apt install -y \
+        intel-oneapi-compiler-dpcpp-cpp \
+        intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic \
+        intel-oneapi-compiler-fortran \
+        intel-oneapi-dev-utilities \
+        intel-oneapi-openmp \
+        intel-oneapi-ipp-devel \
+        intel-oneapi-mpi-devel \
+        intel-oneapi-tbb-devel \
+        intel-oneapi-ippcp-devel \
+        intel-oneapi-ccl-devel \
+        intel-oneapi-dnnl-devel \
+        intel-oneapi-onevpl-devel \
+        intel-oneapi-dal-devel \
+        intel-oneapi-mkl-devel \
+        intel-oneapi-libdpstd-devel
+
+    ###########################################################################
+    # KIM-API
+    ###########################################################################
+
+    # workaround for installing files in /usr/share/doc inside of a container
+    sed -i 's/path-exclude=\/usr\/share\/doc/#path-exclude=\/usr\/share\/doc/g' /etc/dpkg/dpkg.cfg.d/excludes
+    apt-get install -y libkim-api-doc
+    sed -i 's/#path-exclude=\/usr\/share\/doc/path-exclude=\/usr\/share\/doc/g' /etc/dpkg/dpkg.cfg.d/excludes
+
+    # install KIM models
+    KIM_API_EXAMPLES=/usr/share/doc/libkim-api-dev/examples
+    gunzip $KIM_API_EXAMPLES/portable-models/LennardJones612_UniversalShifted__MO_959249795837_003/LennardJones612_UniversalShifted.params.gz
+    gunzip $KIM_API_EXAMPLES/model-drivers/ex_model_driver_P_LJ/ex_model_driver_P_LJ.f90.gz
+
+    kim-api-collections-management install system $KIM_API_EXAMPLES/model-drivers/LennardJones612__MD_414112407348_003
+    kim-api-collections-management install system $KIM_API_EXAMPLES/model-drivers/ex_model_driver_P_LJ
+    kim-api-collections-management install system $KIM_API_EXAMPLES/portable-models/LennardJones_Ar
+    kim-api-collections-management install system $KIM_API_EXAMPLES/portable-models/ex_model_Ar_P_LJ
+    kim-api-collections-management install system $KIM_API_EXAMPLES/portable-models/LennardJones612_UniversalShifted__MO_959249795837_003
+    kim-api-collections-management install system $KIM_API_EXAMPLES/simulator-models/Sim_LAMMPS_LJcut_AkersonElliott_Alchemy_PbAu
+
+
+    ###########################################################################
+    # Plumed
+    ###########################################################################
+
+    export PLUMED_PKG_VERSION=2.7.3
+
+    mkdir plumed
+    cd plumed
+    curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${PLUMED_PKG_VERSION}/plumed-src-${PLUMED_PKG_VERSION}.tgz
+    tar -xzf plumed.tar.gz
+    cd plumed-${PLUMED_PKG_VERSION}
+    ./configure --disable-doc --prefix=/usr
+    make
+    make install
+    cd ../../
+    rm -rvf plumed
+
+    ###########################################################################
+    # Customizations
+    ###########################################################################
+
+    # set custom prompt indicating the container name
+    CUSTOM_PROMPT_ENV=/.singularity.d/env/99-zz_custom_prompt.sh
+    cat >$CUSTOM_PROMPT_ENV <<EOF
+#!/bin/bash
+PS1="[ubuntu20.04_oneapi:\u@\h] \W> "
+EOF
+    chmod 755 $CUSTOM_PROMPT_ENV
+
+
+    ###########################################################################
+    # Cleanup
+    ###########################################################################
+    # clean cache
+    rm -rf /var/lib/apt/lists/*
+
+%environment
+    LC_ALL=C
+    export LC_ALL
+    export PATH=/usr/lib/ccache:$PATH
+    # tell OpenMPI to not try using Infiniband
+    OMPI_MCA_btl="^openib"
+    # do not warn about unused components as this messes up testing
+    OMPI_MCA_btl_base_warn_component_unused="0"
+    export OMPI_MCA_btl OMPI_MCA_btl_base_warn_component_unused
+
+%labels
+    Author akohlmey, rbberger
diff --git a/unittest/CMakeLists.txt b/unittest/CMakeLists.txt
index 46f1865989..6489287097 100644
--- a/unittest/CMakeLists.txt
+++ b/unittest/CMakeLists.txt
@@ -1,4 +1,26 @@
-include(GTest)
+########################################
+# CMake build for automated testing
+# This file is part of LAMMPS
+# Created by Axel Kohlmeyer and Richard Berger
+########################################
+# download and build googletest framework
+message(STATUS "Downloading and building googletest framework")
+set(GTEST_URL "https://github.com/google/googletest/archive/release-1.11.0.tar.gz" CACHE STRING "URL of googletest source")
+set(GTEST_MD5 "e8a8df240b6938bb6384155d4c37d937" CACHE STRING "MD5 sum for googletest source")
+mark_as_advanced(GTEST_URL)
+mark_as_advanced(GTEST_MD5)
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+include(ExternalCMakeProject)
+ExternalCMakeProject(googletest ${GTEST_URL} ${GTEST_MD5} googletest . "")
+add_library(GTest::GTest ALIAS gtest)
+add_library(GTest::GMock ALIAS gmock)
+add_library(GTest::GTestMain ALIAS gtest_main)
+add_library(GTest::GMockMain ALIAS gmock_main)
+
+########################################
+# General tests using the LAMMPS executable itself
+########################################
 
 # check if we can run the compiled executable and whether it prints
 # the LAMMPS version header in the output for an empty input
@@ -7,7 +29,7 @@ add_test(NAME RunLammps
          COMMAND $<TARGET_FILE:lmp> -log none -echo none -in in.empty
          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(RunLammps PROPERTIES
-	ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1;HWLOC_HIDE_ERRORS=1"
+        ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1;HWLOC_HIDE_ERRORS=1"
         PASS_REGULAR_EXPRESSION "LAMMPS \\([0-9]+ [A-Za-z]+ 2[0-9][0-9][0-9]( - Update [0-9]+)?\\)")
 
 # check if the compiled executable will print the help message
@@ -15,7 +37,7 @@ add_test(NAME HelpMessage
          COMMAND $<TARGET_FILE:lmp> -h
          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(HelpMessage PROPERTIES
-	ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1;HWLOC_HIDE_ERRORS=1"
+        ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1;HWLOC_HIDE_ERRORS=1"
          PASS_REGULAR_EXPRESSION ".*Large-scale Atomic/Molecular Massively Parallel Simulator -.*Usage example:.*")
 
 # check if the compiled executable will error out on an invalid command line flag
@@ -23,25 +45,36 @@ add_test(NAME InvalidFlag
          COMMAND $<TARGET_FILE:lmp> -xxx
          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(InvalidFlag PROPERTIES
-	ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1;HWLOC_HIDE_ERRORS=1"
+        ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1;HWLOC_HIDE_ERRORS=1"
          PASS_REGULAR_EXPRESSION "ERROR: Invalid command-line argument.*")
 
+# convenience function for adding tests requiring to be run in parallel with MPI
 if(BUILD_MPI)
   function(add_mpi_test)
     set(MPI_TEST_NUM_PROCS 1)
     set(MPI_TEST_WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     cmake_parse_arguments(MPI_TEST "" "NAME;NUM_PROCS;WORKING_DIRECTORY" "COMMAND" ${ARGN})
+    # Do not add test when oversubscribing
+    if(MPI_TEST_NUMPROCS GREATER MPIEXEC_MAX_NUMPROCS)
+      return()
+    endif()
     list(GET MPI_TEST_COMMAND 0 EXECUTABLE)
     list(REMOVE_AT MPI_TEST_COMMAND 0)
     set(ARGS ${MPI_TEST_COMMAND})
     add_test(NAME ${MPI_TEST_NAME}
              WORKING_DIRECTORY ${MPI_TEST_WORKING_DIRECTORY}
-             COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPI_TEST_NUM_PROCS} ${MPIEXEC_PREFLAGS}
-                     ${EXECUTABLE} ${MPIEXEC_POSTFLAGS} ${ARGS}
-    )
+             COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPI_TEST_NUM_PROCS}
+                     ${MPIEXEC_PREFLAGS} ${EXECUTABLE} ${MPIEXEC_POSTFLAGS} ${ARGS})
+  endfunction()
+else()
+  function(add_mpi_test)
+    cmake_parse_arguments(MPI_TEST "" "NAME;NUM_PROCS;WORKING_DIRECTORY" "COMMAND" ${ARGN})
+    message(STATUS "Skipping test ${NAME} on non-MPI compilation")
   endfunction()
 endif()
 
+# incorporate categories of specific tests from subdirectories
+
 add_subdirectory(utils)
 add_subdirectory(formats)
 add_subdirectory(commands)
@@ -52,6 +85,8 @@ add_subdirectory(python)
 add_subdirectory(tools)
 add_subdirectory(force-styles)
 
+# clang-format support for test sources
+
 find_package(ClangFormat 8.0)
 
 if(ClangFormat_FOUND)
diff --git a/unittest/c-library/CMakeLists.txt b/unittest/c-library/CMakeLists.txt
index ee7f323c0f..3d57dbbc90 100644
--- a/unittest/c-library/CMakeLists.txt
+++ b/unittest/c-library/CMakeLists.txt
@@ -1,24 +1,24 @@
 
 add_executable(test_library_open test_library_open.cpp test_main.cpp)
-target_link_libraries(test_library_open PRIVATE lammps GTest::GTest GTest::GMock)
+target_link_libraries(test_library_open PRIVATE lammps GTest::GMock)
 add_test(LibraryOpen test_library_open)
 
 add_executable(test_library_commands test_library_commands.cpp test_main.cpp)
-target_link_libraries(test_library_commands PRIVATE lammps GTest::GTest GTest::GMock)
+target_link_libraries(test_library_commands PRIVATE lammps GTest::GMock)
 add_test(LibraryCommands test_library_commands)
 
 add_executable(test_library_external test_library_external.cpp test_main.cpp)
-target_link_libraries(test_library_external PRIVATE lammps GTest::GTest GTest::GMock)
+target_link_libraries(test_library_external PRIVATE lammps GTest::GMock)
 add_test(LibraryExternal test_library_external)
 
 add_executable(test_library_properties test_library_properties.cpp test_main.cpp)
-target_link_libraries(test_library_properties PRIVATE lammps GTest::GTest GTest::GMock)
+target_link_libraries(test_library_properties PRIVATE lammps GTest::GMock)
 target_compile_definitions(test_library_properties PRIVATE -DTEST_INPUT_FOLDER=${CMAKE_CURRENT_SOURCE_DIR})
 add_test(LibraryProperties test_library_properties)
 set_tests_properties(LibraryProperties PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
 
 add_executable(test_library_scatter_gather test_library_scatter_gather.cpp test_main.cpp)
-target_link_libraries(test_library_scatter_gather PRIVATE lammps GTest::GTest GTest::GMock)
+target_link_libraries(test_library_scatter_gather PRIVATE lammps GTest::GMock)
 target_compile_definitions(test_library_scatter_gather PRIVATE -DTEST_INPUT_FOLDER=${CMAKE_CURRENT_SOURCE_DIR})
 add_test(LibraryScatterGather test_library_scatter_gather)
 set_tests_properties(LibraryScatterGather PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
@@ -63,13 +63,11 @@ foreach(WITH "JPEG" "PNG" "GZIP" "FFMPEG")
 endforeach()
 
 add_executable(test_library_config test_library_config.cpp test_main.cpp)
-target_link_libraries(test_library_config PRIVATE lammps GTest::GTest GTest::GMock)
+target_link_libraries(test_library_config PRIVATE lammps GTest::GMock)
 target_compile_definitions(test_library_config PRIVATE ${TEST_CONFIG_DEFS})
 add_test(LibraryConfig test_library_config)
 
-if(BUILD_MPI)
-  add_executable(test_library_mpi test_library_mpi.cpp)
-  target_link_libraries(test_library_mpi PRIVATE lammps GTest::GTest GTest::GMock)
-  target_compile_definitions(test_library_mpi PRIVATE ${TEST_CONFIG_DEFS})
-  add_mpi_test(NAME LibraryMPI NUM_PROCS 4 COMMAND $<TARGET_FILE:test_library_mpi>)
-endif()
+add_executable(test_library_mpi test_library_mpi.cpp)
+target_link_libraries(test_library_mpi PRIVATE lammps GTest::GMock)
+target_compile_definitions(test_library_mpi PRIVATE ${TEST_CONFIG_DEFS})
+add_mpi_test(NAME LibraryMPI NUM_PROCS 4 COMMAND $<TARGET_FILE:test_library_mpi>)
diff --git a/unittest/c-library/test_library_commands.cpp b/unittest/c-library/test_library_commands.cpp
index b16dc6e9dd..203862c696 100644
--- a/unittest/c-library/test_library_commands.cpp
+++ b/unittest/c-library/test_library_commands.cpp
@@ -2,6 +2,7 @@
 
 #include "lammps.h"
 #include "library.h"
+#include "platform.h"
 #include <string>
 
 #include "gmock/gmock.h"
@@ -76,8 +77,8 @@ TEST_F(LibraryCommands, from_file)
     if (!verbose) ::testing::internal::GetCapturedStdout();
     EXPECT_EQ(lammps_get_natoms(lmp), 2);
 
-    unlink(demo_file);
-    unlink(cont_file);
+    LAMMPS_NS::platform::unlink(demo_file);
+    LAMMPS_NS::platform::unlink(cont_file);
 };
 
 TEST_F(LibraryCommands, from_line)
diff --git a/unittest/c-library/test_library_mpi.cpp b/unittest/c-library/test_library_mpi.cpp
index 6fdec7b7e4..1609107ae0 100644
--- a/unittest/c-library/test_library_mpi.cpp
+++ b/unittest/c-library/test_library_mpi.cpp
@@ -167,7 +167,6 @@ TEST(MPI, split_comm)
 
 TEST(MPI, multi_partition)
 {
-    FILE *fp;
     int nprocs, me;
     lammps_mpi_init();
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
diff --git a/unittest/commands/CMakeLists.txt b/unittest/commands/CMakeLists.txt
index 176e22a391..49603a8b22 100644
--- a/unittest/commands/CMakeLists.txt
+++ b/unittest/commands/CMakeLists.txt
@@ -1,44 +1,28 @@
 
-# build LAMMPS plugins, but not on Windows
-if((NOT (CMAKE_SYSTEM_NAME STREQUAL "Windows")) AND PKG_PLUGIN)
-  ExternalProject_Add(plugins
-                      SOURCE_DIR      "${LAMMPS_DIR}/examples/plugins"
-                      BINARY_DIR      ${CMAKE_BINARY_DIR}/build-plugins
-                      INSTALL_DIR     ${CMAKE_BINARY_DIR}
-                      CMAKE_ARGS      ${CMAKE_REQUEST_PIC}
-                                      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                                      -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
-                                      -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-                                      -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
-                                      -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}
-                     BUILD_BYPRODUCTS <BINARY_DIR>/morse2plugin${CMAKE_SHARED_MODULE_SUFFIX}
-                                      <BINARY_DIR>/nve2plugin${CMAKE_SHARED_MODULE_SUFFIX}
-                                      <BINARY_DIR>/helloplugin${CMAKE_SHARED_MODULE_SUFFIX}
-                     INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_if_different
-                                      <BINARY_DIR>/morse2plugin${CMAKE_SHARED_MODULE_SUFFIX}
-                                      <BINARY_DIR>/nve2plugin${CMAKE_SHARED_MODULE_SUFFIX}
-                                      <BINARY_DIR>/helloplugin${CMAKE_SHARED_MODULE_SUFFIX}
-                                      ${CMAKE_CURRENT_BINARY_DIR}
-                     TEST_COMMAND    "")
-endif()
-
 add_executable(test_simple_commands test_simple_commands.cpp)
-if(PKG_PLUGIN)
+
+# tests for the plugin command require the PLUGIN package and won't work on windows
+if((NOT (CMAKE_SYSTEM_NAME STREQUAL "Windows")) AND PKG_PLUGIN)
+  add_subdirectory(${LAMMPS_DIR}/examples/plugins ${CMAKE_BINARY_DIR}/build-plugins)
   add_dependencies(test_simple_commands plugins)
+  target_compile_definitions(test_simple_commands PRIVATE -DLMP_PLUGIN)
 endif()
-target_link_libraries(test_simple_commands PRIVATE lammps GTest::GMock GTest::GTest)
+
+target_link_libraries(test_simple_commands PRIVATE lammps GTest::GMock)
 add_test(NAME SimpleCommands COMMAND test_simple_commands WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+set_tests_properties(SimpleCommands PROPERTIES
+          ENVIRONMENT "LAMMPS_PLUGIN_BIN_DIR=${CMAKE_BINARY_DIR}/build-plugins")
 
 add_executable(test_lattice_region test_lattice_region.cpp)
-target_link_libraries(test_lattice_region PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_lattice_region PRIVATE lammps GTest::GMock)
 add_test(NAME LatticeRegion COMMAND test_lattice_region WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 add_executable(test_groups test_groups.cpp)
-target_link_libraries(test_groups PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_groups PRIVATE lammps GTest::GMock)
 add_test(NAME Groups COMMAND test_groups WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 add_executable(test_variables test_variables.cpp)
-target_link_libraries(test_variables PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_variables PRIVATE lammps GTest::GMock)
 add_test(NAME Variables COMMAND test_variables WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 add_executable(test_kim_commands test_kim_commands.cpp)
@@ -49,17 +33,15 @@ if(KIM_EXTRA_UNITTESTS)
     message(FATAL_ERROR "CURL not found. Enabling KIM extra unit tests requires to have libcurl installed.")
   endif()
 endif()
-target_link_libraries(test_kim_commands PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_kim_commands PRIVATE lammps GTest::GMock)
 add_test(NAME KimCommands COMMAND test_kim_commands WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 add_executable(test_reset_ids test_reset_ids.cpp)
 target_compile_definitions(test_reset_ids PRIVATE -DTEST_INPUT_FOLDER=${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(test_reset_ids PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_reset_ids PRIVATE lammps GTest::GMock)
 add_test(NAME ResetIDs COMMAND test_reset_ids WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
-if(BUILD_MPI)
-  add_executable(test_mpi_load_balancing test_mpi_load_balancing.cpp)
-  target_link_libraries(test_mpi_load_balancing PRIVATE lammps GTest::GTest GTest::GMock)
-  target_compile_definitions(test_mpi_load_balancing PRIVATE ${TEST_CONFIG_DEFS})
-  add_mpi_test(NAME MPILoadBalancing NUM_PROCS 4 COMMAND $<TARGET_FILE:test_mpi_load_balancing>)
-endif()
+add_executable(test_mpi_load_balancing test_mpi_load_balancing.cpp)
+target_link_libraries(test_mpi_load_balancing PRIVATE lammps GTest::GMock)
+target_compile_definitions(test_mpi_load_balancing PRIVATE ${TEST_CONFIG_DEFS})
+add_mpi_test(NAME MPILoadBalancing NUM_PROCS 4 COMMAND $<TARGET_FILE:test_mpi_load_balancing>)
diff --git a/unittest/commands/test_groups.cpp b/unittest/commands/test_groups.cpp
index 0c8a7cd83c..9d69b5412e 100644
--- a/unittest/commands/test_groups.cpp
+++ b/unittest/commands/test_groups.cpp
@@ -204,7 +204,7 @@ TEST_F(GroupTest, SelectRestart)
     command("write_restart group.restart");
     command("clear");
     command("read_restart group.restart");
-    unlink("group.restart");
+    platform::unlink("group.restart");
     END_HIDE_OUTPUT();
     group = lmp->group;
     ASSERT_EQ(group->count(group->find("one")), 16);
@@ -246,7 +246,7 @@ TEST_F(GroupTest, Molecular)
     ASSERT_DOUBLE_EQ(group->mass(group->find("half")), 40);
     ASSERT_DOUBLE_EQ(group->mass(group->find("half"), domain->find_region("top")), 10);
     ASSERT_NEAR(group->charge(group->find("top")), 0, 1.0e-14);
-    ASSERT_DOUBLE_EQ(group->charge(group->find("right"), domain->find_region("top")), 0);
+    ASSERT_NEAR(group->charge(group->find("right"), domain->find_region("top")), 0, 1.0e-14);
 
     TEST_FAILURE(".*ERROR: Illegal group command.*", command("group three include xxx"););
 }
diff --git a/unittest/commands/test_lattice_region.cpp b/unittest/commands/test_lattice_region.cpp
index 47b69a15e0..1bbda21001 100644
--- a/unittest/commands/test_lattice_region.cpp
+++ b/unittest/commands/test_lattice_region.cpp
@@ -82,7 +82,7 @@ TEST_F(LatticeRegionTest, lattice_sc)
     BEGIN_CAPTURE_OUTPUT();
     command("lattice sc 1.0 spacing 1.5 2.0 3.0");
     auto output = END_CAPTURE_OUTPUT();
-    ASSERT_THAT(output, MatchesRegex(".*Lattice spacing in x,y,z = 1.50* 2.0* 3.0*.*"));
+    ASSERT_THAT(output, MatchesRegex(".*Lattice spacing in x,y,z = 1.5.* 2.* 3.*"));
 
     auto lattice = lmp->domain->lattice;
     ASSERT_EQ(lattice->xlattice, 1.5);
@@ -92,7 +92,7 @@ TEST_F(LatticeRegionTest, lattice_sc)
     BEGIN_CAPTURE_OUTPUT();
     command("lattice sc 2.0");
     output = END_CAPTURE_OUTPUT();
-    ASSERT_THAT(output, MatchesRegex(".*Lattice spacing in x,y,z = 2.0* 2.0* 2.0*.*"));
+    ASSERT_THAT(output, MatchesRegex(".*Lattice spacing in x,y,z = 2.* 2.* 2.*"));
 
     lattice = lmp->domain->lattice;
     ASSERT_EQ(lattice->style, Lattice::SC);
diff --git a/unittest/commands/test_simple_commands.cpp b/unittest/commands/test_simple_commands.cpp
index 0ad47f4e96..1844752d33 100644
--- a/unittest/commands/test_simple_commands.cpp
+++ b/unittest/commands/test_simple_commands.cpp
@@ -384,7 +384,12 @@ TEST_F(SimpleCommandsTest, Units)
 #if defined(LMP_PLUGIN)
 TEST_F(SimpleCommandsTest, Plugin)
 {
-    std::string loadfmt("plugin load {}plugin.so");
+    const char *bindir = getenv("LAMMPS_PLUGIN_BIN_DIR");
+    const char *config = getenv("CMAKE_CONFIG_TYPE");
+    if (!bindir) GTEST_SKIP();
+    std::string loadfmt = platform::path_join("plugin load ", bindir);
+    if (config) loadfmt = platform::path_join(loadfmt, config);
+    loadfmt = platform::path_join(loadfmt, "{}plugin.so");
     ::testing::internal::CaptureStdout();
     lmp->input->one(fmt::format(loadfmt, "hello"));
     auto text = ::testing::internal::GetCapturedStdout();
@@ -395,7 +400,7 @@ TEST_F(SimpleCommandsTest, Plugin)
     lmp->input->one(fmt::format(loadfmt, "xxx"));
     text = ::testing::internal::GetCapturedStdout();
     if (verbose) std::cout << text;
-    ASSERT_THAT(text, MatchesRegex(".*Open of file xxx.* failed.*"));
+    ASSERT_THAT(text, MatchesRegex(".*Open of file .*xxx.* failed.*"));
 
     ::testing::internal::CaptureStdout();
     lmp->input->one(fmt::format(loadfmt, "nve2"));
@@ -426,8 +431,7 @@ TEST_F(SimpleCommandsTest, Plugin)
     lmp->input->one("plugin unload pair nve2");
     text = ::testing::internal::GetCapturedStdout();
     if (verbose) std::cout << text;
-    ASSERT_THAT(text, MatchesRegex(".*Ignoring unload of pair style nve2: "
-                                   "not loaded from a plugin.*"));
+    ASSERT_THAT(text, MatchesRegex(".*Ignoring unload of pair style nve2: not from a plugin.*"));
 
     ::testing::internal::CaptureStdout();
     lmp->input->one("plugin unload fix nve2");
@@ -439,8 +443,7 @@ TEST_F(SimpleCommandsTest, Plugin)
     lmp->input->one("plugin unload fix nve");
     text = ::testing::internal::GetCapturedStdout();
     if (verbose) std::cout << text;
-    ASSERT_THAT(text, MatchesRegex(".*Ignoring unload of fix style nve: "
-                                   "not loaded from a plugin.*"));
+    ASSERT_THAT(text, MatchesRegex(".*Ignoring unload of fix style nve: not from a plugin.*"));
 
     ::testing::internal::CaptureStdout();
     lmp->input->one("plugin list");
diff --git a/unittest/commands/test_variables.cpp b/unittest/commands/test_variables.cpp
index 4f603df5ac..fb0aa58069 100644
--- a/unittest/commands/test_variables.cpp
+++ b/unittest/commands/test_variables.cpp
@@ -59,8 +59,8 @@ protected:
     void TearDown() override
     {
         LAMMPSTest::TearDown();
-        unlink("test_variable.file");
-        unlink("test_variable.atomfile");
+        platform::unlink("test_variable.file");
+        platform::unlink("test_variable.atomfile");
     }
 
     void atomic_system()
@@ -165,7 +165,7 @@ TEST_F(VariableTest, CreateDelete)
     fputs(" ", fp);
     fclose(fp);
     ASSERT_THAT(variable->retrieve("file"), StrEq("1"));
-    unlink("MYFILE");
+    platform::unlink("MYFILE");
     ASSERT_THAT(variable->retrieve("file"), StrEq("0"));
 
     BEGIN_HIDE_OUTPUT();
@@ -317,7 +317,7 @@ TEST_F(VariableTest, Expressions)
     ASSERT_TRUE(variable->equalstyle(ivar));
     ASSERT_DOUBLE_EQ(variable->compute_equal(ivar), 2.0);
     ASSERT_DOUBLE_EQ(variable->compute_equal("v_three"), 3.0);
-    ASSERT_FLOAT_EQ(variable->compute_equal("v_four"), MY_PI);
+    ASSERT_NEAR(variable->compute_equal("v_four"), MY_PI,1.0e-14);
     ASSERT_GE(variable->compute_equal("v_five"), 20210310);
     ASSERT_DOUBLE_EQ(variable->compute_equal("v_seven"), -1);
     ASSERT_DOUBLE_EQ(variable->compute_equal("v_eight"), 2.5);
diff --git a/unittest/cplusplus/CMakeLists.txt b/unittest/cplusplus/CMakeLists.txt
index b0b2550e8c..efd194b9d2 100644
--- a/unittest/cplusplus/CMakeLists.txt
+++ b/unittest/cplusplus/CMakeLists.txt
@@ -1,13 +1,13 @@
 
 add_executable(test_lammps_class test_lammps_class.cpp)
-target_link_libraries(test_lammps_class PRIVATE lammps GTest::GMockMain GTest::GTest GTest::GMock)
+target_link_libraries(test_lammps_class PRIVATE lammps GTest::GMockMain)
 add_test(LammpsClass test_lammps_class)
 set_tests_properties(LammpsClass PROPERTIES ENVIRONMENT "OMP_NUM_THREADS=1")
 
 add_executable(test_input_class test_input_class.cpp)
-target_link_libraries(test_input_class PRIVATE lammps GTest::GTest GTest::GTestMain)
+target_link_libraries(test_input_class PRIVATE lammps GTest::GTestMain)
 add_test(InputClass test_input_class)
 
 add_executable(test_error_class test_error_class.cpp)
-target_link_libraries(test_error_class PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_error_class PRIVATE lammps GTest::GMock)
 add_test(ErrorClass test_error_class)
diff --git a/unittest/cplusplus/test_input_class.cpp b/unittest/cplusplus/test_input_class.cpp
index 87bea36081..b1d8af28c6 100644
--- a/unittest/cplusplus/test_input_class.cpp
+++ b/unittest/cplusplus/test_input_class.cpp
@@ -77,8 +77,8 @@ TEST_F(Input_commands, from_file)
     lmp->input->file(cont_file);
     EXPECT_EQ(lmp->atom->natoms, 2);
 
-    unlink(demo_file);
-    unlink(cont_file);
+    platform::unlink(demo_file);
+    platform::unlink(cont_file);
 };
 
 TEST_F(Input_commands, from_line)
diff --git a/unittest/cplusplus/test_lammps_class.cpp b/unittest/cplusplus/test_lammps_class.cpp
index 663c7358d9..3a1bde51ff 100644
--- a/unittest/cplusplus/test_lammps_class.cpp
+++ b/unittest/cplusplus/test_lammps_class.cpp
@@ -363,11 +363,7 @@ TEST(LAMMPS_init, NoOpenMP)
     FILE *fp = fopen("in.lammps_class_noomp", "w");
     fputs("\n", fp);
     fclose(fp);
-#if defined(__WIN32)
-    _putenv("OMP_NUM_THREADS");
-#else
-    unsetenv("OMP_NUM_THREADS");
-#endif
+    platform::unsetenv("OMP_NUM_THREADS");
 
     const char *args[] = {"LAMMPS_init", "-in", "in.lammps_class_noomp", "-log", "none", "-nocite"};
     char **argv        = (char **)args;
diff --git a/unittest/force-styles/CMakeLists.libyaml b/unittest/force-styles/CMakeLists.libyaml
new file mode 100644
index 0000000000..51e8589f7b
--- /dev/null
+++ b/unittest/force-styles/CMakeLists.libyaml
@@ -0,0 +1,34 @@
+# Custom minimal -*- CMake -*- file for libyaml
+
+cmake_minimum_required(VERSION 3.10)
+project(libyaml VERSION 0.2.5
+  DESCRIPTION "LibYAML a YAML parser and emitter library"
+  LANGUAGES C
+  HOMEPAGE_URL https://pyyaml.org/wiki/LibYAML)
+
+# compilation settings and options
+option(BUILD_SHARED_LIBS "Build libYAML as a shared library" OFF)
+option(CMAKE_POSITION_INDEPENDENT_CODE "Create objects compatible with shared libraries" ON)
+
+include(GNUInstallDirs)
+
+add_library(yaml
+  src/api.c
+  src/dumper.c
+  src/emitter.c
+  src/loader.c
+  src/parser.c
+  src/reader.c
+  src/scanner.c
+  src/writer.c
+  )
+
+set(YAML_VERSION_STRING "${YAML_VERSION_MAJOR}.${YAML_VERSION_MINOR}.${YAML_VERSION_PATCH}")
+set(CONFIG_H_FILE "#ifndef LIBYAML_CONFIG_H\n#define LIBYAML_CONFIG_H\n")
+set(CONFIG_H_FILE "${CONFIG_H_FILE}#define YAML_VERSION_MAJOR ${libyaml_VERSION_MAJOR}\n")
+set(CONFIG_H_FILE "${CONFIG_H_FILE}#define YAML_VERSION_MINOR ${libyaml_VERSION_MINOR}\n")
+set(CONFIG_H_FILE "${CONFIG_H_FILE}#define YAML_VERSION_PATCH ${libyaml_VERSION_PATCH}\n")
+set(CONFIG_H_FILE "${CONFIG_H_FILE}#define YAML_VERSION_STRING \"${libyaml_VERSION_MAJOR}.${libyaml_VERSION_MINOR}.${libyaml_VERSION_PATCH}\"\n#endif\n")
+file(WRITE ${CMAKE_CURRENT_SOURCE_DIR}/include/config.h "${CONFIG_H_FILE}")
+target_compile_definitions(yaml PRIVATE YAML_DECLARE_STATIC HAVE_CONFIG_H)
+target_include_directories(yaml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/unittest/force-styles/CMakeLists.txt b/unittest/force-styles/CMakeLists.txt
index 75e95c3bf0..464cd9426a 100644
--- a/unittest/force-styles/CMakeLists.txt
+++ b/unittest/force-styles/CMakeLists.txt
@@ -1,10 +1,24 @@
 
 find_package(YAML)
 if(NOT YAML_FOUND)
+  set(YAML_URL "https://pyyaml.org/download/libyaml/yaml-0.2.5.tar.gz" CACHE STRING "URL for libyaml tarball")
+  set(YAML_MD5 "bb15429d8fb787e7d3f1c83ae129a999" CACHE STRING "MD5 checksum of libyaml tarball")
+  mark_as_advanced(YAML_URL)
+  mark_as_advanced(YAML_MD5)
+
   # download and build a local copy of libyaml
-  include(YAML)
+  include(ExternalCMakeProject)
+  ExternalCMakeProject(libyaml ${YAML_URL} ${YAML_MD5} yaml . CMakeLists.libyaml)
+  add_library(Yaml::Yaml ALIAS yaml)
 endif()
 
+function(extract_tags out yaml_file)
+  file(STRINGS ${yaml_file} TAGS_LINE REGEX "^tags:")
+  string(REPLACE "tags:" "" TAGS_LINE "${TAGS_LINE}")
+  string(REGEX MATCHALL "[^, \t\r\n]+" TAGS "${TAGS_LINE}")
+  set(${out} "${TAGS}" PARENT_SCOPE)
+endfunction()
+
 if(CMAKE_VERSION VERSION_LESS 3.12)
   # adjust so we find Python 3 versions before Python 2 on old systems with old CMake
   set(Python_ADDITIONAL_VERSIONS 3.8 3.7 3.6 3.5)
@@ -24,9 +38,14 @@ endif()
 
 set(TEST_INPUT_FOLDER ${CMAKE_CURRENT_SOURCE_DIR}/tests)
 add_library(style_tests STATIC yaml_writer.cpp error_stats.cpp test_config_reader.cpp test_main.cpp)
-target_compile_definitions(style_tests PRIVATE -DTEST_INPUT_FOLDER=${TEST_INPUT_FOLDER})
+if(YAML_FOUND)
+  target_compile_definitions(style_tests PRIVATE TEST_INPUT_FOLDER=${TEST_INPUT_FOLDER})
+else()
+  # we always use static linkage with local compiled libyaml
+  target_compile_definitions(style_tests PRIVATE TEST_INPUT_FOLDER=${TEST_INPUT_FOLDER} YAML_DECLARE_STATIC)
+endif()
 target_include_directories(style_tests PRIVATE ${LAMMPS_SOURCE_DIR})
-target_link_libraries(style_tests PUBLIC GTest::GTest GTest::GMock Yaml::Yaml lammps)
+target_link_libraries(style_tests PUBLIC gmock Yaml::Yaml lammps)
 if(BUILD_MPI)
   target_link_libraries(style_tests PUBLIC MPI::MPI_CXX)
 else()
@@ -43,7 +62,7 @@ endif()
 # unit test for error stats class
 add_executable(test_error_stats test_error_stats.cpp)
 target_include_directories(test_error_stats PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${LAMMPS_SOURCE_DIR})
-target_link_libraries(test_error_stats PRIVATE GTest::GTestMain GTest::GTest)
+target_link_libraries(test_error_stats PRIVATE gtest_main)
 add_test(NAME ErrorStats COMMAND test_error_stats)
 
 # pair style tester
@@ -61,24 +80,30 @@ if(FFT_SINGLE)
 endif()
 foreach(TEST ${MOL_PAIR_TESTS})
   string(REGEX REPLACE "^.*mol-pair-(.*)\.yaml" "MolPairStyle:\\1" TNAME ${TEST})
+  extract_tags(TEST_TAGS ${TEST})
   add_test(NAME ${TNAME} COMMAND test_pair_style ${TEST} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   set_tests_properties(${TNAME} PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};PYTHONPATH=${TEST_INPUT_FOLDER}:$ENV{PYTHONPATH}")
+  set_tests_properties(${TNAME} PROPERTIES LABELS "${TEST_TAGS}")
 endforeach()
 
 # tests for metal-like atomic systems and related pair styles
 file(GLOB ATOMIC_PAIR_TESTS LIST_DIRECTORIES false ${TEST_INPUT_FOLDER}/atomic-pair-*.yaml)
 foreach(TEST ${ATOMIC_PAIR_TESTS})
   string(REGEX REPLACE "^.*atomic-pair-(.*)\.yaml" "AtomicPairStyle:\\1" TNAME ${TEST})
+  extract_tags(TEST_TAGS ${TEST})
   add_test(NAME ${TNAME} COMMAND test_pair_style ${TEST} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   set_tests_properties(${TNAME} PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};PYTHONPATH=${TEST_INPUT_FOLDER}:$ENV{PYTHONPATH}")
+  set_tests_properties(${TNAME} PROPERTIES LABELS "${TEST_TAGS}")
 endforeach()
 
 # tests for Si-like manybody systems and related pair styles
 file(GLOB MANYBODY_PAIR_TESTS LIST_DIRECTORIES false ${TEST_INPUT_FOLDER}/manybody-pair-*.yaml)
 foreach(TEST ${MANYBODY_PAIR_TESTS})
   string(REGEX REPLACE "^.*manybody-pair-(.*)\.yaml" "ManybodyPairStyle:\\1" TNAME ${TEST})
+  extract_tags(TEST_TAGS ${TEST})
   add_test(NAME ${TNAME} COMMAND test_pair_style ${TEST} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   set_tests_properties(${TNAME} PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};PYTHONPATH=${TEST_INPUT_FOLDER}:$ENV{PYTHONPATH}")
+  set_tests_properties(${TNAME} PROPERTIES LABELS "${TEST_TAGS}")
 endforeach()
 
 # bond style tester
@@ -88,8 +113,10 @@ target_link_libraries(test_bond_style PRIVATE lammps style_tests)
 file(GLOB BOND_TESTS LIST_DIRECTORIES false ${TEST_INPUT_FOLDER}/bond-*.yaml)
 foreach(TEST ${BOND_TESTS})
   string(REGEX REPLACE "^.*bond-(.*)\.yaml" "BondStyle:\\1" TNAME ${TEST})
+  extract_tags(TEST_TAGS ${TEST})
   add_test(NAME ${TNAME} COMMAND test_bond_style ${TEST} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   set_tests_properties(${TNAME} PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};PYTHONPATH=${TEST_INPUT_FOLDER}:$ENV{PYTHONPATH}")
+  set_tests_properties(${TNAME} PROPERTIES LABELS "${TEST_TAGS}")
 endforeach()
 
 # angle style tester
@@ -99,8 +126,10 @@ target_link_libraries(test_angle_style PRIVATE lammps style_tests)
 file(GLOB ANGLE_TESTS LIST_DIRECTORIES false ${TEST_INPUT_FOLDER}/angle-*.yaml)
 foreach(TEST ${ANGLE_TESTS})
   string(REGEX REPLACE "^.*angle-(.*)\.yaml" "AngleStyle:\\1" TNAME ${TEST})
+  extract_tags(TEST_TAGS ${TEST})
   add_test(NAME ${TNAME} COMMAND test_angle_style ${TEST} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   set_tests_properties(${TNAME} PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};PYTHONPATH=${TEST_INPUT_FOLDER}:$ENV{PYTHONPATH}")
+  set_tests_properties(${TNAME} PROPERTIES LABELS "${TEST_TAGS}")
 endforeach()
 
 # kspace style tester, currently uses the pair style tool
@@ -111,8 +140,10 @@ if(FFT_SINGLE)
 endif()
 foreach(TEST ${KSPACE_TESTS})
   string(REGEX REPLACE "^.*kspace-(.*)\.yaml" "KSpaceStyle:\\1" TNAME ${TEST})
+  extract_tags(TEST_TAGS ${TEST})
   add_test(NAME ${TNAME} COMMAND test_pair_style ${TEST} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   set_tests_properties(${TNAME} PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};PYTHONPATH=${TEST_INPUT_FOLDER}:$ENV{PYTHONPATH}")
+  set_tests_properties(${TNAME} PROPERTIES LABELS "${TEST_TAGS}")
 endforeach()
 
 # tester for timestepping fixes
@@ -126,8 +157,10 @@ target_link_libraries(test_fix_timestep PRIVATE lammps style_tests)
 file(GLOB FIX_TIMESTEP_TESTS LIST_DIRECTORIES false ${TEST_INPUT_FOLDER}/fix-timestep-*.yaml)
 foreach(TEST ${FIX_TIMESTEP_TESTS})
   string(REGEX REPLACE "^.*fix-timestep-(.*)\.yaml" "FixTimestep:\\1" TNAME ${TEST})
+  extract_tags(TEST_TAGS ${TEST})
   add_test(NAME ${TNAME} COMMAND test_fix_timestep ${TEST} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   set_tests_properties(${TNAME} PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};PYTHONPATH=${TEST_INPUT_FOLDER}:${LAMMPS_PYTHON_DIR}:$ENV{PYTHONPATH}")
+  set_tests_properties(${TNAME} PROPERTIES LABELS "${TEST_TAGS}")
 endforeach()
 
 # dihedral style tester
@@ -137,8 +170,10 @@ target_link_libraries(test_dihedral_style PRIVATE lammps style_tests)
 file(GLOB DIHEDRAL_TESTS LIST_DIRECTORIES false ${TEST_INPUT_FOLDER}/dihedral-*.yaml)
 foreach(TEST ${DIHEDRAL_TESTS})
   string(REGEX REPLACE "^.*dihedral-(.*)\.yaml" "DihedralStyle:\\1" TNAME ${TEST})
+  extract_tags(TEST_TAGS ${TEST})
   add_test(NAME ${TNAME} COMMAND test_dihedral_style ${TEST} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   set_tests_properties(${TNAME} PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};PYTHONPATH=${TEST_INPUT_FOLDER}:$ENV{PYTHONPATH}")
+  set_tests_properties(${TNAME} PROPERTIES LABELS "${TEST_TAGS}")
 endforeach()
 
 # improper style tester
@@ -148,6 +183,8 @@ target_link_libraries(test_improper_style PRIVATE lammps style_tests)
 file(GLOB IMPROPER_TESTS LIST_DIRECTORIES false ${TEST_INPUT_FOLDER}/improper-*.yaml)
 foreach(TEST ${IMPROPER_TESTS})
   string(REGEX REPLACE "^.*improper-(.*)\.yaml" "ImproperStyle:\\1" TNAME ${TEST})
+  extract_tags(TEST_TAGS ${TEST})
   add_test(NAME ${TNAME} COMMAND test_improper_style ${TEST} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   set_tests_properties(${TNAME} PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};PYTHONPATH=${TEST_INPUT_FOLDER}:$ENV{PYTHONPATH}")
+  set_tests_properties(${TNAME} PROPERTIES LABELS "${TEST_TAGS}")
 endforeach()
diff --git a/unittest/force-styles/test_config.h b/unittest/force-styles/test_config.h
index bf4f867f3b..ef4d911b60 100644
--- a/unittest/force-styles/test_config.h
+++ b/unittest/force-styles/test_config.h
@@ -16,6 +16,7 @@
 
 #include <set>
 #include <string>
+#include <sstream>
 #include <utility>
 #include <vector>
 
@@ -37,6 +38,7 @@ public:
     std::vector<std::pair<std::string, std::string>> prerequisites;
     std::vector<std::string> pre_commands;
     std::vector<std::string> post_commands;
+    std::vector<std::string> tags;
     std::string input_file;
     std::string pair_style;
     std::string bond_style;
@@ -96,6 +98,19 @@ public:
     }
     virtual ~TestConfig(){};
 
+    std::string tags_line() const
+    {
+      if(tags.size() > 0) {
+          std::stringstream line;
+          line << tags[0];
+          for(size_t i = 1; i < tags.size(); i++) {
+            line << ", " << tags[i];
+          }
+          return line.str();
+      }
+      return "generated";
+    }
+
 private:
     TestConfig(const TestConfig &){};
 };
diff --git a/unittest/force-styles/test_config_reader.cpp b/unittest/force-styles/test_config_reader.cpp
index 65e9e4b0f6..e6cf73f10e 100644
--- a/unittest/force-styles/test_config_reader.cpp
+++ b/unittest/force-styles/test_config_reader.cpp
@@ -27,10 +27,12 @@
 #include <vector>
 
 using LAMMPS_NS::utils::split_words;
+using LAMMPS_NS::utils::trim;
 
 TestConfigReader::TestConfigReader(TestConfig &config) : YamlReader(), config(config)
 {
     consumers["lammps_version"] = &TestConfigReader::lammps_version;
+    consumers["tags"]           = &TestConfigReader::tags;
     consumers["date_generated"] = &TestConfigReader::date_generated;
     consumers["epsilon"]        = &TestConfigReader::epsilon;
     consumers["skip_tests"]     = &TestConfigReader::skip_tests;
@@ -367,3 +369,12 @@ void TestConfigReader::global_vector(const yaml_event_t &event)
         config.global_vector.push_back(value);
     }
 }
+
+void TestConfigReader::tags(const yaml_event_t &event)
+{
+    std::stringstream data((char *)event.data.scalar.value);
+    config.tags.clear();
+    for (std::string tag; std::getline(data, tag, ','); ) {
+        config.tags.push_back(trim(tag));
+    }
+}
diff --git a/unittest/force-styles/test_config_reader.h b/unittest/force-styles/test_config_reader.h
index bf69f149fe..1f0de8df0a 100644
--- a/unittest/force-styles/test_config_reader.h
+++ b/unittest/force-styles/test_config_reader.h
@@ -58,6 +58,7 @@ public:
     void run_energy(const yaml_event_t &event);
     void global_scalar(const yaml_event_t &event);
     void global_vector(const yaml_event_t &event);
+    void tags(const yaml_event_t &event);
 };
 
 #endif
diff --git a/unittest/force-styles/test_error_stats.cpp b/unittest/force-styles/test_error_stats.cpp
index 52ba3baae3..cadb4e7f2a 100644
--- a/unittest/force-styles/test_error_stats.cpp
+++ b/unittest/force-styles/test_error_stats.cpp
@@ -9,6 +9,11 @@
 #include "fmtlib_format.cpp"
 #include "fmtlib_os.cpp"
 
+// Windows may define this as a macro
+#if defined(max)
+#undef max
+#endif
+
 TEST(ErrorStats, test)
 {
     ErrorStats stats;
diff --git a/unittest/force-styles/test_main.cpp b/unittest/force-styles/test_main.cpp
index 1ad0b79455..2680804fdf 100644
--- a/unittest/force-styles/test_main.cpp
+++ b/unittest/force-styles/test_main.cpp
@@ -46,6 +46,9 @@ void write_yaml_header(YamlWriter *writer, TestConfig *cfg, const char *version)
     // lammps_version
     writer->emit("lammps_version", version);
 
+    // tags
+    writer->emit("tags", cfg->tags_line());
+
     // date_generated
     std::time_t now   = time(nullptr);
     std::string block = trim(ctime(&now));
diff --git a/unittest/force-styles/tests/angle-charmm.yaml b/unittest/force-styles/tests/angle-charmm.yaml
index 83c9609995..973e89169b 100644
--- a/unittest/force-styles/tests/angle-charmm.yaml
+++ b/unittest/force-styles/tests/angle-charmm.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:23 2021
-epsilon: 5e-13
+epsilon: 1e-12
 prerequisites: ! |
   atom full
   angle charmm
diff --git a/unittest/force-styles/tests/angle-class2.yaml b/unittest/force-styles/tests/angle-class2.yaml
index ffd48d7727..0b50cce4f6 100644
--- a/unittest/force-styles/tests/angle-class2.yaml
+++ b/unittest/force-styles/tests/angle-class2.yaml
@@ -1,7 +1,8 @@
 ---
-lammps_version: 10 Feb 2021
-date_generated: Fri Feb 26 23:09:23 2021
-epsilon: 2.5e-13
+lammps_version: 14 Dec 2021
+date_generated: Tue Dec 21 11:26:44 2021
+epsilon: 1e-12
+skip_tests:
 prerequisites: ! |
   atom full
   angle class2
@@ -21,73 +22,73 @@ angle_coeff: ! |
   1  ba  20.0  0.0 1.5 1.5
   3  ba  10.0 10.0 1.5 1.5
   4  ba   0.0 20.0 1.5 1.5
-equilibrium: 4 1.9216075064457565 1.9373154697137058 2.0943951023931953 1.8936822384138474
+equilibrium: 4 1.9216075064457567 1.9373154697137058 2.0943951023931953 1.8936822384138476
 extract: ! ""
 natoms: 29
-init_energy: 45.6315872862689
+init_energy: 46.440896837749044
 init_stress: ! |2-
-   1.0380900034455654e+02 -8.5596888576343744e+01  1.0543371457027396e+01  9.3533772092305199e+01 -3.0453078736699425e+01  1.6197471265279837e+00
+   1.1893382158997322e+02 -8.8291447119046992e+01 -1.8868912456859972e+00  1.1299617626314146e+02 -8.5358891009896780e+00  7.6967639957246794e+00
 init_forces: ! |2
-    1  4.7202113476907201e+01  9.2452097148049379e+00 -2.1894202744712533e+01
-    2 -1.7062797644104073e+00 -1.3815331678107317e+01 -1.1109216556988997e+01
-    3 -1.0818552868961547e+01  4.2757153900854306e+01  4.0134050204129267e+01
-    4 -1.6107291199158311e+01 -2.0450831607561582e+01  1.2394641799374179e+01
-    5 -4.1997303662023540e+01 -4.1453062742803219e+01 -1.7536792897035419e+01
-    6  5.6639792403457349e+01 -1.3580769610788053e+01 -3.9712060060785255e+01
-    7 -1.8054439727078506e+01  1.6983062508083172e+01  1.5587306726806713e+00
-    8 -1.0525801597483865e+01 -1.5574411954009244e+01  8.4303126221362277e+01
-    9 -1.4606126670427546e+01  1.7088847325775880e+01  3.5265027966992148e+00
-   10 -9.2168170333014618e+00  2.5310906045489432e+01 -6.6219876033939016e+01
+    1  4.9843771864003834e+01  3.0925122596911816e+00 -3.5312722193312226e+01
+    2 -7.6547115157834666e-01 -6.1978334793953476e+00 -4.9838162348189323e+00
+    3 -2.0132137208074248e+01  6.1565994561526715e+01  5.8548546756498347e+01
+    4 -1.1800898073962109e+01 -2.7342191490108263e+01  3.6024091179236257e+00
+    5 -3.5756372747916700e+01 -5.0661360858658405e+01 -1.9085859271498553e+01
+    6  5.0734800121225135e+01 -1.5633459340466136e+01 -3.6451541950394720e+01
+    7 -1.8030285239813821e+01  1.7009172193091111e+01  1.5544411167675971e+00
+    8 -1.5632958318497042e+01 -2.1571227015465091e+01  7.4636601692831320e+01
+    9 -1.4606126670427546e+01  1.7088847325775880e+01  3.5265027966992157e+00
+   10  4.9412614063426892e+00  3.2891848318072718e+01 -5.8999099084298045e+01
    11 -2.0537012066379056e+01 -7.0930129287428976e+00  7.5068656204459945e+00
-   12 -2.8436344574011141e-01 -6.5767250423083183e+00 -7.8126221608778286e+00
-   13  3.1055904718024561e+00  8.1431271170459514e+00 -1.2597779054058647e+00
-   14  1.6321646299623836e+01 -6.4737292023940052e+00 -6.0969666237587319e+00
-   15  1.2014703003264437e+01 -4.7819952969181587e+00  1.5497667618539472e+01
-   16  9.3845211462413065e+00  9.2380044230210405e+00  6.9332654848904189e+00
-   17 -8.1437876633224571e-01  1.0335590285580882e+00 -2.1333543461785509e-01
-   18  4.7908728028679270e-01  1.6800089801308631e+00 -5.5268875505867383e+00
-   19 -2.0533806941129176e+00 -2.3525964439530416e+00  2.1320670955883561e+00
-   20  1.5742934138261249e+00  6.7258746382217860e-01  3.3948204549983823e+00
-   21  3.9487378529081880e+00  4.3175316427012014e+00 -1.1481919601133793e+01
-   22 -6.5477089696018744e+00 -4.1918690971369452e+00  3.4304260006102911e+00
-   23  2.5989711166936864e+00 -1.2566254556425660e-01  8.0514936005235018e+00
-   24 -8.4094048868779181e-01  4.6838060982897600e+00 -2.5056979703433946e+00
-   25 -1.3589399152326838e+00 -3.2194810517526529e+00  2.1726606908305435e-01
-   26  2.1998804039204756e+00 -1.4643250465371074e+00  2.2884319012603402e+00
-   27 -1.3120258950869723e-01  2.3248353742613195e+00 -7.0687277526256109e-01
-   28 -7.9798149428972343e-01 -1.3623378004463826e+00 -7.5274691791676882e-02
-   29  9.2918408379842066e-01 -9.6249757381493684e-01  7.8214746705423799e-01
-run_energy: 45.4459195202412
+   12  1.3804183606931298e+01  6.4447653590032914e+00 -2.7232856026007504e+01
+   13  3.5360349619705711e-01  9.1396982628926138e-01  4.4753951517849107e+00
+   14  9.6171168365401805e+00 -1.2052302426663841e+01 -3.9493877514992031e+00
+   15  6.4860723734536965e+00 -7.3468753785680629e+00  2.2291265160011950e+01
+   16 -2.1230106290243054e+00  1.3874285421512146e+01  6.8398804299585079e+00
+   17  3.6034624009792791e+00 -4.9831323468942506e+00  3.0333746689077179e+00
+   18  8.4504245756073892e-01  5.5042293626682808e+00 -2.1046811461460457e+01
+   19 -6.5342341140829419e+00 -7.9348302620004993e+00  8.7785874834232587e+00
+   20  5.6891916565222029e+00  2.4306008993322186e+00  1.2268223978037199e+01
+   21  7.8687847056106097e+00  9.2247825951126554e+00 -2.6147407661059020e+01
+   22 -1.3528173582118603e+01 -8.9511461518324538e+00  8.6148798736605858e+00
+   23  5.6593888765079932e+00 -2.7363644328020120e-01  1.7532527787398436e+01
+   24 -4.3228091413199312e+00  1.8218035887352222e+01 -1.0563459250035294e+01
+   25 -4.1518972409293955e+00 -1.2576945219214169e+01  1.7476216234117312e+00
+   26  8.4747063822493267e+00 -5.6410906681380535e+00  8.8158376266235621e+00
+   27 -1.9941459534406549e+00  1.7618141002760062e+01 -5.9162802776486032e+00
+   28 -4.8714180504657234e+00 -1.0506429973856317e+01  1.3714179563487131e-01
+   29  6.8655640039063783e+00 -7.1117110289037466e+00  5.7791384820137317e+00
+run_energy: 46.04254213414856
 run_stress: ! |2-
-   1.0224296939567702e+02 -8.5149951148281033e+01  1.1077450496851872e+01  9.2245165502849829e+01 -3.1084418227269154e+01  5.2366663320491313e-01
+   1.1725321346181148e+02 -8.7851127936553254e+01 -1.2820805825603960e+00  1.1164348489319656e+02 -9.3238396977559397e+00  6.5868867241338807e+00
 run_forces: ! |2
-    1  4.6696948163060675e+01  9.5469544165543052e+00 -2.1330948302985508e+01
-    2 -1.7341210851964273e+00 -1.4128897827282087e+01 -1.1408863992275160e+01
-    3 -1.0105551089272240e+01  4.2277702428791336e+01  3.9657670069675063e+01
-    4 -1.6122111935066883e+01 -2.0098532812588481e+01  1.2653235408843411e+01
-    5 -4.2111565648427224e+01 -4.1283493356078523e+01 -1.7589964848850386e+01
-    6  5.6633343610296642e+01 -1.3630420678353568e+01 -3.9740642386699271e+01
-    7 -1.8067962126721181e+01  1.7005582120491120e+01  1.5568169485445109e+00
-    8 -1.0459403976386902e+01 -1.5611162457913967e+01  8.4226500676174069e+01
-    9 -1.4631300686651667e+01  1.7116506905325277e+01  3.5366459989463483e+00
-   10 -9.1067423535925318e+00  2.5083637022662394e+01 -6.6120603070314331e+01
-   11 -2.0531188000382802e+01 -7.0572039110412836e+00  7.4976926119744087e+00
-   12 -4.5804053460484440e-01 -6.7408368318088137e+00 -7.3612432437675679e+00
-   13  3.0256565331949181e+00  7.9773964875310250e+00 -1.1599249084139895e+00
-   14  1.6325797226942768e+01 -6.0364594808671850e+00 -6.0596304628457736e+00
-   15  1.2032950547216590e+01 -4.6547008559207024e+00  1.4984306358892226e+01
-   16  9.4390365741788855e+00  9.1831264349550388e+00  6.8856125869099509e+00
-   17 -8.2574521858778427e-01  1.0508023955441044e+00 -2.2665944380799952e-01
-   18  4.1662989783566795e-01  1.4355996853101134e+00 -4.6916036738076876e+00
-   19 -1.7454972366844239e+00 -2.0016596312326365e+00  1.8125096846165301e+00
-   20  1.3288673388487560e+00  5.6605994592252307e-01  2.8790939891911576e+00
-   21  3.7257661846966093e+00  4.0481711597815977e+00 -1.0827752619904448e+01
-   22 -6.1593135050903065e+00 -3.9305106214838132e+00  3.2463364124342222e+00
-   23  2.4335473203936973e+00 -1.1766053829778456e-01  7.5814162074702249e+00
-   24 -7.6007405188534261e-01  4.2086623877720974e+00 -2.2508903613389659e+00
-   25 -1.2107922910240914e+00 -2.8924997143595577e+00  1.9797185345341334e-01
-   26  1.9708663429094340e+00 -1.3161626734125400e+00  2.0529185078855527e+00
-   27 -1.2690270648686752e-01  2.1785298399231308e+00 -6.5914069723159696e-01
-   28 -7.4337188418354749e-01 -1.2769741286441925e+00 -7.1075298614674390e-02
-   29  8.7027459067041502e-01 -9.0155571127893830e-01  7.3021599584627139e-01
+    1  4.9310488854556048e+01  3.4204699131043999e+00 -3.4719646460390109e+01
+    2 -8.0299294685685085e-01 -6.5454065929116556e+00 -5.2863994125400326e+00
+    3 -1.9344188743451483e+01  6.1008236345827427e+01  5.7990450617574737e+01
+    4 -1.1820915021801847e+01 -2.6960419197209372e+01  3.9252515610422387e+00
+    5 -3.5905034771726200e+01 -5.0435101635947014e+01 -1.9146824911182328e+01
+    6  5.0729278972279332e+01 -1.5703691297521928e+01 -3.6490325001299325e+01
+    7 -1.8043757960499846e+01  1.7038616039733434e+01  1.5516900804512148e+00
+    8 -1.5552591757457865e+01 -2.1589081417070179e+01  7.4580844287823936e+01
+    9 -1.4632487393832747e+01  1.7116771489893083e+01  3.5369114786793467e+00
+   10  5.0177324966376649e+00  3.2655409113932286e+01 -5.8914010758009439e+01
+   11 -2.0526018243356614e+01 -7.0603095274350611e+00  7.4991583014498691e+00
+   12  1.3523522841409502e+01  6.2514718031216852e+00 -2.6685798485651077e+01
+   13  3.0043046271322960e-01  7.8431109074530148e-01  4.5328635576848786e+00
+   14  9.6740453376714228e+00 -1.1603919453433887e+01 -3.9205080570168040e+00
+   15  6.5345442098934683e+00 -7.2391735244141353e+00  2.1740136588792641e+01
+   16 -2.0494434617404407e+00  1.3825958913895089e+01  6.7881976680816249e+00
+   17  3.5873871255632257e+00 -4.9641420643094554e+00  3.0180089445086313e+00
+   18  7.9531230378610207e-01  5.2629626454507630e+00 -2.0172279025467247e+01
+   19 -6.1997999385006839e+00 -7.5645533008417640e+00  8.4605118212170112e+00
+   20  5.4044876347145818e+00  2.3015906553910015e+00  1.1711767204250236e+01
+   21  7.6462499201271452e+00  8.9238366526271982e+00 -2.5489460284469285e+01
+   22 -1.3116044133706563e+01 -8.6583255717724796e+00  8.4434021193141291e+00
+   23  5.4697942135794175e+00 -2.6551108085471875e-01  1.7046058165155156e+01
+   24 -4.2497082139356905e+00  1.7742951256065766e+01 -1.0303002556901220e+01
+   25 -3.9777181755411437e+00 -1.2246864111150519e+01  1.7323644652289971e+00
+   26  8.2274263894768342e+00 -5.4960871449152453e+00  8.5706380916722225e+00
+   27 -2.0048516565658536e+00  1.7446299537442144e+01 -5.8315780788096188e+00
+   28 -4.7894277572886974e+00 -1.0406054769833135e+01  1.3027326655161572e-01
+   29  6.7942794138545510e+00 -7.0402447676090096e+00  5.7013048122580035e+00
 ...
diff --git a/unittest/force-styles/tests/angle-class2_p6.yaml b/unittest/force-styles/tests/angle-class2_p6.yaml
index 72ee6626c8..367dc2f450 100644
--- a/unittest/force-styles/tests/angle-class2_p6.yaml
+++ b/unittest/force-styles/tests/angle-class2_p6.yaml
@@ -1,7 +1,8 @@
 ---
-lammps_version: 10 Feb 2021
-date_generated: Fri Feb 26 23:09:23 2021
-epsilon: 2.5e-13
+lammps_version: 14 Dec 2021
+date_generated: Tue Dec 21 11:26:44 2021
+epsilon: 5e-13
+skip_tests:
 prerequisites: ! |
   atom full
   angle class2/p6
@@ -21,73 +22,73 @@ angle_coeff: ! |
   1  ba  20.0  0.0 1.5 1.5
   3  ba  10.0 10.0 1.5 1.5
   4  ba   0.0 20.0 1.5 1.5
-equilibrium: 4 1.9216075064457565 1.9373154697137058 2.0943951023931953 1.8936822384138474
+equilibrium: 4 1.9216075064457567 1.9373154697137058 2.0943951023931953 1.8936822384138476
 extract: ! ""
 natoms: 29
-init_energy: 45.6314933281677
+init_energy: 46.440802879647805
 init_stress: ! |2-
-   1.0380655176146676e+02 -8.5598294304263064e+01  1.0547225768036466e+01  9.3533997442530364e+01 -3.0452940351933286e+01  1.6213985364060581e+00
+   1.1893137300688346e+02 -8.8292852846966269e+01 -1.8830369346769591e+00  1.1299640161336661e+02 -8.5357507162235464e+00  7.6984154056027378e+00
 init_forces: ! |2
-    1  4.7202230480286389e+01  9.2446782757168187e+00 -2.1892853730746999e+01
-    2 -1.7062799943363052e+00 -1.3815333539761312e+01 -1.1109218053986531e+01
-    3 -1.0819442137600213e+01  4.2755643579721394e+01  4.0131978148937094e+01
-    4 -1.6107331786015202e+01 -2.0450833016170897e+01  1.2394698445624217e+01
-    5 -4.1996775288710346e+01 -4.1451276926607676e+01 -1.7536151963373857e+01
-    6  5.6640330905804802e+01 -1.3581373316126099e+01 -3.9714075413905050e+01
-    7 -1.8054191673295325e+01  1.6982840171711775e+01  1.5587849314106879e+00
-    8 -1.0524752828621686e+01 -1.5572494690475857e+01  8.4307006012653105e+01
-    9 -1.4606116746275049e+01  1.7088845087132931e+01  3.5265010989798085e+00
-   10 -9.2183983294618113e+00  2.5308833813269253e+01 -6.6220570658369525e+01
-   11 -2.0536178089795950e+01 -7.0922505905966178e+00  7.5061958159305595e+00
-   12 -2.8507497680544613e-01 -6.5761946491161254e+00 -7.8131884119322397e+00
-   13  3.1055913097859387e+00  8.1431291943766304e+00 -1.2597770427903630e+00
-   14  1.6321556949530429e+01 -6.4737931388148002e+00 -6.0969371866598330e+00
-   15  1.2014697273266693e+01 -4.7819939276460772e+00  1.5497663357748195e+01
-   16  9.3845137052789855e+00  9.2380146356987876e+00  6.9332800900251534e+00
-   17 -8.1437877303591466e-01  1.0335590376878656e+00 -2.1333543954444023e-01
-   18  4.7908727278798890e-01  1.6800089017685975e+00 -5.5268872325673071e+00
-   19 -2.0533806022955563e+00 -2.3525963295672470e+00  2.1320669593942201e+00
-   20  1.5742933295075674e+00  6.7258742779864933e-01  3.3948202731730870e+00
-   21  3.9487365123920992e+00  4.3175299645965515e+00 -1.1481914586060395e+01
-   22 -6.5477065825321601e+00 -4.1918674696340856e+00  3.4304242277121375e+00
-   23  2.5989700701400609e+00 -1.2566249496246562e-01  8.0514903583482571e+00
-   24 -8.4094044796366130e-01  4.6838059399926504e+00 -2.5056978760993669e+00
-   25 -1.3589398825660985e+00 -3.2194809423072277e+00  2.1726605118393005e-01
-   26  2.1998803305297598e+00 -1.4643249976854225e+00  2.2884318249154370e+00
-   27 -1.3120258888530634e-01  2.3248353691437700e+00 -7.0687277351935407e-01
-   28 -7.9798149292664289e-01 -1.3623377973865247e+00 -7.5274691862757134e-02
-   29  9.2918408181194923e-01 -9.6249757175724537e-01  7.8214746538211122e-01
-run_energy: 45.4458285940593
+    1  4.9843888867383036e+01  3.0919808206030721e+00 -3.5311373179346681e+01
+    2 -7.6547138150424543e-01 -6.1978353410493483e+00 -4.9838177318164663e+00
+    3 -2.0133026476712903e+01  6.1564484240393824e+01  5.8546474701306153e+01
+    4 -1.1800938660819003e+01 -2.7342192898717585e+01  3.6024657641736728e+00
+    5 -3.5755844374603512e+01 -5.0659575042462862e+01 -1.9085218337836995e+01
+    6  5.0735338623572588e+01 -1.5634063045804186e+01 -3.6453557303514501e+01
+    7 -1.8030037186030640e+01  1.7008949856719713e+01  1.5544953754976136e+00
+    8 -1.5631909549634866e+01 -2.1569309751931716e+01  7.4640481484122148e+01
+    9 -1.4606116746275049e+01  1.7088845087132935e+01  3.5265010989798080e+00
+   10  4.9396801101823433e+00  3.2889776085852560e+01 -5.8999793708728561e+01
+   11 -2.0536178089795950e+01 -7.0922505905966178e+00  7.5061958159305604e+00
+   12  1.3803472075865960e+01  6.4452957521954772e+00 -2.7233422277061912e+01
+   13  3.5360433418053783e-01  9.1397190361994329e-01  4.4753960144004115e+00
+   14  9.6170274864467693e+00 -1.2052366363084635e+01 -3.9493583144003042e+00
+   15  6.4860666434559526e+00 -7.3468740092959814e+00  2.2291260899220667e+01
+   16 -2.1230180699866263e+00  1.3874295634189895e+01  6.8398950350932433e+00
+   17  3.6034623942756108e+00 -4.9831323377644736e+00  3.0333746639811334e+00
+   18  8.4504245006193557e-01  5.5042292843060157e+00 -2.1046811143441026e+01
+   19 -6.5342340222655810e+00 -7.9348301476147052e+00  8.7785873472291236e+00
+   20  5.6891915722036455e+00  2.4306008633086891e+00  1.2268223796211903e+01
+   21  7.8687833650945196e+00  9.2247809170080028e+00 -2.6147402645985622e+01
+   22 -1.3528171195048888e+01 -8.9511445243295924e+00  8.6148781007624322e+00
+   23  5.6593878299543681e+00 -2.7363639267841045e-01  1.7532524545223190e+01
+   24 -4.3228091005958014e+00  1.8218035729055110e+01 -1.0563459155791266e+01
+   25 -4.1518972082628096e+00 -1.2576945109768744e+01  1.7476216055126070e+00
+   26  8.4747063088586110e+00 -5.6410906192863681e+00  8.8158375502786583e+00
+   27 -1.9941459528172638e+00  1.7618140997642513e+01 -5.9162802759053958e+00
+   28 -4.8714180491026431e+00 -1.0506429970796459e+01  1.3714179556379100e-01
+   29  6.8655640019199069e+00 -7.1117110268460557e+00  5.7791384803416053e+00
+run_energy: 46.04245232805462
 run_stress: ! |2-
-   1.0224054425179162e+02 -8.5151354715203126e+01  1.1081280882090672e+01  9.2245389115064398e+01 -3.1084294543117402e+01  5.2530572051471558e-01
+   1.1725078883087194e+02 -8.7852531332776422e+01 -1.2782508811995987e+00  1.1164371026778049e+02 -9.3237176305934248e+00  6.5885234111489295e+00
 run_forces: ! |2
-    1  4.6697063144899758e+01  9.5464292184313084e+00 -2.1329615486413495e+01
-    2 -1.7341213469343071e+00 -1.4128899311205867e+01 -1.1408865229626386e+01
-    3 -1.0106423829890657e+01  4.2276216760085092e+01  3.9655618981253937e+01
-    4 -1.6122153804166530e+01 -2.0098535935851185e+01  1.2653292967815302e+01
-    5 -4.2111045474433908e+01 -4.1281732299841927e+01 -1.7589329659171515e+01
-    6  5.6633872451307042e+01 -1.3631017853516695e+01 -3.9742648484553712e+01
-    7 -1.8067710814224181e+01  1.7005356519278848e+01  1.5568722867815019e+00
-    8 -1.0458368774535296e+01 -1.5609267760781632e+01  8.4230369919214809e+01
-    9 -1.4631291097137083e+01  1.7116505329041448e+01  3.5366441447941890e+00
-   10 -9.1083165882262076e+00  2.5081593622661249e+01 -6.6121298210985884e+01
-   11 -2.0530357257878578e+01 -7.0564447871249403e+00  7.4970247183653580e+00
-   12 -4.5873997835014801e-01 -6.7403152848189372e+00 -7.3618095936615644e+00
-   13  3.0256572433426814e+00  7.9773984059331964e+00 -1.1599242567025270e+00
-   14  1.6325706512702343e+01 -6.0365250687180749e+00 -6.0596004500231722e+00
-   15  1.2032945183866531e+01 -4.6546995895395877e+00  1.4984301967909428e+01
-   16  9.4390297519529085e+00  9.1831355042234861e+00  6.8856259318712949e+00
-   17 -8.2574532229437936e-01  1.0508025317442151e+00 -2.2665954686757872e-01
-   18  4.1662989358421232e-01  1.4355996421935044e+00 -4.6916034993533247e+00
-   19 -1.7454971864999060e+00 -2.0016595685522596e+00  1.8125096096904596e+00
-   20  1.3288672929156937e+00  5.6605992635875502e-01  2.8790938896628648e+00
-   21  3.7257651397379274e+00  4.0481698601097618e+00 -1.0827748711362894e+01
-   22 -6.1593116505043302e+00 -3.9305093611054267e+00  3.2463350269770475e+00
-   23  2.4335465107664027e+00 -1.1766049900433539e-01  7.5814136843858462e+00
-   24 -7.6007402633523591e-01  4.2086622887143186e+00 -2.2508903023980524e+00
-   25 -1.2107922707762828e+00 -2.8924996458903025e+00  1.9797184221876996e-01
-   26  1.9708662971115187e+00 -1.3161626428240165e+00  2.0529184601792827e+00
-   27 -1.2690270602781828e-01  2.1785298361926788e+00 -6.5914069596767511e-01
-   28 -7.4337188319492098e-01 -1.2769741264135199e+00 -7.1075298663885261e-02
-   29  8.7027458922273926e-01 -9.0155570977915866e-01  7.3021599463156039e-01
+    1  4.9310604335655682e+01  3.4199456337672487e+00 -3.4718316191911107e+01
+    2 -8.0299317271452264e-01 -6.5454081333117653e+00 -5.2864006747626604e+00
+    3 -1.9345060126536442e+01  6.1006753750400470e+01  5.7988403049659411e+01
+    4 -1.1820957153105841e+01 -2.6960422682106536e+01  3.9253092142830135e+00
+    5 -3.5904516097808205e+01 -5.0433344031821314e+01 -1.9146190690682918e+01
+    6  5.0729808044888514e+01 -1.5704289267936588e+01 -3.6492333161602460e+01
+    7 -1.8043506480508537e+01  1.7038390298430201e+01  1.5517454554100727e+00
+    8 -1.5551557230614444e+01 -2.1587186230569777e+01  7.4584717878493251e+01
+    9 -1.4632477702925939e+01  1.7116769814092336e+01  3.5369096257061594e+00
+   10  5.0161578855418893e+00  3.2653366582115979e+01 -5.8914707964769292e+01
+   11 -2.0525187366514267e+01 -7.0595502886286221e+00  7.4984902661515953e+00
+   12  1.3522823846822945e+01  6.2519930281569955e+00 -2.6686365074578834e+01
+   13  3.0043118312382699e-01  7.8431307047186760e-01  4.5328641811452499e+00
+   14  9.6739544192178748e+00 -1.1603985265483198e+01 -3.9204779603734119e+00
+   15  6.5345389143893406e+00 -7.2391722622226151e+00  2.1740132205563690e+01
+   16 -2.0494503520587521e+00  1.3825967887607892e+01  6.7882109957643362e+00
+   17  3.5873870531468781e+00 -4.9641419029625879e+00  3.0180088465038910e+00
+   18  7.9531229968056927e-01  5.2629626038099762e+00 -2.0172278856982508e+01
+   19 -6.1997998900621667e+00 -7.5645532403308380e+00  8.4605117488474431e+00
+   20  5.4044875903815974e+00  2.3015906365208618e+00  1.1711767108135064e+01
+   21  7.6462488915955822e+00  8.9238353733854954e+00 -2.5489456437353834e+01
+   22 -1.3116042308945529e+01 -8.6583243315139509e+00  8.4434007553408890e+00
+   23  5.4697934173499467e+00 -2.6551104187154384e-01  1.7046055682012945e+01
+   24 -4.2497081893768893e+00  1.7742951160854016e+01 -1.0303002500248269e+01
+   25 -3.9777181560986339e+00 -1.2246864045350220e+01  1.7323644544198329e+00
+   26  8.2274263454755232e+00 -5.4960871155037951e+00  8.5706380458284350e+00
+   27 -2.0048516561480048e+00  1.7446299534047586e+01 -5.8315780776594286e+00
+   28 -4.7894277563898591e+00 -1.0406054767803472e+01  1.3027326650644638e-01
+   29  6.7942794125378638e+00 -7.0402447662441157e+00  5.7013048111529825e+00
 ...
diff --git a/unittest/force-styles/tests/angle-cosine_delta.yaml b/unittest/force-styles/tests/angle-cosine_delta.yaml
index e59eda767f..1141e4ceff 100644
--- a/unittest/force-styles/tests/angle-cosine_delta.yaml
+++ b/unittest/force-styles/tests/angle-cosine_delta.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:23 2021
-epsilon: 2.5e-13
+epsilon: 5e-13
 prerequisites: ! |
   atom full
   angle cosine/delta
diff --git a/unittest/force-styles/tests/angle-cosine_shift.yaml b/unittest/force-styles/tests/angle-cosine_shift.yaml
index 0518db6d83..507f75e70e 100644
--- a/unittest/force-styles/tests/angle-cosine_shift.yaml
+++ b/unittest/force-styles/tests/angle-cosine_shift.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:23 2021
-epsilon: 2.5e-13
+epsilon: 1e-12
 prerequisites: ! |
   atom full
   angle cosine/shift
diff --git a/unittest/force-styles/tests/angle-harmonic.yaml b/unittest/force-styles/tests/angle-harmonic.yaml
index 7490bf6764..6e884a40e0 100644
--- a/unittest/force-styles/tests/angle-harmonic.yaml
+++ b/unittest/force-styles/tests/angle-harmonic.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:24 2021
-epsilon: 5e-13
+epsilon: 7.5e-13
 prerequisites: ! |
   atom full
   angle harmonic
diff --git a/unittest/force-styles/tests/angle-quartic.yaml b/unittest/force-styles/tests/angle-quartic.yaml
index 3fd927ac4a..b28882840d 100644
--- a/unittest/force-styles/tests/angle-quartic.yaml
+++ b/unittest/force-styles/tests/angle-quartic.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:25 2021
-epsilon: 4e-13
+epsilon: 7.5e-13
 prerequisites: ! |
   atom full
   angle quartic
diff --git a/unittest/force-styles/tests/atomic-pair-adp.yaml b/unittest/force-styles/tests/atomic-pair-adp.yaml
index 8d8a707bff..852fa103b6 100644
--- a/unittest/force-styles/tests/atomic-pair-adp.yaml
+++ b/unittest/force-styles/tests/atomic-pair-adp.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:08:58 2021
 epsilon: 5e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/atomic-pair-atm.yaml b/unittest/force-styles/tests/atomic-pair-atm.yaml
index 0fc90345a3..76189db0aa 100644
--- a/unittest/force-styles/tests/atomic-pair-atm.yaml
+++ b/unittest/force-styles/tests/atomic-pair-atm.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:08:58 2021
 epsilon: 5e-12
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/atomic-pair-buck_coul_cut_qeq_point.yaml b/unittest/force-styles/tests/atomic-pair-buck_coul_cut_qeq_point.yaml
index 31a44ce897..d79bfd9955 100644
--- a/unittest/force-styles/tests/atomic-pair-buck_coul_cut_qeq_point.yaml
+++ b/unittest/force-styles/tests/atomic-pair-buck_coul_cut_qeq_point.yaml
@@ -2,6 +2,7 @@
 lammps_version: 8 Apr 2021
 date_generated: Tue Apr 20 14:47:51 2021
 epsilon: 7.5e-13
+tags: unstable
 skip_tests: intel single gpu
 prerequisites: ! |
   pair buck/coul/cut
diff --git a/unittest/force-styles/tests/atomic-pair-buck_coul_cut_qeq_shielded.yaml b/unittest/force-styles/tests/atomic-pair-buck_coul_cut_qeq_shielded.yaml
index ca05006535..9b8773c67e 100644
--- a/unittest/force-styles/tests/atomic-pair-buck_coul_cut_qeq_shielded.yaml
+++ b/unittest/force-styles/tests/atomic-pair-buck_coul_cut_qeq_shielded.yaml
@@ -2,6 +2,7 @@
 lammps_version: 8 Apr 2021
 date_generated: Tue Apr 20 14:48:00 2021
 epsilon: 7.5e-13
+tags: unstable
 skip_tests: intel single gpu
 prerequisites: ! |
   pair buck/coul/cut
diff --git a/unittest/force-styles/tests/atomic-pair-eam_fs.yaml b/unittest/force-styles/tests/atomic-pair-eam_fs.yaml
index 58c533d7ec..88b2512214 100644
--- a/unittest/force-styles/tests/atomic-pair-eam_fs.yaml
+++ b/unittest/force-styles/tests/atomic-pair-eam_fs.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:01 2021
 epsilon: 5e-12
 skip_tests: single
diff --git a/unittest/force-styles/tests/atomic-pair-eam_fs_real.yaml b/unittest/force-styles/tests/atomic-pair-eam_fs_real.yaml
index f7739ee824..290a567708 100644
--- a/unittest/force-styles/tests/atomic-pair-eam_fs_real.yaml
+++ b/unittest/force-styles/tests/atomic-pair-eam_fs_real.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:01 2021
 epsilon: 7.5e-12
 skip_tests: single
diff --git a/unittest/force-styles/tests/atomic-pair-edip.yaml b/unittest/force-styles/tests/atomic-pair-edip.yaml
index 91b7fd8db4..9d01cbaf18 100644
--- a/unittest/force-styles/tests/atomic-pair-edip.yaml
+++ b/unittest/force-styles/tests/atomic-pair-edip.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow, unstable
 date_generated: Fri Feb 26 23:09:02 2021
-epsilon: 7.5e-13
+epsilon: 7e-9
 prerequisites: ! |
   pair edip
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/atomic-pair-eim.yaml b/unittest/force-styles/tests/atomic-pair-eim.yaml
index 4814029a3c..e705e88614 100644
--- a/unittest/force-styles/tests/atomic-pair-eim.yaml
+++ b/unittest/force-styles/tests/atomic-pair-eim.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:02 2021
-epsilon: 1e-11
+epsilon: 2e-11
 prerequisites: ! |
   pair eim
 pre_commands: ! ""
diff --git a/unittest/force-styles/tests/atomic-pair-hybrid-eam.yaml b/unittest/force-styles/tests/atomic-pair-hybrid-eam.yaml
index 737b054dd0..86b03d3a34 100644
--- a/unittest/force-styles/tests/atomic-pair-hybrid-eam.yaml
+++ b/unittest/force-styles/tests/atomic-pair-hybrid-eam.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 8 Apr 2021
 date_generated: Mon Apr 19 08:49:08 2021
-epsilon: 1e-11
+epsilon: 2e-10
+tags: unstable
 skip_tests: single
 prerequisites: ! |
   pair eam/fs
diff --git a/unittest/force-styles/tests/atomic-pair-hybrid-eam_fs.yaml b/unittest/force-styles/tests/atomic-pair-hybrid-eam_fs.yaml
index d45dc9190e..8b9cd93505 100644
--- a/unittest/force-styles/tests/atomic-pair-hybrid-eam_fs.yaml
+++ b/unittest/force-styles/tests/atomic-pair-hybrid-eam_fs.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 8 Apr 2021
+tags: slow
 date_generated: Mon Apr 19 08:49:08 2021
 epsilon: 5e-12
 skip_tests: single
diff --git a/unittest/force-styles/tests/atomic-pair-meam.yaml b/unittest/force-styles/tests/atomic-pair-meam.yaml
index 08193987f8..70bb16a330 100644
--- a/unittest/force-styles/tests/atomic-pair-meam.yaml
+++ b/unittest/force-styles/tests/atomic-pair-meam.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:03 2021
 epsilon: 2.5e-12
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/atomic-pair-meam_spline.yaml b/unittest/force-styles/tests/atomic-pair-meam_spline.yaml
index 9ee9ee13fb..0de2fba11c 100644
--- a/unittest/force-styles/tests/atomic-pair-meam_spline.yaml
+++ b/unittest/force-styles/tests/atomic-pair-meam_spline.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:03 2021
 epsilon: 5e-14
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/atomic-pair-meam_sw_spline.yaml b/unittest/force-styles/tests/atomic-pair-meam_sw_spline.yaml
index b9f330e6de..13f1761f8f 100644
--- a/unittest/force-styles/tests/atomic-pair-meam_sw_spline.yaml
+++ b/unittest/force-styles/tests/atomic-pair-meam_sw_spline.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow, unstable
 date_generated: Fri Feb 26 23:09:03 2021
-epsilon: 1e-14
+epsilon: 5e-13
 prerequisites: ! |
   pair meam/sw/spline
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/atomic-pair-reaxff-acks2.yaml b/unittest/force-styles/tests/atomic-pair-reaxff-acks2.yaml
index 2444e76746..0026e9d715 100644
--- a/unittest/force-styles/tests/atomic-pair-reaxff-acks2.yaml
+++ b/unittest/force-styles/tests/atomic-pair-reaxff-acks2.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 31 Aug 2021
+tags: slow, unstable
 date_generated: Tue Sep 21 15:02:20 2021
-epsilon: 5e-9
+epsilon: 7.5e-9
 skip_tests: omp
 prerequisites: ! |
   pair reaxff
diff --git a/unittest/force-styles/tests/atomic-pair-reaxff-acks2_efield.yaml b/unittest/force-styles/tests/atomic-pair-reaxff-acks2_efield.yaml
index 295b343dea..e6f9599494 100644
--- a/unittest/force-styles/tests/atomic-pair-reaxff-acks2_efield.yaml
+++ b/unittest/force-styles/tests/atomic-pair-reaxff-acks2_efield.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 29 Sep 2021
+tags: slow, unstable
 date_generated: Wed Oct 13 18:05:37 2021
 epsilon: 5e-09
 skip_tests: omp
diff --git a/unittest/force-styles/tests/atomic-pair-reaxff.yaml b/unittest/force-styles/tests/atomic-pair-reaxff.yaml
index ccb88d6ca4..1d540363bb 100644
--- a/unittest/force-styles/tests/atomic-pair-reaxff.yaml
+++ b/unittest/force-styles/tests/atomic-pair-reaxff.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Mon Aug 23 20:32:03 2021
 epsilon: 2e-10
 skip_tests:
diff --git a/unittest/force-styles/tests/atomic-pair-reaxff_lgvdw.yaml b/unittest/force-styles/tests/atomic-pair-reaxff_lgvdw.yaml
index b4afe5942a..363f7cf74d 100644
--- a/unittest/force-styles/tests/atomic-pair-reaxff_lgvdw.yaml
+++ b/unittest/force-styles/tests/atomic-pair-reaxff_lgvdw.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Mon Aug 23 20:32:03 2021
 epsilon: 4e-12
 skip_tests:
diff --git a/unittest/force-styles/tests/atomic-pair-reaxff_noqeq.yaml b/unittest/force-styles/tests/atomic-pair-reaxff_noqeq.yaml
index 4fb058a2af..f02caf13cc 100644
--- a/unittest/force-styles/tests/atomic-pair-reaxff_noqeq.yaml
+++ b/unittest/force-styles/tests/atomic-pair-reaxff_noqeq.yaml
@@ -1,7 +1,9 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Mon Aug 23 20:32:04 2021
-epsilon: 5e-13
+epsilon: 2e-11
+tags: unstable
 skip_tests:
 prerequisites: ! |
   pair reaxff
diff --git a/unittest/force-styles/tests/atomic-pair-reaxff_tabulate.yaml b/unittest/force-styles/tests/atomic-pair-reaxff_tabulate.yaml
index dd8f5d8103..5acd407191 100644
--- a/unittest/force-styles/tests/atomic-pair-reaxff_tabulate.yaml
+++ b/unittest/force-styles/tests/atomic-pair-reaxff_tabulate.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Mon Aug 23 20:32:05 2021
 epsilon: 1e-12
 skip_tests:
diff --git a/unittest/force-styles/tests/bond-fene_nm.yaml b/unittest/force-styles/tests/bond-fene_nm.yaml
new file mode 100644
index 0000000000..3877257e18
--- /dev/null
+++ b/unittest/force-styles/tests/bond-fene_nm.yaml
@@ -0,0 +1,90 @@
+---
+lammps_version: 27 Oct 2021
+date_generated: Fri Dec  3 14:19:54 2021
+epsilon: 2.5e-13
+skip_tests:
+prerequisites: ! |
+  atom full
+  bond fene/nm
+pre_commands: ! ""
+post_commands: ! ""
+input_file: in.fourmol
+bond_style: fene/nm
+bond_coeff: ! |
+  1 250 3   0.022 1.5 12 6
+  2 300 2.2 0.005 1.1 12 6
+  3 350 2.6 0.022 1.3 12 6
+  4 650 2.4 0.015 1.2 12 6
+  5 450 2   0.018 1   12 6
+equilibrium: 5 1.455 1.067 1.261 1.164 0.97
+extract: ! |
+  kappa 1
+  r0 1
+natoms: 29
+init_energy: 7104.538647187164
+init_stress: ! |-
+  -4.9973815323625913e+03 -4.7361840472269523e+03 -6.8040395883503370e+03 -2.4269128277669373e+02 -6.6891138241387910e+02 -4.4331343074095361e+02
+init_forces: ! |2
+    1  1.5241450143465187e+02 -3.0525977546573915e+02 -6.3458098267970195e+02
+    2 -3.5350460570830404e+02 -2.9090700339261923e+02  4.1606484768643804e+02
+    3  1.0551830305471503e+02  2.2401937719372620e+02  1.4606307001232116e+02
+    4  3.5536921099636709e+02 -9.9624087144545939e+01  2.5214357319063441e+02
+    5  7.9720020262024519e+01  1.2250810321475352e+02 -4.0706067052901318e+02
+    6  1.0184572276286774e+02 -1.8391374409419583e+02 -5.1597162813841442e+02
+    7 -4.0003839507681207e+01  2.0519187988030288e+02  1.0237011026132983e+03
+    8  1.2654592554252514e+02  2.3377280771604660e+01  1.4927958489160193e+02
+    9 -1.2784757582569983e+02 -1.3769968301295265e+02 -5.6576982718291401e+02
+   10  5.3144960748116091e+01  2.0682679138164244e+02 -1.1012793207246887e+02
+   11  9.0056609319223625e+01  2.2307819276380619e+02  3.6602637579091163e+02
+   12 -7.9803260536482895e+01 -2.2713463100625972e+01 -5.6423395492467080e+01
+   13 -4.2195172960814563e+02  1.6691974609782486e+02  7.9306125211916001e+00
+   14  1.6125522409974846e+02 -3.2813299981539174e+01  4.1818673855319895e+02
+   15  1.1077438848232475e+01 -4.2564645386754370e+02 -1.5168532391605805e+02
+   16 -5.8443350578374441e+02  5.8509422994783404e+02  6.4542809784238375e+02
+   17  3.7059659990158599e+02 -2.5843809119173306e+02 -9.8320424309094221e+02
+   18  8.9256815127765776e+00  1.6559918238482518e+02 -7.0012791058798700e+02
+   19  3.6803420870363448e+02  2.3544833528119514e+02  4.4586262125506545e+02
+   20 -3.7695989021641105e+02 -4.0104751766602033e+02  2.5426528933292158e+02
+   21  1.7530000271774043e+02  2.2134894285710348e+02 -6.6583012739562821e+02
+   22  3.2454001444420487e+02  7.8649201103816580e+01  5.0916711557197374e+02
+   23 -4.9984001716194530e+02 -2.9999814396092006e+02  1.5666301182365447e+02
+   24 -1.5679133653626826e+02  5.9193607682634206e+02 -3.5591848499590157e+02
+   25  4.6478359688537853e+02 -7.4823424107551176e+01  3.9073484597754532e+02
+   26 -3.0799226034911027e+02 -5.1711265271879086e+02 -3.4816360981643761e+01
+   27 -7.4220583788956276e+01  6.5764368695517567e+02 -2.2071984289822808e+02
+   28  4.6718290842881504e+02 -2.0077389283014068e+02  3.1610048906698665e+02
+   29 -3.9296232463985876e+02 -4.5686979412503501e+02 -9.5380646168758574e+01
+run_energy: 7049.171322970109
+run_stress: ! |-
+  -4.9544161293431789e+03 -4.6990550365348445e+03 -6.7357951494828403e+03 -2.3243877860807157e+02 -6.6038849056571530e+02 -4.4410638672086759e+02
+run_forces: ! |2
+    1  1.4925184076342319e+02 -3.0497744106969486e+02 -6.2784067871414572e+02
+    2 -3.5055563426526174e+02 -2.8969290648665157e+02  4.1046831743568976e+02
+    3  1.0822066499019527e+02  2.2086785758674949e+02  1.4249480367106142e+02
+    4  3.5360507900323955e+02 -9.8401837560273805e+01  2.5161742643297779e+02
+    5  7.9391884770676285e+01  1.2222910560980677e+02 -4.0476666600643921e+02
+    6  1.0148964708467207e+02 -1.8306087075965752e+02 -5.1023226196445353e+02
+    7 -3.9689179395515843e+01  2.0474963404677590e+02  1.0194778084830103e+03
+    8  1.2494972036721282e+02  2.2693521195529684e+01  1.4337384460208744e+02
+    9 -1.2680327854648024e+02 -1.3607572678714394e+02 -5.6105845449910407e+02
+   10  5.2822761335347593e+01  2.0635121732402425e+02 -1.0738766733609498e+02
+   11  8.9826435940718611e+01  2.2259034819967130e+02  3.6462605823988332e+02
+   12 -8.1329798000939590e+01 -2.4224591946313410e+01 -5.4503253889220076e+01
+   13 -4.2028471353530676e+02  1.6533657154228302e+02  7.7423382445151354e+00
+   14  1.6053023768271433e+02 -3.2120126037122041e+01  4.1639049425325879e+02
+   15  1.1795806566552764e+01 -4.2278785489326015e+02 -1.5159128537705374e+02
+   16 -5.8035825809369146e+02  5.8260240314584871e+02  6.3789796224434861e+02
+   17  3.6713678333244314e+02 -2.5607930311057180e+02 -9.7670878582032128e+02
+   18  1.1072106934268504e+01  1.6539902947978504e+02 -6.9048689626242947e+02
+   19  3.6443523192099309e+02  2.3398120441416819e+02  4.3945613416480091e+02
+   20 -3.7550733885526159e+02 -3.9938023389395323e+02  2.5103076209762855e+02
+   21  1.7289611658869148e+02  2.1683350109152377e+02 -6.5624862555490085e+02
+   22  3.2226229246629543e+02  7.8911796487282629e+01  5.0261026779229326e+02
+   23 -4.9515840905498692e+02 -2.9574529757880640e+02  1.5363835776260760e+02
+   24 -1.5408568685034606e+02  5.8248946315011631e+02 -3.4957921699182202e+02
+   25  4.5869392727836487e+02 -7.2090654936792617e+01  3.8544556642324892e+02
+   26 -3.0460824042801880e+02 -5.1039880821332372e+02 -3.5866349431426890e+01
+   27 -7.3889199777519138e+01  6.4943202311030745e+02 -2.1665571654975884e+02
+   28  4.6272717765429780e+02 -1.9779275870683830e+02  3.1223774949848246e+02
+   29 -3.8883797787677867e+02 -4.5163926440346921e+02 -9.5582032948723636e+01
+...
diff --git a/unittest/force-styles/tests/bond-gaussian.yaml b/unittest/force-styles/tests/bond-gaussian.yaml
index b5553fb053..944ff630c6 100644
--- a/unittest/force-styles/tests/bond-gaussian.yaml
+++ b/unittest/force-styles/tests/bond-gaussian.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:21 2021
-epsilon: 2.5e-13
+epsilon: 1e-12
 prerequisites: ! |
   atom full
   bond gaussian
diff --git a/unittest/force-styles/tests/dihedral-quadratic.yaml b/unittest/force-styles/tests/dihedral-quadratic.yaml
index b1c59e131f..e335e7d822 100644
--- a/unittest/force-styles/tests/dihedral-quadratic.yaml
+++ b/unittest/force-styles/tests/dihedral-quadratic.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: unstable
 date_generated: Fri Feb 26 23:09:35 2021
 epsilon: 1e-12
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/dihedral-table_cut_linear.yaml b/unittest/force-styles/tests/dihedral-table_cut_linear.yaml
index fdfa9bfcb5..00894aecdb 100644
--- a/unittest/force-styles/tests/dihedral-table_cut_linear.yaml
+++ b/unittest/force-styles/tests/dihedral-table_cut_linear.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 10 Mar 2021
+tags: unstable
 date_generated: Tue Mar 23 08:05:02 202
-epsilon: 5e-14
+epsilon: 1e-13
 prerequisites: ! |
   atom full
   dihedral table/cut
diff --git a/unittest/force-styles/tests/dihedral-table_linear.yaml b/unittest/force-styles/tests/dihedral-table_linear.yaml
index 608208e275..d114e86b81 100644
--- a/unittest/force-styles/tests/dihedral-table_linear.yaml
+++ b/unittest/force-styles/tests/dihedral-table_linear.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Mar 2021
+tags: unstable
 date_generated: Mon Mar 22 21:19:05 202
 epsilon: 7.5e-14
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/fix-timestep-drag.yaml b/unittest/force-styles/tests/fix-timestep-drag.yaml
index fbcee25721..b7e61ab3ad 100644
--- a/unittest/force-styles/tests/fix-timestep-drag.yaml
+++ b/unittest/force-styles/tests/fix-timestep-drag.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:53 2021
-epsilon: 5e-14
+epsilon: 2e-13
 prerequisites: ! |
   atom full
   fix drag
diff --git a/unittest/force-styles/tests/fix-timestep-momentum.yaml b/unittest/force-styles/tests/fix-timestep-momentum.yaml
index 4ae3b928bf..de60504db5 100644
--- a/unittest/force-styles/tests/fix-timestep-momentum.yaml
+++ b/unittest/force-styles/tests/fix-timestep-momentum.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:54 2021
-epsilon: 5e-14
+epsilon: 2e-13
 prerequisites: ! |
   atom full
   fix momentum
diff --git a/unittest/force-styles/tests/fix-timestep-npt_sphere_tri.yaml b/unittest/force-styles/tests/fix-timestep-npt_sphere_tri.yaml
index a30917cbc6..204f09bdbf 100644
--- a/unittest/force-styles/tests/fix-timestep-npt_sphere_tri.yaml
+++ b/unittest/force-styles/tests/fix-timestep-npt_sphere_tri.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 30 Jul 2021
 date_generated: Sun Aug 22 14:15:09 2021
-epsilon: 5e-13
+epsilon: 1e-12
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/fix-timestep-nvt.yaml b/unittest/force-styles/tests/fix-timestep-nvt.yaml
index 298b506ab2..500bb47c76 100644
--- a/unittest/force-styles/tests/fix-timestep-nvt.yaml
+++ b/unittest/force-styles/tests/fix-timestep-nvt.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:55 2021
-epsilon: 2e-13
+epsilon: 5e-13
 prerequisites: ! |
   atom full
   fix nvt
diff --git a/unittest/force-styles/tests/fix-timestep-rattle_angle.yaml b/unittest/force-styles/tests/fix-timestep-rattle_angle.yaml
index 92ac0b5cab..9c432c8db8 100644
--- a/unittest/force-styles/tests/fix-timestep-rattle_angle.yaml
+++ b/unittest/force-styles/tests/fix-timestep-rattle_angle.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:55 2021
-epsilon: 3e-10
+epsilon: 9e-10
+tags: unstable
 prerequisites: ! |
   atom full
   fix rattle
diff --git a/unittest/force-styles/tests/fix-timestep-rigid_single.yaml b/unittest/force-styles/tests/fix-timestep-rigid_single.yaml
index 61957af75d..4622e159d2 100644
--- a/unittest/force-styles/tests/fix-timestep-rigid_single.yaml
+++ b/unittest/force-styles/tests/fix-timestep-rigid_single.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:57 2021
-epsilon: 5e-13
+epsilon: 7.5e-13
 prerequisites: ! |
   atom full
   fix rigid
diff --git a/unittest/force-styles/tests/fix-timestep-shake_angle.yaml b/unittest/force-styles/tests/fix-timestep-shake_angle.yaml
index 3317f7a187..0b6c1f7093 100644
--- a/unittest/force-styles/tests/fix-timestep-shake_angle.yaml
+++ b/unittest/force-styles/tests/fix-timestep-shake_angle.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:58 2021
-epsilon: 3e-10
+tags: unstable
+epsilon: 9e-10
 prerequisites: ! |
   atom full
   fix shake
diff --git a/unittest/force-styles/tests/fix-timestep-temp_berendsen.yaml b/unittest/force-styles/tests/fix-timestep-temp_berendsen.yaml
index 8b2d946782..631f7c2cae 100644
--- a/unittest/force-styles/tests/fix-timestep-temp_berendsen.yaml
+++ b/unittest/force-styles/tests/fix-timestep-temp_berendsen.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:59 2021
-epsilon: 2e-14
+epsilon: 5e-14
 prerequisites: ! |
   atom full
   fix temp/berendsen
diff --git a/unittest/force-styles/tests/fix-timestep-temp_csld.yaml b/unittest/force-styles/tests/fix-timestep-temp_csld.yaml
index a25a54e733..8266fe11c1 100644
--- a/unittest/force-styles/tests/fix-timestep-temp_csld.yaml
+++ b/unittest/force-styles/tests/fix-timestep-temp_csld.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:59 2021
-epsilon: 2e-14
+epsilon: 5e-14
 prerequisites: ! |
   atom full
   fix temp/csld
diff --git a/unittest/force-styles/tests/fix-timestep-temp_csvr.yaml b/unittest/force-styles/tests/fix-timestep-temp_csvr.yaml
index c66a1f2f59..91de6e649c 100644
--- a/unittest/force-styles/tests/fix-timestep-temp_csvr.yaml
+++ b/unittest/force-styles/tests/fix-timestep-temp_csvr.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:59 2021
-epsilon: 3e-14
+epsilon: 5e-14
 prerequisites: ! |
   atom full
   fix temp/csvr
diff --git a/unittest/force-styles/tests/fix-timestep-temp_rescale.yaml b/unittest/force-styles/tests/fix-timestep-temp_rescale.yaml
index 9cb1a096e5..a97c507c86 100644
--- a/unittest/force-styles/tests/fix-timestep-temp_rescale.yaml
+++ b/unittest/force-styles/tests/fix-timestep-temp_rescale.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:59 2021
-epsilon: 2e-13
+epsilon: 5e-13
 prerequisites: ! |
   atom full
   fix temp/rescale
diff --git a/unittest/force-styles/tests/improper-harmonic.yaml b/unittest/force-styles/tests/improper-harmonic.yaml
index fb147a2e71..10f58c12b6 100644
--- a/unittest/force-styles/tests/improper-harmonic.yaml
+++ b/unittest/force-styles/tests/improper-harmonic.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Wed Feb 24 19:35:15 202
-epsilon: 2e-11
+epsilon: 5e-11
 prerequisites: ! |
   atom full
   improper harmonic
diff --git a/unittest/force-styles/tests/kspace-ewald_tilted.yaml b/unittest/force-styles/tests/kspace-ewald_tilted.yaml
index 18d6278187..c75eafdf12 100644
--- a/unittest/force-styles/tests/kspace-ewald_tilted.yaml
+++ b/unittest/force-styles/tests/kspace-ewald_tilted.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 30 Jul 2021
 date_generated: Tue Aug 24 16:00:03 2021
-epsilon: 8.5e-14
+epsilon: 1e-13
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/kspace-msm.yaml b/unittest/force-styles/tests/kspace-msm.yaml
index 13e2e0a2c2..d87e69ea69 100644
--- a/unittest/force-styles/tests/kspace-msm.yaml
+++ b/unittest/force-styles/tests/kspace-msm.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:26 2021
 epsilon: 5e-11
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-msm_cg.yaml b/unittest/force-styles/tests/kspace-msm_cg.yaml
index 90b5999796..49d5a420d1 100644
--- a/unittest/force-styles/tests/kspace-msm_cg.yaml
+++ b/unittest/force-styles/tests/kspace-msm_cg.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:26 2021
 epsilon: 5e-11
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-msm_nopbc.yaml b/unittest/force-styles/tests/kspace-msm_nopbc.yaml
index 9c071e90eb..ea02582ebf 100644
--- a/unittest/force-styles/tests/kspace-msm_nopbc.yaml
+++ b/unittest/force-styles/tests/kspace-msm_nopbc.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:27 2021
 epsilon: 5e-11
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_cg.yaml b/unittest/force-styles/tests/kspace-pppm_cg.yaml
index e6bb6c78f3..0e8ba9a541 100644
--- a/unittest/force-styles/tests/kspace-pppm_cg.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_cg.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:29 2021
 epsilon: 7.5e-14
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_cg_ad.yaml b/unittest/force-styles/tests/kspace-pppm_cg_ad.yaml
index a8d9efc1fa..1883ed6cfe 100644
--- a/unittest/force-styles/tests/kspace-pppm_cg_ad.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_cg_ad.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:29 2021
 epsilon: 7.5e-14
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_cg_tiled.yaml b/unittest/force-styles/tests/kspace-pppm_cg_tiled.yaml
index 270ee0418c..9111789f95 100644
--- a/unittest/force-styles/tests/kspace-pppm_cg_tiled.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_cg_tiled.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:29 2021
 epsilon: 7.5e-14
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_dipole.yaml b/unittest/force-styles/tests/kspace-pppm_dipole.yaml
index 76c6516f51..e220730205 100644
--- a/unittest/force-styles/tests/kspace-pppm_dipole.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_dipole.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow
 date_generated: Sat Aug 21 20:49:01 2021
 epsilon: 5e-12
 skip_tests: extract gpu single
diff --git a/unittest/force-styles/tests/kspace-pppm_disp.yaml b/unittest/force-styles/tests/kspace-pppm_disp.yaml
index e7693c8980..2b09606811 100644
--- a/unittest/force-styles/tests/kspace-pppm_disp.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_disp.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:30 2021
 epsilon: 2.5e-13
 skip_tests: intel
diff --git a/unittest/force-styles/tests/kspace-pppm_disp_ad.yaml b/unittest/force-styles/tests/kspace-pppm_disp_ad.yaml
index 11285b1a23..2aa71d6008 100644
--- a/unittest/force-styles/tests/kspace-pppm_disp_ad.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_disp_ad.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:31 2021
 epsilon: 2.5e-13
 skip_tests: intel
diff --git a/unittest/force-styles/tests/kspace-pppm_disp_ad_only.yaml b/unittest/force-styles/tests/kspace-pppm_disp_ad_only.yaml
index 81380966d8..61e6cc2a8b 100644
--- a/unittest/force-styles/tests/kspace-pppm_disp_ad_only.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_disp_ad_only.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:31 2021
-epsilon: 2.5e-13
+epsilon: 5e-13
 skip_tests: intel
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/kspace-pppm_disp_tip4p.yaml b/unittest/force-styles/tests/kspace-pppm_disp_tip4p.yaml
index 047a623995..8fae3e1de0 100644
--- a/unittest/force-styles/tests/kspace-pppm_disp_tip4p.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_disp_tip4p.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:32 2021
 epsilon: 2.5e-13
 skip_tests: intel
diff --git a/unittest/force-styles/tests/kspace-pppm_slab.yaml b/unittest/force-styles/tests/kspace-pppm_slab.yaml
index aa1566aab5..2d9f1e939c 100644
--- a/unittest/force-styles/tests/kspace-pppm_slab.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_slab.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:32 2021
 epsilon: 7.5e-14
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_stagger.yaml b/unittest/force-styles/tests/kspace-pppm_stagger.yaml
index ea24b6cfd3..58f684eef7 100644
--- a/unittest/force-styles/tests/kspace-pppm_stagger.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_stagger.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:33 2021
 epsilon: 7.5e-14
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_stagger_tiled.yaml b/unittest/force-styles/tests/kspace-pppm_stagger_tiled.yaml
index cb8ba3813e..ebb93f1e9a 100644
--- a/unittest/force-styles/tests/kspace-pppm_stagger_tiled.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_stagger_tiled.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:33 2021
 epsilon: 7.5e-14
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_tiled.yaml b/unittest/force-styles/tests/kspace-pppm_tiled.yaml
index d1b95c8ca9..b68b19aa4e 100644
--- a/unittest/force-styles/tests/kspace-pppm_tiled.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_tiled.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:33 2021
 epsilon: 7.5e-14
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_tip4p.yaml b/unittest/force-styles/tests/kspace-pppm_tip4p.yaml
index f5785b54d7..e40c14a713 100644
--- a/unittest/force-styles/tests/kspace-pppm_tip4p.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_tip4p.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:33 2021
 epsilon: 2e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_tip4p_ad.yaml b/unittest/force-styles/tests/kspace-pppm_tip4p_ad.yaml
index 219addb5f0..56ce1ff511 100644
--- a/unittest/force-styles/tests/kspace-pppm_tip4p_ad.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_tip4p_ad.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:34 2021
 epsilon: 4e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_tip4p_nozforce.yaml b/unittest/force-styles/tests/kspace-pppm_tip4p_nozforce.yaml
index 846e1a8605..4a75db579a 100644
--- a/unittest/force-styles/tests/kspace-pppm_tip4p_nozforce.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_tip4p_nozforce.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:34 2021
 epsilon: 2e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/kspace-pppm_tip4p_slab.yaml b/unittest/force-styles/tests/kspace-pppm_tip4p_slab.yaml
index 35a8ef56f5..51c5a45941 100644
--- a/unittest/force-styles/tests/kspace-pppm_tip4p_slab.yaml
+++ b/unittest/force-styles/tests/kspace-pppm_tip4p_slab.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:34 2021
 epsilon: 5e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-airebo.yaml b/unittest/force-styles/tests/manybody-pair-airebo.yaml
index 58c81fb093..8b0eda5567 100644
--- a/unittest/force-styles/tests/manybody-pair-airebo.yaml
+++ b/unittest/force-styles/tests/manybody-pair-airebo.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:10 2021
 epsilon: 5e-06
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-airebo_00.yaml b/unittest/force-styles/tests/manybody-pair-airebo_00.yaml
index aa9aea8ca9..62a56f5ab1 100644
--- a/unittest/force-styles/tests/manybody-pair-airebo_00.yaml
+++ b/unittest/force-styles/tests/manybody-pair-airebo_00.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:11 2021
 epsilon: 1e-07
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-airebo_m.yaml b/unittest/force-styles/tests/manybody-pair-airebo_m.yaml
index ae1e670cea..67b14b05c9 100644
--- a/unittest/force-styles/tests/manybody-pair-airebo_m.yaml
+++ b/unittest/force-styles/tests/manybody-pair-airebo_m.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:11 2021
 epsilon: 1e-07
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-airebo_m00.yaml b/unittest/force-styles/tests/manybody-pair-airebo_m00.yaml
index eecfd2a08e..73c6054ec0 100644
--- a/unittest/force-styles/tests/manybody-pair-airebo_m00.yaml
+++ b/unittest/force-styles/tests/manybody-pair-airebo_m00.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:11 2021
 epsilon: 1e-07
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-bop.yaml b/unittest/force-styles/tests/manybody-pair-bop.yaml
index 8260b16d21..f809a2db66 100644
--- a/unittest/force-styles/tests/manybody-pair-bop.yaml
+++ b/unittest/force-styles/tests/manybody-pair-bop.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 8 Apr 2021
+tags: slow, unstable
 date_generated: Wed May  5 11:50:15 2021
-epsilon: 5e-13
+epsilon: 5e-12
 prerequisites: ! |
   pair bop
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-bop_save.yaml b/unittest/force-styles/tests/manybody-pair-bop_save.yaml
index e5699cf8b8..77388eb6d6 100644
--- a/unittest/force-styles/tests/manybody-pair-bop_save.yaml
+++ b/unittest/force-styles/tests/manybody-pair-bop_save.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 8 Apr 2021
+tags: slow, unstable
 date_generated: Wed May  5 11:50:24 2021
-epsilon: 9e-13
+epsilon: 2e-11
 prerequisites: ! |
   pair bop
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-comb.yaml b/unittest/force-styles/tests/manybody-pair-comb.yaml
index d41aa476db..b64cc2a6e9 100644
--- a/unittest/force-styles/tests/manybody-pair-comb.yaml
+++ b/unittest/force-styles/tests/manybody-pair-comb.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow, unstable
 date_generated: Fri Feb 26 23:09:14 2021
 epsilon: 1e-12
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-comb3.yaml b/unittest/force-styles/tests/manybody-pair-comb3.yaml
index 2238108055..6c8290cb10 100644
--- a/unittest/force-styles/tests/manybody-pair-comb3.yaml
+++ b/unittest/force-styles/tests/manybody-pair-comb3.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow, unstable
 date_generated: Fri Feb 26 23:09:14 2021
 epsilon: 1e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-drip.yaml b/unittest/force-styles/tests/manybody-pair-drip.yaml
index f36436183a..15e71eaad3 100644
--- a/unittest/force-styles/tests/manybody-pair-drip.yaml
+++ b/unittest/force-styles/tests/manybody-pair-drip.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Tue Aug 24 15:36:39 2021
-epsilon: 7.5e-11
+epsilon: 7.5e-09
 skip_tests: single
 prerequisites: ! |
   pair drip
diff --git a/unittest/force-styles/tests/manybody-pair-drip_real.yaml b/unittest/force-styles/tests/manybody-pair-drip_real.yaml
index 78a9d0e6a3..0a332c7c1f 100644
--- a/unittest/force-styles/tests/manybody-pair-drip_real.yaml
+++ b/unittest/force-styles/tests/manybody-pair-drip_real.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Tue Aug 24 15:36:41 2021
-epsilon: 2e-10
+epsilon: 8e-9
 skip_tests: single
 prerequisites: ! |
   pair drip
diff --git a/unittest/force-styles/tests/manybody-pair-ilp-graphene-hbn.yaml b/unittest/force-styles/tests/manybody-pair-ilp-graphene-hbn.yaml
index ec91e147ad..ee946d0534 100644
--- a/unittest/force-styles/tests/manybody-pair-ilp-graphene-hbn.yaml
+++ b/unittest/force-styles/tests/manybody-pair-ilp-graphene-hbn.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Tue Aug 24 15:36:42 2021
-epsilon: 5e-13
+epsilon: 5e-12
 skip_tests: single
 prerequisites: ! |
   pair ilp/graphene/hbn
diff --git a/unittest/force-styles/tests/manybody-pair-ilp-graphene-hbn_notaper.yaml b/unittest/force-styles/tests/manybody-pair-ilp-graphene-hbn_notaper.yaml
index 51d72dc40b..ba7ccf6856 100644
--- a/unittest/force-styles/tests/manybody-pair-ilp-graphene-hbn_notaper.yaml
+++ b/unittest/force-styles/tests/manybody-pair-ilp-graphene-hbn_notaper.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Tue Aug 24 15:36:45 2021
-epsilon: 9e-13
+epsilon: 1e-11
 skip_tests: single
 prerequisites: ! |
   pair ilp/graphene/hbn
diff --git a/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_full.yaml b/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_full.yaml
index aa14a037d9..28430b75d4 100644
--- a/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_full.yaml
+++ b/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_full.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Tue Aug 24 15:36:47 2021
-epsilon: 5e-12
+epsilon: 5e-11
 skip_tests: single
 prerequisites: ! |
   pair kolmogorov/crespi/full
diff --git a/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_full_notaper.yaml b/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_full_notaper.yaml
index dfa6249013..2e0fac74e1 100644
--- a/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_full_notaper.yaml
+++ b/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_full_notaper.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Tue Aug 24 15:36:49 2021
 epsilon: 5e-12
 skip_tests: single
diff --git a/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_z.yaml b/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_z.yaml
index e59d9851ea..e1f3d7136b 100644
--- a/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_z.yaml
+++ b/unittest/force-styles/tests/manybody-pair-kolmogorov_crespi_z.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 30 Jul 2021
+tags: slow, unstable
 date_generated: Wed Aug 25 07:14:32 2021
 epsilon: 5e-13
 skip_tests: single
diff --git a/unittest/force-styles/tests/manybody-pair-lebedeva_z.yaml b/unittest/force-styles/tests/manybody-pair-lebedeva_z.yaml
index 5caeee3dca..84b44bcb0a 100644
--- a/unittest/force-styles/tests/manybody-pair-lebedeva_z.yaml
+++ b/unittest/force-styles/tests/manybody-pair-lebedeva_z.yaml
@@ -14,7 +14,7 @@ post_commands: ! ""
 input_file: in.bilayer
 pair_style: hybrid/overlay lebedeva/z 16.0
 pair_coeff: ! |
-  * * lebedeva/z CC.Lebedeva C C C
+  * * lebedeva/z CC.Lebedeva C1 C1 C1
 extract: ! ""
 natoms: 48
 init_vdwl: 2360.887727742073
diff --git a/unittest/force-styles/tests/manybody-pair-meam.yaml b/unittest/force-styles/tests/manybody-pair-meam.yaml
index 4237f5ffc2..fed2a060cf 100644
--- a/unittest/force-styles/tests/manybody-pair-meam.yaml
+++ b/unittest/force-styles/tests/manybody-pair-meam.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:15 2021
-epsilon: 7.5e-12
+epsilon: 1e-10
 prerequisites: ! |
   pair meam
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-mliap_nn.yaml b/unittest/force-styles/tests/manybody-pair-mliap_nn.yaml
index 5270c9571f..04f742536a 100644
--- a/unittest/force-styles/tests/manybody-pair-mliap_nn.yaml
+++ b/unittest/force-styles/tests/manybody-pair-mliap_nn.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Mar 2021
+tags: slow
 date_generated: Wed Mar 24 12:18:23 202
 epsilon: 5e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-mliap_snap_chem.yaml b/unittest/force-styles/tests/manybody-pair-mliap_snap_chem.yaml
index 99f36d8c65..08ebfed3b3 100644
--- a/unittest/force-styles/tests/manybody-pair-mliap_snap_chem.yaml
+++ b/unittest/force-styles/tests/manybody-pair-mliap_snap_chem.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:16 2021
 epsilon: 5e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-mliap_snap_linear.yaml b/unittest/force-styles/tests/manybody-pair-mliap_snap_linear.yaml
index 3650405db1..43d7660823 100644
--- a/unittest/force-styles/tests/manybody-pair-mliap_snap_linear.yaml
+++ b/unittest/force-styles/tests/manybody-pair-mliap_snap_linear.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:15 2021
 epsilon: 5e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-mliap_snap_quadratic.yaml b/unittest/force-styles/tests/manybody-pair-mliap_snap_quadratic.yaml
index 475fc1caac..b07b4fa5b5 100644
--- a/unittest/force-styles/tests/manybody-pair-mliap_snap_quadratic.yaml
+++ b/unittest/force-styles/tests/manybody-pair-mliap_snap_quadratic.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Mar 2021
+tags: slow
 date_generated: Wed Mar 24 12:24:38 202
 epsilon: 5e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-mliap_so3.yaml b/unittest/force-styles/tests/manybody-pair-mliap_so3.yaml
index 86fc0ac545..a4efa2c6e0 100644
--- a/unittest/force-styles/tests/manybody-pair-mliap_so3.yaml
+++ b/unittest/force-styles/tests/manybody-pair-mliap_so3.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 27 May 2021
+tags: slow, unstable
 date_generated: Sun Jun 13 17:44:07 2021
-epsilon: 2.5e-11
+epsilon: 5e-11
 skip_tests:
 prerequisites: ! |
   pair mliap
diff --git a/unittest/force-styles/tests/manybody-pair-nb3b_harmonic.yaml b/unittest/force-styles/tests/manybody-pair-nb3b_harmonic.yaml
index 1eef3f085b..4701c79ad3 100644
--- a/unittest/force-styles/tests/manybody-pair-nb3b_harmonic.yaml
+++ b/unittest/force-styles/tests/manybody-pair-nb3b_harmonic.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:17 2021
-epsilon: 1e-12
+tags: unstable
+epsilon: 2e-12
 prerequisites: ! |
   pair nb3b/harmonic
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-polymorphic_sw.yaml b/unittest/force-styles/tests/manybody-pair-polymorphic_sw.yaml
index cccc15256b..7e1e6acfd4 100644
--- a/unittest/force-styles/tests/manybody-pair-polymorphic_sw.yaml
+++ b/unittest/force-styles/tests/manybody-pair-polymorphic_sw.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:17 2021
-epsilon: 1.5e-13
+tags: unstable
+epsilon: 2.5e-12
 prerequisites: ! |
   pair polymorphic
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-polymorphic_tersoff.yaml b/unittest/force-styles/tests/manybody-pair-polymorphic_tersoff.yaml
index 0b12fd084c..f5706c6dde 100644
--- a/unittest/force-styles/tests/manybody-pair-polymorphic_tersoff.yaml
+++ b/unittest/force-styles/tests/manybody-pair-polymorphic_tersoff.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:09:17 2021
-epsilon: 1e-13
+tags: unstable
+epsilon: 5e-13
 prerequisites: ! |
   pair polymorphic
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-rann.yaml b/unittest/force-styles/tests/manybody-pair-rann.yaml
index a5fb526c83..b93631ce5d 100644
--- a/unittest/force-styles/tests/manybody-pair-rann.yaml
+++ b/unittest/force-styles/tests/manybody-pair-rann.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 27 May 2021
+tags: slow
 date_generated: Sun Jun 20 22:01:27 2021
-epsilon: 1e-13
+epsilon: 2e-13
 prerequisites: ! |
   pair rann
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-rebo.yaml b/unittest/force-styles/tests/manybody-pair-rebo.yaml
index 2e07aad8d6..bf9c43ec81 100644
--- a/unittest/force-styles/tests/manybody-pair-rebo.yaml
+++ b/unittest/force-styles/tests/manybody-pair-rebo.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:18 2021
 epsilon: 1e-07
 skip_tests: intel
diff --git a/unittest/force-styles/tests/manybody-pair-snap.yaml b/unittest/force-styles/tests/manybody-pair-snap.yaml
index 57e0e9d599..378229072e 100644
--- a/unittest/force-styles/tests/manybody-pair-snap.yaml
+++ b/unittest/force-styles/tests/manybody-pair-snap.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:18 2021
 epsilon: 5e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-snap_chem.yaml b/unittest/force-styles/tests/manybody-pair-snap_chem.yaml
index b97709863e..80b19b09f8 100644
--- a/unittest/force-styles/tests/manybody-pair-snap_chem.yaml
+++ b/unittest/force-styles/tests/manybody-pair-snap_chem.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:09:18 2021
 epsilon: 5e-13
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/manybody-pair-sw-mod-maxdelcs.yaml b/unittest/force-styles/tests/manybody-pair-sw-mod-maxdelcs.yaml
new file mode 100644
index 0000000000..02c4afee71
--- /dev/null
+++ b/unittest/force-styles/tests/manybody-pair-sw-mod-maxdelcs.yaml
@@ -0,0 +1,156 @@
+---
+lammps_version: 27 Oct 2021
+date_generated: Fri Dec  3 17:14:23 2021
+epsilon: 2e-08
+skip_tests:
+prerequisites: ! |
+  pair sw/mod
+pre_commands: ! |
+  variable newton_pair delete
+  if "$(is_active(package,gpu)) > 0.0" then "variable newton_pair index off" else "variable newton_pair index on"
+post_commands: ! ""
+input_file: in.manybody
+pair_style: sw/mod maxdelcs 0.20 0.40
+pair_coeff: ! |
+  * * CdTe.sw Cd Cd Cd Cd Te Te Te Te
+extract: ! ""
+natoms: 64
+init_vdwl: -102.90155182016774
+init_coul: 0
+init_stress: ! |-
+  -2.1947417260888678e+01 -2.2105178629076043e+01 -2.2782150007200460e+01 -9.6831443397089743e+00  2.8031085226335019e+01  2.6425790794270330e+00
+init_forces: ! |2
+    1 -8.2322511989895242e-01  2.6315476703135525e+00  1.6442126058355577e+00
+    2 -2.2311488847312719e+00 -8.2456809265006359e-01 -2.2293515519297400e+00
+    3  4.2989240623230440e-01 -4.6052129249824086e-01 -4.8909609892212980e-01
+    4 -1.5530833374407123e+00  2.5748774212987220e+00  1.5034912600724022e+00
+    5 -6.9171825412605936e-01 -1.0531847386600970e+00  9.2147776138588822e-02
+    6  1.5451416942873637e+00  3.1375544318493689e+00  2.4850508416452671e-01
+    7 -9.0063706268170907e-01 -2.7193406920151647e-01  5.1367456614538520e-01
+    8 -6.4440839188902421e-02  3.8701567174252616e-01 -2.4285825715258058e-01
+    9 -5.5617200225703267e-02 -2.1277279901291282e+00 -1.6680511813979533e+00
+   10  2.7595287122399431e-01 -1.3559527412339436e+00 -2.0239561413486844e+00
+   11  2.9100882412227174e+00 -3.1645938128174627e+00  1.7652131399585074e+00
+   12 -5.1610115075594178e+00 -3.0600096528148240e+00 -2.5793792565700739e+00
+   13 -9.9287698044288153e-01  5.1159440453263549e+00 -7.4140454237552722e-02
+   14 -1.3042563529153732e+00  3.2552405905184112e+00 -3.0303315732398806e-01
+   15 -3.0223048479632428e+00  2.8527569963119044e+00 -2.3811658931584292e+00
+   16  1.2225520471067990e+00  8.8148095840949869e-01  4.9430491876220550e+00
+   17  5.0768442926270452e-01  2.6034887477601787e+00 -3.6788718899516661e+00
+   18  8.3718278912359012e-01  1.6046320767230449e-01  3.0550065526090071e+00
+   19 -1.2883762451214118e+00 -2.3849116987845123e+00 -5.8477823649252847e-01
+   20 -5.5132529604846408e+00  4.3662399751356906e-01  8.3739708194443407e-01
+   21  1.9465564103522475e+00  3.3605256895156632e-01 -1.9812639945440429e+00
+   22 -5.3900034257758653e+00  2.7980876817713836e+00 -3.4581879904248547e+00
+   23 -6.9551235863135563e-01  1.7167183106714698e+00  1.8596769847634977e+00
+   24  1.0373451376796643e+00  2.8037482368183038e+00  1.5763756953329264e+00
+   25  8.5682777970106550e-01 -1.7513088261915449e-01  1.4385656519499774e+00
+   26 -6.0687244397992490e-01 -8.5621207324266868e-01  3.7445504803187213e-01
+   27  1.7542395648890141e+00 -9.7450364945012324e-01  1.0100795516082988e+00
+   28  1.4371612196473649e+00  7.8751799333407246e-01  4.0754691654932156e+00
+   29 -1.6851872634862173e+00  3.1860021588877185e-01 -1.9136043995523440e+00
+   30 -1.5120093558295022e-01  1.9897138579775446e+00 -7.7482882434134659e-01
+   31 -7.9238133559867108e-01  1.2389159718859988e+00 -9.7158187701085530e-01
+   32 -2.9467345906797955e+00 -2.2258546046845273e-01 -1.2952141841455944e+00
+   33  4.1055969687703975e+00 -2.1993536054909826e+00  2.3398821758640871e+00
+   34 -5.4666470780943710e-01 -5.1596374513625265e-01  4.4995707525341821e-01
+   35  6.5409470446315543e-01 -4.4929224101256049e-01  2.3033958551093261e+00
+   36  4.4102715770557099e-01 -1.9494304695990754e+00 -2.7711726811313486e-01
+   37 -1.1814185367881915e+00  1.5093409982347743e+00 -2.0145241812388726e+00
+   38  7.9979956273041686e-01 -6.2510014668964398e-01  2.2470258713803903e-01
+   39  9.1480685090248115e-01 -8.6858033858396233e-01 -3.0830126961472613e+00
+   40  2.8593879358814682e+00 -9.8501251752205943e-01 -9.5860078192014586e-01
+   41  2.1485211965273499e-01  5.2449517317149030e-02 -3.7464374392695876e-01
+   42 -3.1798376119681415e+00 -1.4672181113880249e+00 -3.2842910189007968e-01
+   43  5.4671499454984429e-01 -4.1359175442160714e+00 -1.8207740535793332e-01
+   44  2.9266008103038206e+00 -3.6068419804864180e+00  1.2113363596827269e+00
+   45  1.8852873543332602e-01  2.0862322450867481e+00 -2.2746048462286232e+00
+   46  3.1050327756207308e-01 -6.8131822103739004e-01  3.3099599857084856e+00
+   47  1.5495643784665116e+00  4.7838931757231606e-01  1.7237903621133266e+00
+   48 -1.0044710103079812e-01  1.9268843643577713e+00 -7.8383576073068006e-02
+   49 -7.5032605988628509e-01 -4.2197737225953258e+00 -2.3776532103934995e+00
+   50  1.6605480572872189e-01  2.1347903400364161e+00 -1.0644591517660023e+00
+   51 -1.0044546762645692e+00  3.4523503888228477e+00  2.5759868763834457e+00
+   52  3.3060194272186956e+00 -4.1095446389475212e+00  4.4973693213048920e+00
+   53  2.0013241503011043e+00  2.1568842444962932e+00  5.8576907510814702e-01
+   54  2.2795507897178497e+00 -1.6689794713655521e+00 -1.2265506067291636e+00
+   55  1.4414345631526004e+00 -2.3308559065173404e+00 -2.6744299494180348e+00
+   56 -1.2975056485899019e+00  1.2610549989098461e+00 -1.8919017562855065e+00
+   57  4.4635450423847911e-01 -1.8891747786298547e+00 -7.3598594916023358e-02
+   58  1.8732023203115655e+00 -1.2871222078269424e+00  1.1034998112961900e+00
+   59  2.5191023566928030e+00  1.3386335071903233e+00 -1.3910367397923291e+00
+   60 -1.3827430344664839e+00 -4.7488532834440305e+00 -2.6077765026678694e+00
+   61  5.6154005053867850e-01  1.6367914021307663e+00  2.2225403413594347e-01
+   62 -2.7117320012918236e+00  2.0222828099298611e+00 -1.0992589926255703e+00
+   63  1.1817576812178230e+00 -3.4891322995415415e+00  1.2117401144494494e-01
+   64  1.9765285883417301e+00  2.0768686745001230e+00  5.0210256117812220e+00
+run_vdwl: -102.8863066387519
+run_coul: 0
+run_stress: ! |-
+  -2.1976112124103381e+01 -2.2120675059593967e+01 -2.2736028697191660e+01 -9.6838383640863732e+00  2.7960565626594452e+01  2.9565139447975581e+00
+run_forces: ! |2
+    1 -8.2928907389174800e-01  2.6325653168237597e+00  1.6455283675511336e+00
+    2 -2.2332848280699098e+00 -8.5105428623120738e-01 -2.2378966509481462e+00
+    3  4.0369022873323995e-01 -4.5292332026834525e-01 -4.6117125396617076e-01
+    4 -1.5392368366681055e+00  2.5836742219004307e+00  1.4866213787271727e+00
+    5 -7.0473215115198462e-01 -1.0635760940667160e+00  1.0942226301158597e-01
+    6  1.5819649596788963e+00  3.1431003510990916e+00  2.7247970443500191e-01
+    7 -8.8304300352667098e-01 -2.5512778253202795e-01  4.9061185303106492e-01
+    8 -7.5212662515965478e-02  3.8184021196914131e-01 -2.4107282446941858e-01
+    9 -8.1158381339754615e-02 -2.1476254109562842e+00 -1.6738060835548429e+00
+   10  2.6739139374368631e-01 -1.3595489108881318e+00 -1.9877911558077250e+00
+   11  2.8786464650632282e+00 -3.1444207672765661e+00  1.7595851785110754e+00
+   12 -5.1607144125813420e+00 -3.0505635395731572e+00 -2.5663556019991880e+00
+   13 -9.5218249672773658e-01  5.1068327597016960e+00 -5.8594717769482718e-02
+   14 -1.3173839151099331e+00  3.2572743003293345e+00 -3.1725961245082179e-01
+   15 -3.0209968254234791e+00  2.8556138462073033e+00 -2.3721062599278038e+00
+   16  1.2216389778733623e+00  8.8782172928552650e-01  4.9460287363958439e+00
+   17  5.0665401626634665e-01  2.6065123676788002e+00 -3.6572621755786723e+00
+   18  8.0560082005674039e-01  1.7784443143431039e-01  3.0708425303505242e+00
+   19 -1.3130648174442099e+00 -2.4049562879242599e+00 -5.9798277275158895e-01
+   20 -5.5003890970086431e+00  4.2569363932569165e-01  8.2096048756974138e-01
+   21  1.9583117745178971e+00  3.1676077641277089e-01 -2.0010367607187307e+00
+   22 -5.3782300500239106e+00  2.7954251058532851e+00 -3.4849823165113722e+00
+   23 -6.9051290513228092e-01  1.7324097820905555e+00  1.8656696660166405e+00
+   24  1.0579198267435741e+00  2.7853782129568909e+00  1.5603367091482048e+00
+   25  8.5874832624451036e-01 -1.7002486393146604e-01  1.4197740747130343e+00
+   26 -6.1775651781646257e-01 -8.4257119820718573e-01  3.7388380056589404e-01
+   27  1.7623463963448633e+00 -9.9354238242365334e-01  1.0110685126053154e+00
+   28  1.4193733470738314e+00  8.1015999185641474e-01  4.0821004813182125e+00
+   29 -1.6859151458223569e+00  3.1541062526510977e-01 -1.9250490296611780e+00
+   30 -1.3973422008752046e-01  1.9763237734759858e+00 -7.7445674509188089e-01
+   31 -7.7767063241113066e-01  1.2364953391884987e+00 -9.6957454920708874e-01
+   32 -2.9270733416295300e+00 -2.4270762795162515e-01 -1.3071450818376427e+00
+   33  4.0910389624152463e+00 -2.1992798649239882e+00  2.3441501919110235e+00
+   34 -5.5252855074094653e-01 -5.1754953667376968e-01  4.5476750785742548e-01
+   35  6.5679903128682637e-01 -4.3384313669203550e-01  2.3020978443930540e+00
+   36  4.1071528470453234e-01 -1.9426423465248290e+00 -3.0227836648613948e-01
+   37 -1.1800352185266187e+00  1.5115663414777891e+00 -2.0145348898879654e+00
+   38  8.0863765118629871e-01 -6.2601029460599156e-01  2.2604437645930608e-01
+   39  9.0595391391512536e-01 -8.6453019059818215e-01 -3.0825022954203249e+00
+   40  2.8910511690578957e+00 -9.9304764149802571e-01 -9.7028813321576568e-01
+   41  2.3576231848917817e-01  8.3058105340779065e-02 -3.5051100914776356e-01
+   42 -3.1800149002647635e+00 -1.4801218321079967e+00 -3.4939613462705732e-01
+   43  5.3438430014286320e-01 -4.1254150427656411e+00 -1.9277595501740344e-01
+   44  2.9343930193092742e+00 -3.6039709656946148e+00  1.2044564920566443e+00
+   45  1.6277597014744849e-01  2.0604496639613474e+00 -2.2638998549421463e+00
+   46  2.9983573330855845e-01 -6.7843418740863404e-01  3.3353145685585344e+00
+   47  1.5634801627108743e+00  4.8214593664222316e-01  1.7100505686438834e+00
+   48 -8.3906105082361959e-02  1.9304713818450885e+00 -8.3812015887967162e-02
+   49 -7.8979636683125953e-01 -4.2585478453255936e+00 -2.4144301914148301e+00
+   50  1.7300163937082269e-01  2.1296234182507194e+00 -1.0768825899372374e+00
+   51 -1.0406604933510912e+00  3.4818747345114609e+00  2.5791884839408641e+00
+   52  3.3194817589169796e+00 -4.1027033529151336e+00  4.5046480939694513e+00
+   53  2.0294540274748609e+00  2.1853637373905008e+00  6.4245186153892442e-01
+   54  2.2771404377390034e+00 -1.6664001034755751e+00 -1.2118966400733897e+00
+   55  1.4338268965133238e+00 -2.3320008553509282e+00 -2.6364737369251454e+00
+   56 -1.2965342194582345e+00  1.2428509252727042e+00 -1.8883258466962363e+00
+   57  4.3952750262408874e-01 -1.9156058347182991e+00 -7.8543802781213368e-02
+   58  1.8765385567796204e+00 -1.2672717375405131e+00  1.0828240486180201e+00
+   59  2.5418882730334671e+00  1.3543932883696401e+00 -1.3969143290366981e+00
+   60 -1.4187287649587188e+00 -4.7767586078518711e+00 -2.6433630359859794e+00
+   61  5.7641865518569202e-01  1.6484372040093276e+00  2.3389519515043916e-01
+   62 -2.7071779218109575e+00  2.0159966095690671e+00 -1.0872338761779961e+00
+   63  1.1764921309976299e+00 -3.5078075687694996e+00  9.5738583931291754e-02
+   64  2.0160799277578452e+00  2.1172152881764932e+00  5.0470647349337190e+00
+...
diff --git a/unittest/force-styles/tests/manybody-pair-sw-mod-multi.yaml b/unittest/force-styles/tests/manybody-pair-sw-mod-multi.yaml
new file mode 100644
index 0000000000..206140428e
--- /dev/null
+++ b/unittest/force-styles/tests/manybody-pair-sw-mod-multi.yaml
@@ -0,0 +1,156 @@
+---
+lammps_version: 27 Oct 2021
+date_generated: Fri Dec  3 17:00:11 2021
+epsilon: 2e-08
+skip_tests:
+prerequisites: ! |
+  pair sw/mod
+pre_commands: ! |
+  variable newton_pair delete
+  if "$(is_active(package,gpu)) > 0.0" then "variable newton_pair index off" else "variable newton_pair index on"
+post_commands: ! ""
+input_file: in.manybody
+pair_style: sw/mod
+pair_coeff: ! |
+  * * CdTe.sw Cd Cd Cd Cd Te Te Te Te
+extract: ! ""
+natoms: 64
+init_vdwl: -102.37263341910496
+init_coul: 0
+init_stress: ! |-
+  -1.9299390308487332e+01 -1.8776425549919630e+01 -1.9374721168707158e+01 -9.9622288974183402e+00  2.8070027242773669e+01  2.3879337940188838e+00
+init_forces: ! |2
+    1 -8.3352103982839498e-01  2.6013559482542927e+00  1.6274474572425128e+00
+    2 -2.2306493745023439e+00 -8.0714318418132680e-01 -2.1923062020153332e+00
+    3  4.2806028767962845e-01 -4.3590991678773050e-01 -5.0075218062879157e-01
+    4 -1.5656435682231125e+00  2.5915183820303294e+00  1.5040666120009878e+00
+    5 -6.9060643334495364e-01 -1.0647727077215563e+00  1.0180213728505660e-01
+    6  1.5642465056067236e+00  3.1396473681405714e+00  2.3641961557945523e-01
+    7 -8.6200824722768510e-01 -3.1293943422778392e-01  5.8408431260171700e-01
+    8 -4.0266618929934772e-02  3.9766986295109796e-01 -2.5739252841289817e-01
+    9 -1.0121360928146172e-01 -2.1107078034640070e+00 -1.6464009666060164e+00
+   10  2.0923890660303709e-01 -1.3484233420079106e+00 -2.0722655526356872e+00
+   11  2.8965107909852348e+00 -3.1744989269831745e+00  1.7652663534808954e+00
+   12 -5.1795645231077438e+00 -3.1197777091864247e+00 -2.5665483054724385e+00
+   13 -9.7590684648521742e-01  5.1280259109893711e+00 -5.5091662114447909e-02
+   14 -1.3011518617770772e+00  3.2310626778618188e+00 -2.7215049371394051e-01
+   15 -3.0223115829476974e+00  2.8683423075628229e+00 -2.3758062762116352e+00
+   16  1.2188823395097397e+00  8.8545767129801856e-01  4.9357502869058907e+00
+   17  5.1082190544309336e-01  2.5961893628174249e+00 -3.6899394789612989e+00
+   18  8.0983992795309412e-01  1.9375641754963246e-01  3.0453558733140791e+00
+   19 -1.2881034138735021e+00 -2.3462670937269570e+00 -5.7655943278877531e-01
+   20 -5.5246134474571544e+00  4.3796064198379042e-01  7.8520754182924590e-01
+   21  1.9857375817468936e+00  3.5747034616111240e-01 -1.9976772359481998e+00
+   22 -5.3994091163314843e+00  2.7697124407401645e+00 -3.4667596880338745e+00
+   23 -6.7698775959152468e-01  1.7121888767603168e+00  1.8605652911619057e+00
+   24  1.0074715305644579e+00  2.7727617131893907e+00  1.6044473510176775e+00
+   25  8.2109744647411587e-01 -1.5951683209098438e-01  1.4578611299506363e+00
+   26 -6.0985382802125121e-01 -8.4244274451098200e-01  3.8129622162861987e-01
+   27  1.7520729823657701e+00 -9.7201192812768278e-01  1.0169775017077374e+00
+   28  1.4567082595385197e+00  7.8409740218617241e-01  4.0820526290440693e+00
+   29 -1.7032244062520407e+00  3.7587845180749746e-01 -1.8870964181160801e+00
+   30 -1.3206923648228786e-01  1.9916921471681335e+00 -7.8266372777153881e-01
+   31 -7.8806468309443956e-01  1.2340219176128437e+00 -9.6478187359875445e-01
+   32 -2.9476314789446243e+00 -2.3142140109749620e-01 -1.2913087888283064e+00
+   33  4.0628274986014370e+00 -2.1640961259319633e+00  2.3152290236418862e+00
+   34 -5.0178399673773300e-01 -5.0822695167360177e-01  4.4034006759355204e-01
+   35  6.4819559167223217e-01 -4.2103249402438037e-01  2.2856297398167160e+00
+   36  4.2421336892938027e-01 -1.9510682132362911e+00 -2.6576967318516032e-01
+   37 -1.1978825491980216e+00  1.4979581672416948e+00 -2.0147839781766854e+00
+   38  7.9840064761631246e-01 -6.3065633736217053e-01  2.2172788022363643e-01
+   39  9.5635170423204818e-01 -8.6585579126154955e-01 -3.1052170761751845e+00
+   40  2.8672098888058200e+00 -9.9651196656937779e-01 -9.5239701218087292e-01
+   41  2.4852080673692367e-01  6.1726598770146912e-02 -3.7738320667014214e-01
+   42 -3.2218924503111612e+00 -1.4679079242785487e+00 -3.2494955620470956e-01
+   43  5.5403553802134953e-01 -4.1887392096585874e+00 -2.0482127921175985e-01
+   44  2.9555129785833429e+00 -3.6105937273033035e+00  1.2015248443434716e+00
+   45  1.7770090288772786e-01  2.0291936769513712e+00 -2.2821357561478952e+00
+   46  3.0098477582343552e-01 -6.8109426553535413e-01  3.3143127536166697e+00
+   47  1.5621226433495043e+00  4.7692718861141892e-01  1.7210568305854430e+00
+   48 -9.4768998301996810e-02  1.9291106542258933e+00 -7.9898001163940746e-02
+   49 -7.5069898580068450e-01 -4.2258133032514049e+00 -2.3812996182822603e+00
+   50  1.4134224367113424e-01  2.1242374172137017e+00 -1.0801323306863431e+00
+   51 -1.0169916443243892e+00  3.4297591336369639e+00  2.6194829051700457e+00
+   52  3.2885617409945218e+00 -4.1236212282526887e+00  4.4781707305337832e+00
+   53  2.0148028372670694e+00  2.1448816982248586e+00  5.9971419207432852e-01
+   54  2.2814655737302960e+00 -1.7087786381643948e+00 -1.2499084070436326e+00
+   55  1.4210474600591452e+00 -2.3307737663036989e+00 -2.6810090164064904e+00
+   56 -1.2621087881539996e+00  1.2649871800389936e+00 -1.8926645655164065e+00
+   57  4.5797130534627017e-01 -1.8664838112748443e+00 -7.9083437859934921e-02
+   58  1.8947653213983979e+00 -1.2696890878445555e+00  1.1075578262554551e+00
+   59  2.5504370125908418e+00  1.3745174666515754e+00 -1.4107516734366936e+00
+   60 -1.3722628021690451e+00 -4.7890295934897642e+00 -2.6044795996416212e+00
+   61  5.5504118471845243e-01  1.6638732597224004e+00  1.9102194016740037e-01
+   62 -2.7251879924223230e+00  2.0619085257720378e+00 -1.0935823576664374e+00
+   63  1.2081746380050724e+00 -3.4948111236780761e+00  1.2168350620299762e-01
+   64  1.9860051556122582e+00  2.0927257690827177e+00  5.0697148025383179e+00
+run_vdwl: -102.35973334868544
+run_coul: 0
+run_stress: ! |-
+  -1.9328658030947810e+01 -1.8780379209045268e+01 -1.9346617077044971e+01 -9.9749666938435588e+00  2.8008437891372573e+01  2.7085184376387343e+00
+run_forces: ! |2
+    1 -8.3973414156161930e-01  2.6024806716939164e+00  1.6294489541587445e+00
+    2 -2.2332320674802562e+00 -8.3325867561653455e-01 -2.2006491222340316e+00
+    3  4.0201044172486750e-01 -4.2917790666562572e-01 -4.6948321345343524e-01
+    4 -1.5508368285261567e+00  2.6001283545145055e+00  1.4866238736909396e+00
+    5 -7.0283057803506777e-01 -1.0755813727224737e+00  1.1966810285587803e-01
+    6  1.6014945801277216e+00  3.1449315086117622e+00  2.6049037731194224e-01
+    7 -8.4515391346172386e-01 -2.9643376365236135e-01  5.5592808043879749e-01
+    8 -5.2718099427328857e-02  3.9189728118504363e-01 -2.5511789246187055e-01
+    9 -1.2645445343566819e-01 -2.1304729991357707e+00 -1.6522394141178478e+00
+   10  2.0170941449333582e-01 -1.3518767924960227e+00 -2.0322937623603781e+00
+   11  2.8651919448969720e+00 -3.1544674167965900e+00  1.7604480863567769e+00
+   12 -5.1795367535891712e+00 -3.1098019791862668e+00 -2.5538057436240353e+00
+   13 -9.3449291860999784e-01  5.1182059829377859e+00 -4.0298131228021977e-02
+   14 -1.3138769883562655e+00  3.2334706395375243e+00 -2.8705931823998737e-01
+   15 -3.0202933407782986e+00  2.8717678204148385e+00 -2.3669275791353264e+00
+   16  1.2183776352258253e+00  8.9127075127665278e-01  4.9384997662616144e+00
+   17  5.1004773227753764e-01  2.5996399633357181e+00 -3.6689206261060656e+00
+   18  7.7809167896759790e-01  2.1207144945670109e-01  3.0609488662886752e+00
+   19 -1.3130275117681316e+00 -2.3667134933940694e+00 -5.8981658972312290e-01
+   20 -5.5117978458785322e+00  4.2744430925502919e-01  7.6917505422068522e-01
+   21  1.9975555296768055e+00  3.3703612298365648e-01 -2.0175086154709017e+00
+   22 -5.3878913771259942e+00  2.7669546071116780e+00 -3.4936856927711784e+00
+   23 -6.7183695072627359e-01  1.7273496394998133e+00  1.8671408697513008e+00
+   24  1.0274178552169215e+00  2.7558227820998842e+00  1.5882174839506549e+00
+   25  8.2283419787093992e-01 -1.5447852836097220e-01  1.4391797038202698e+00
+   26 -6.2156011410459711e-01 -8.2796420305524143e-01  3.8090402705185117e-01
+   27  1.7601339367218041e+00 -9.9125729448376576e-01  1.0180273847746450e+00
+   28  1.4388092815122282e+00  8.0674679694088813e-01  4.0876648999036522e+00
+   29 -1.7046413611169287e+00  3.7321569413737288e-01 -1.8989219949625276e+00
+   30 -1.2074990763593707e-01  1.9792500368063586e+00 -7.8166413542349944e-01
+   31 -7.7332943322336511e-01  1.2315391616416569e+00 -9.6239548238557426e-01
+   32 -2.9277390618142722e+00 -2.5192574067961049e-01 -1.3037200813587058e+00
+   33  4.0477554035224355e+00 -2.1635774257779952e+00  2.3187668986584278e+00
+   34 -5.0676702793740924e-01 -5.1042867431712269e-01  4.4519935389270182e-01
+   35  6.5147883642558269e-01 -4.0559353423839967e-01  2.2844315670896309e+00
+   36  3.9390278045807725e-01 -1.9447574184680256e+00 -2.9115859619763695e-01
+   37 -1.1969835780313667e+00  1.5002434334384103e+00 -2.0143490185568584e+00
+   38  8.0676971272455089e-01 -6.3075876472442627e-01  2.2352527291923852e-01
+   39  9.4831126179927472e-01 -8.6162603556527295e-01 -3.1064360157753512e+00
+   40  2.8987316294754262e+00 -1.0047866832940338e+00 -9.6394435733480410e-01
+   41  2.6960650365945771e-01  9.2011029766347563e-02 -3.5329070420464370e-01
+   42 -3.2202934703067982e+00 -1.4801433334214229e+00 -3.4640704444773640e-01
+   43  5.4054911532485972e-01 -4.1773027513600249e+00 -2.1447817794737201e-01
+   44  2.9630449457388348e+00 -3.6072732024558061e+00  1.1944777942823346e+00
+   45  1.5144749481598813e-01  2.0033822312972123e+00 -2.2719544688472020e+00
+   46  2.9049655713441491e-01 -6.7867224645607704e-01  3.3387072833091955e+00
+   47  1.5754348928781643e+00  4.8101158180913595e-01  1.7080091216663373e+00
+   48 -7.7888189794370732e-02  1.9324741091956470e+00 -8.5274908698085294e-02
+   49 -7.9019684006595448e-01 -4.2648229814895942e+00 -2.4185384574896021e+00
+   50  1.4845949202981235e-01  2.1194430145066492e+00 -1.0924025593259503e+00
+   51 -1.0531897227887335e+00  3.4601296405944799e+00  2.6236602237569762e+00
+   52  3.3021855015483581e+00 -4.1172000553476273e+00  4.4848689387827374e+00
+   53  2.0437376661991382e+00  2.1726639093433144e+00  6.5674814327221342e-01
+   54  2.2800687253756191e+00 -1.7058539676012388e+00 -1.2353352851426578e+00
+   55  1.4138080148068077e+00 -2.3311510292486584e+00 -2.6435843933533452e+00
+   56 -1.2611495893710638e+00  1.2465439472263604e+00 -1.8894879732201839e+00
+   57  4.5173711499993979e-01 -1.8928474410809497e+00 -8.2753356735880912e-02
+   58  1.8971660771741441e+00 -1.2504810390672687e+00  1.0864490717872513e+00
+   59  2.5718616964408740e+00  1.3890721660300731e+00 -1.4168830433218909e+00
+   60 -1.4083227015617696e+00 -4.8165606065577791e+00 -2.6397173641387988e+00
+   61  5.6959342807188396e-01  1.6753227696902677e+00  2.0308480954757957e-01
+   62 -2.7207823332499634e+00  2.0545577417761192e+00 -1.0816801951719224e+00
+   63  1.2029071251954688e+00 -3.5148585644551131e+00  9.6902669182167084e-02
+   64  2.0245788952513477e+00  2.1340267730573288e+00  5.0949866359832177e+00
+...
diff --git a/unittest/force-styles/tests/manybody-pair-tersoff_table.yaml b/unittest/force-styles/tests/manybody-pair-tersoff_table.yaml
index 036acc25d1..06b27e874f 100644
--- a/unittest/force-styles/tests/manybody-pair-tersoff_table.yaml
+++ b/unittest/force-styles/tests/manybody-pair-tersoff_table.yaml
@@ -1,7 +1,8 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow, unstable
 date_generated: Fri Feb 26 23:09:20 2021
-epsilon: 5e-11
+epsilon: 9e-10
 prerequisites: ! |
   pair tersoff/table
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/mol-pair-lj_charmm_coul_long_soft.yaml b/unittest/force-styles/tests/mol-pair-lj_charmm_coul_long_soft.yaml
index 955d36491b..ddc944f84b 100644
--- a/unittest/force-styles/tests/mol-pair-lj_charmm_coul_long_soft.yaml
+++ b/unittest/force-styles/tests/mol-pair-lj_charmm_coul_long_soft.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:08:45 2021
-epsilon: 2.5e-12
+epsilon: 5e-12
 prerequisites: ! |
   atom full
   pair lj/charmm/coul/long/soft
diff --git a/unittest/force-styles/tests/mol-pair-lj_charmm_coul_msm.yaml b/unittest/force-styles/tests/mol-pair-lj_charmm_coul_msm.yaml
index 04670511d0..c374ba6631 100644
--- a/unittest/force-styles/tests/mol-pair-lj_charmm_coul_msm.yaml
+++ b/unittest/force-styles/tests/mol-pair-lj_charmm_coul_msm.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:08:46 2021
-epsilon: 5e-14
+epsilon: 6e-14
 prerequisites: ! |
   atom full
   pair lj/charmm/coul/msm
diff --git a/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml b/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml
index db9f33fa87..70f45edc6f 100644
--- a/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml
+++ b/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:08:48 2021
-epsilon: 2.5e-12
+epsilon: 5e-12
 prerequisites: ! |
   atom full
   pair lj/cut/coul/long/soft
diff --git a/unittest/force-styles/tests/mol-pair-lj_long_tip4p_long.yaml b/unittest/force-styles/tests/mol-pair-lj_long_tip4p_long.yaml
index 25ac675244..50a03799be 100644
--- a/unittest/force-styles/tests/mol-pair-lj_long_tip4p_long.yaml
+++ b/unittest/force-styles/tests/mol-pair-lj_long_tip4p_long.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:08:51 2021
 epsilon: 2.5e-09
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/mol-pair-lj_table_coul_table.yaml b/unittest/force-styles/tests/mol-pair-lj_table_coul_table.yaml
index c02f9a31dd..5d919a85e4 100644
--- a/unittest/force-styles/tests/mol-pair-lj_table_coul_table.yaml
+++ b/unittest/force-styles/tests/mol-pair-lj_table_coul_table.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:08:53 2021
 epsilon: 2.5e-09
 skip_tests: gpu
diff --git a/unittest/force-styles/tests/mol-pair-lj_table_tip4p_long.yaml b/unittest/force-styles/tests/mol-pair-lj_table_tip4p_long.yaml
index 7eab70a542..a1bd0e412b 100644
--- a/unittest/force-styles/tests/mol-pair-lj_table_tip4p_long.yaml
+++ b/unittest/force-styles/tests/mol-pair-lj_table_tip4p_long.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:08:54 2021
 epsilon: 2.5e-09
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/mol-pair-lj_table_tip4p_table.yaml b/unittest/force-styles/tests/mol-pair-lj_table_tip4p_table.yaml
index 8274c80cdc..1630d752df 100644
--- a/unittest/force-styles/tests/mol-pair-lj_table_tip4p_table.yaml
+++ b/unittest/force-styles/tests/mol-pair-lj_table_tip4p_table.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:08:54 2021
 epsilon: 2.5e-09
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/mol-pair-nm_cut_split.yaml b/unittest/force-styles/tests/mol-pair-nm_cut_split.yaml
new file mode 100644
index 0000000000..26fab2b1da
--- /dev/null
+++ b/unittest/force-styles/tests/mol-pair-nm_cut_split.yaml
@@ -0,0 +1,104 @@
+---
+lammps_version: 27 Oct 2021
+date_generated: Fri Dec  3 14:19:54 2021
+epsilon: 5e-13
+skip_tests:
+prerequisites: ! |
+  atom full
+  pair nm/cut/split
+pre_commands: ! ""
+post_commands: ! |
+  pair_modify mix arithmetic
+input_file: in.fourmol
+pair_style: nm/cut/split 8.0
+pair_coeff: ! |
+  1 1 0.02 2.5 12.0 6.0
+  1 2 0.01 1.75 9.0 6.0
+  1 3 0.02 2.85 12.0 6.0 8.0
+  1 4 0.0173205 2.8 12.0 6.0
+  1 5 0.0173205 2.8 9.0 6.0 7.0
+  2 2 0.005 1 10.0 8.0
+  2 3 0.01 2.1 12.0 6.0
+  2 4 0.005 0.5 12.0 6.0
+  2 5 0.00866025 2.05 12.0 6.0
+  3 3 0.02 3.2 12.0 6.0
+  3 4 0.0173205 3.15 12.0 6.0
+  3 5 0.0173205 3.15 12.0 6.0
+  4 4 0.015 3.1 12.0 6.0
+  4 5 0.015 3.1 12.0 6.0
+  5 5 0.015 3.1 12.0 6.0
+extract: ! |
+  e0 2
+  r0 2
+  nn 2
+  mm 2
+natoms: 29
+init_vdwl: 184.46055463786212
+init_coul: 0
+init_stress: ! |2-
+   5.3838190519561294e+02  5.4370888866042401e+02  1.1549423729943153e+03 -1.8851357922205918e+02  4.4535538593228559e+00  1.6572955388078447e+02
+init_forces: ! |2
+    1 -5.8010753095576453e+00  6.7019475304167301e+01  8.2636805547527302e+01
+    2  3.9312055883575447e+01  3.2350311908851069e+01 -4.6269491919818336e+01
+    3 -3.5305111899706318e+01 -9.6504324302794672e+01 -3.5088931543566190e+01
+    4 -5.8409040695096015e-01  1.5448113553260867e-01 -4.1551985279380638e-01
+    5 -2.0046651942407759e-01 -3.4223055809276309e-01  8.7871234802730946e-01
+    6 -2.0721364706498824e+02  2.4016237943042879e+02  2.8700056551899644e+02
+    7  1.4604553825101560e+01 -8.3802360854506233e+01 -4.2794566606285696e+02
+    8  3.6138770853796672e+01 -2.7258004708576987e+01  9.9538777711653978e+01
+    9  1.9677324629694770e+01  2.1192619795334007e+01  8.7076054788266745e+01
+   10  1.3187239009715952e+02 -1.5291386183130950e+02 -4.7328679849884615e+01
+   11 -1.6817605988576226e-01 -4.2337076133540452e-01 -6.9379475746613284e-01
+   12  4.9439004425924278e+00  3.5972141803133999e+00 -2.9063624045380081e+00
+   13  6.0847461884614340e-01 -2.4087582512536920e-01 -1.1429443490171888e-02
+   14 -2.5135966919090386e-01  5.0693343437559095e-02 -6.4882033213846146e-01
+   15 -1.7492135130621179e-02  6.3261129633870505e-01  2.2568696677395533e-01
+   16  1.1533643706799870e+02 -8.2319682718269064e+01 -2.9635546568323309e+02
+   17 -1.1295230473793292e+02  7.8645099709581359e+01  3.0030745767603463e+02
+   18 -2.2515034468785730e-03 -4.3333792383380811e-03  9.9435889422212678e-03
+   19 -3.8285183184657459e-03 -2.3670865665501185e-03 -5.1320932088365729e-03
+   20  5.9936054839043167e-03  6.4927254507248788e-03 -4.5957636846263574e-03
+   21 -1.7771101916350986e+01 -2.0266633257241402e+01  5.6070531525240284e+01
+   22 -2.6828196427086496e+01 -6.5013719610173153e+00 -4.2091321161763851e+01
+   23  4.4599242317851683e+01  2.6768079222089987e+01 -1.3979148411868403e+01
+   24  9.0731337276239099e+00 -5.2584048025594143e+01  2.7844998894928324e+01
+   25 -3.6857511179337180e+01  5.9339663466847563e+00 -3.0985670760400872e+01
+   26  2.7784301129627654e+01  4.6650009976909971e+01  3.1405189423721294e+00
+   27  1.2878799997680169e+01 -5.6365384292184224e+01  2.2564079535768347e+01
+   28 -4.4799160574539883e+01  1.9253173801592030e+01 -3.0311698334899067e+01
+   29  3.1920395724814831e+01  3.7112241385139740e+01  7.7475953310800270e+00
+run_vdwl: 182.54322998463735
+run_coul: 0
+run_stress: ! |2-
+   5.3683122198288640e+02  5.4178378802457769e+02  1.1353333493010725e+03 -1.8652393510886540e+02  5.7315943542766883e+00  1.6367637460475797e+02
+run_forces: ! |2
+    1 -5.4985800660379427e+00  6.6860137696531524e+01  8.1930199058115377e+01
+    2  3.8943763339079162e+01  3.2101186066798043e+01 -4.5685663412455682e+01
+    3 -3.5268884048814805e+01 -9.6070477131159592e+01 -3.4950962102053076e+01
+    4 -5.8110788445779737e-01  1.5332747558659973e-01 -4.1441556892310077e-01
+    5 -1.9983485643709664e-01 -3.4132412196311207e-01  8.7651795021384826e-01
+    6 -2.0475707877718284e+02  2.3656413731867255e+02  2.7690406065284441e+02
+    7  1.4365054876631929e+01 -8.1840771957943403e+01 -4.1598454486600491e+02
+    8  3.4383083802427983e+01 -2.5716756385026692e+01  9.8544673145606026e+01
+    9  1.9493740607077331e+01  2.0908163042484368e+01  8.6173131559856557e+01
+   10  1.3162712158663410e+02 -1.5256247809513576e+02 -4.7318022665875795e+01
+   11 -1.6663043747916428e-01 -4.1842665602207779e-01 -6.8596525639165273e-01
+   12  4.9386412500607531e+00  3.5930375471155727e+00 -2.9133311189121791e+00
+   13  6.0592613726991740e-01 -2.3863451246412490e-01 -1.1243947616635162e-02
+   14 -2.4881617601430056e-01  4.9256493847659046e-02 -6.4206555723776182e-01
+   15 -1.8864672034612142e-02  6.3400103620870663e-01  2.2749995602389267e-01
+   16  1.1368074742380061e+02 -8.1314893874686348e+01 -2.9210190496613490e+02
+   17 -1.1129809835055249e+02  7.7640690527244871e+01  2.9605193568469781e+02
+   18 -1.8467319685461141e-03 -3.9327680307163632e-03  9.7963964214631150e-03
+   19 -3.9239527854912802e-03 -2.4372204873194495e-03 -5.2255474424068920e-03
+   20  5.6841881117434614e-03  6.1621918750248477e-03 -4.3549955575843610e-03
+   21 -1.7911431237850785e+01 -2.0223922929187104e+01  5.6099852457401404e+01
+   22 -2.6930668338346326e+01 -6.5737840991111369e+00 -4.2110517739316450e+01
+   23  4.4842043520244751e+01  2.6797781095153024e+01 -1.3989272750175145e+01
+   24  9.5726630373688231e+00 -5.3267309313865724e+01  2.8451784044206295e+01
+   25 -3.7722233711284453e+01  5.9931042726955770e+00 -3.1723790363641090e+01
+   26  2.8149494352154065e+01  4.7274133430564625e+01  3.2718534005530699e+00
+   27  1.3025430772458886e+01 -5.6434742994946994e+01  2.2522352857211565e+01
+   28 -4.4933173873492848e+01  1.9289121683036299e+01 -3.0342495179704237e+01
+   29  3.1907778221419438e+01  3.7145652182215706e+01  7.8201188742906407e+00
+...
diff --git a/unittest/force-styles/tests/mol-pair-table.yaml b/unittest/force-styles/tests/mol-pair-table.yaml
index 2d0ff80bd3..fb9e647677 100644
--- a/unittest/force-styles/tests/mol-pair-table.yaml
+++ b/unittest/force-styles/tests/mol-pair-table.yaml
@@ -1,5 +1,6 @@
 ---
 lammps_version: 10 Feb 2021
+tags: slow
 date_generated: Fri Feb 26 23:08:56 2021
 epsilon: 5e-14
 prerequisites: ! |
diff --git a/unittest/formats/CMakeLists.txt b/unittest/formats/CMakeLists.txt
index 93b48ac1b4..be8e055adb 100644
--- a/unittest/formats/CMakeLists.txt
+++ b/unittest/formats/CMakeLists.txt
@@ -1,48 +1,48 @@
 
 add_executable(test_atom_styles test_atom_styles.cpp)
-target_link_libraries(test_atom_styles PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_atom_styles PRIVATE lammps GTest::GMock)
 add_test(NAME AtomStyles COMMAND test_atom_styles WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 add_executable(test_image_flags test_image_flags.cpp)
-target_link_libraries(test_image_flags PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_image_flags PRIVATE lammps GTest::GMock)
 add_test(NAME ImageFlags COMMAND test_image_flags WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 add_executable(test_input_convert test_input_convert.cpp)
-target_link_libraries(test_input_convert PRIVATE lammps GTest::GMockMain GTest::GMock GTest::GTest)
+target_link_libraries(test_input_convert PRIVATE lammps GTest::GMockMain)
 add_test(NAME InputConvert COMMAND test_input_convert WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 add_executable(test_molecule_file test_molecule_file.cpp)
-target_link_libraries(test_molecule_file PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_molecule_file PRIVATE lammps GTest::GMock)
 add_test(NAME MoleculeFile COMMAND test_molecule_file WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 add_executable(test_pair_unit_convert test_pair_unit_convert.cpp)
-target_link_libraries(test_pair_unit_convert PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_pair_unit_convert PRIVATE lammps GTest::GMock)
 add_test(NAME PairUnitConvert COMMAND test_pair_unit_convert WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(PairUnitConvert PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
 
 add_executable(test_potential_file_reader test_potential_file_reader.cpp)
-target_link_libraries(test_potential_file_reader PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_potential_file_reader PRIVATE lammps GTest::GMock)
 add_test(NAME PotentialFileReader COMMAND test_potential_file_reader WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(PotentialFileReader PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
 
 if(PKG_MANYBODY)
     add_executable(test_eim_potential_file_reader test_eim_potential_file_reader.cpp)
-    target_link_libraries(test_eim_potential_file_reader PRIVATE lammps GTest::GMock GTest::GTest)
+    target_link_libraries(test_eim_potential_file_reader PRIVATE lammps GTest::GMock)
     add_test(NAME EIMPotentialFileReader COMMAND test_eim_potential_file_reader WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     set_tests_properties(EIMPotentialFileReader PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
 endif()
 
 add_executable(test_text_file_reader test_text_file_reader.cpp)
-target_link_libraries(test_text_file_reader PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_text_file_reader PRIVATE lammps GTest::GMock)
 add_test(NAME TextFileReader COMMAND test_text_file_reader WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(TextFileReader PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
 
 add_executable(test_file_operations test_file_operations.cpp)
-target_link_libraries(test_file_operations PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_file_operations PRIVATE lammps GTest::GMock)
 add_test(NAME FileOperations COMMAND test_file_operations WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 add_executable(test_dump_atom test_dump_atom.cpp)
-target_link_libraries(test_dump_atom PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_dump_atom PRIVATE lammps GTest::GMock)
 add_test(NAME DumpAtom COMMAND test_dump_atom WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(DumpAtom PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
 
@@ -50,19 +50,19 @@ if(PKG_COMPRESS)
     find_program(GZIP_BINARY NAMES gzip REQUIRED)
 
     add_executable(test_dump_atom_compressed test_dump_atom_compressed.cpp compressed_dump_test_main.cpp)
-    target_link_libraries(test_dump_atom_compressed PRIVATE lammps GTest::GMock GTest::GTest)
+    target_link_libraries(test_dump_atom_compressed PRIVATE lammps GTest::GMock)
 
     add_executable(test_dump_custom_compressed test_dump_custom_compressed.cpp compressed_dump_test_main.cpp)
-    target_link_libraries(test_dump_custom_compressed PRIVATE lammps GTest::GMock GTest::GTest)
+    target_link_libraries(test_dump_custom_compressed PRIVATE lammps GTest::GMock)
 
     add_executable(test_dump_cfg_compressed test_dump_cfg_compressed.cpp compressed_dump_test_main.cpp)
-    target_link_libraries(test_dump_cfg_compressed PRIVATE lammps GTest::GMock GTest::GTest)
+    target_link_libraries(test_dump_cfg_compressed PRIVATE lammps GTest::GMock)
 
     add_executable(test_dump_local_compressed test_dump_local_compressed.cpp compressed_dump_test_main.cpp)
-    target_link_libraries(test_dump_local_compressed PRIVATE lammps GTest::GMock GTest::GTest)
+    target_link_libraries(test_dump_local_compressed PRIVATE lammps GTest::GMock)
 
     add_executable(test_dump_xyz_compressed test_dump_xyz_compressed.cpp compressed_dump_test_main.cpp)
-    target_link_libraries(test_dump_xyz_compressed PRIVATE lammps GTest::GMock GTest::GTest)
+    target_link_libraries(test_dump_xyz_compressed PRIVATE lammps GTest::GMock)
 
     add_test(NAME DumpAtomGZ COMMAND test_dump_atom_compressed gz WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     set_tests_properties(DumpAtomGZ PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};COMPRESS_BINARY=${GZIP_BINARY}")
@@ -102,20 +102,30 @@ if(PKG_COMPRESS)
 endif()
 
 add_executable(test_dump_custom test_dump_custom.cpp)
-target_link_libraries(test_dump_custom PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_dump_custom PRIVATE lammps GTest::GMock)
 add_test(NAME DumpCustom COMMAND test_dump_custom WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(DumpCustom PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
 
 add_executable(test_dump_cfg test_dump_cfg.cpp)
-target_link_libraries(test_dump_cfg PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_dump_cfg PRIVATE lammps GTest::GMock)
 add_test(NAME DumpCfg COMMAND test_dump_cfg WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(DumpCfg PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
 
 add_executable(test_dump_local test_dump_local.cpp)
-target_link_libraries(test_dump_local PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_dump_local PRIVATE lammps GTest::GMock)
 add_test(NAME DumpLocal COMMAND test_dump_local WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(DumpLocal PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
 
+if(PKG_NETCDF)
+  find_program(NCDUMP NAMES ncdump ncdump.exe)
+  add_executable(test_dump_netcdf test_dump_netcdf.cpp)
+  target_link_libraries(test_dump_netcdf PRIVATE lammps GTest::GMock)
+  add_test(NAME DumpNetCDF COMMAND test_dump_netcdf WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  if(NOT (NCDUMP STREQUAL "NCDUMP-NOTFOUND"))
+    set_tests_properties(DumpNetCDF PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR};NCDUMP_BINARY=${NCDUMP}")
+  endif()
+endif()
+
 if(BUILD_TOOLS)
     set_tests_properties(DumpAtom PROPERTIES ENVIRONMENT "BINARY2TXT_BINARY=$<TARGET_FILE:binary2txt>")
     set_tests_properties(DumpCustom PROPERTIES ENVIRONMENT "BINARY2TXT_BINARY=$<TARGET_FILE:binary2txt>")
diff --git a/unittest/formats/test_dump_atom.cpp b/unittest/formats/test_dump_atom.cpp
index bc43c40de2..a73204fb92 100644
--- a/unittest/formats/test_dump_atom.cpp
+++ b/unittest/formats/test_dump_atom.cpp
@@ -74,6 +74,13 @@ public:
         END_HIDE_OUTPUT();
     }
 
+    void close_dump()
+    {
+        BEGIN_HIDE_OUTPUT();
+        command("undump id");
+        END_HIDE_OUTPUT();
+    }
+
     void generate_text_and_binary_dump(std::string text_file, std::string binary_file,
                                        std::string dump_modify_options, int ntimesteps)
     {
@@ -505,6 +512,7 @@ TEST_F(DumpAtomTest, rerun)
     ASSERT_FILE_EXISTS(dump_file);
     ASSERT_EQ(count_lines(dump_file), 82);
     continue_dump(1);
+    close_dump();
     lmp->output->thermo->evaluate_keyword("pe", &pe_2);
     ASSERT_FILE_EXISTS(dump_file);
     ASSERT_EQ(count_lines(dump_file), 123);
@@ -521,6 +529,33 @@ TEST_F(DumpAtomTest, rerun)
     delete_file(dump_file);
 }
 
+TEST_F(DumpAtomTest, rerun_bin)
+{
+    auto dump_file = binary_dump_filename("rerun");
+    HIDE_OUTPUT([&] {
+        command("fix 1 all nve");
+    });
+    generate_dump(dump_file, "", 1);
+    double pe_1, pe_2, pe_rerun;
+    lmp->output->thermo->evaluate_keyword("pe", &pe_1);
+    ASSERT_FILE_EXISTS(dump_file);
+    continue_dump(1);
+    close_dump();
+    lmp->output->thermo->evaluate_keyword("pe", &pe_2);
+    ASSERT_FILE_EXISTS(dump_file);
+    HIDE_OUTPUT([&] {
+        command(fmt::format("rerun {} first 1 last 1 every 1 post no dump x y z", dump_file));
+    });
+    lmp->output->thermo->evaluate_keyword("pe", &pe_rerun);
+    ASSERT_NEAR(pe_1, pe_rerun,1.0e-14);
+    HIDE_OUTPUT([&] {
+        command(fmt::format("rerun {} first 2 last 2 every 1 post yes dump x y z", dump_file));
+    });
+    lmp->output->thermo->evaluate_keyword("pe", &pe_rerun);
+    ASSERT_NEAR(pe_2, pe_rerun,1.0e-14);
+    delete_file(dump_file);
+}
+
 TEST_F(DumpAtomTest, multi_file_run1)
 {
     auto dump_file = dump_filename("run1_*");
diff --git a/unittest/formats/test_dump_custom.cpp b/unittest/formats/test_dump_custom.cpp
index 5d4132108d..434acf462c 100644
--- a/unittest/formats/test_dump_custom.cpp
+++ b/unittest/formats/test_dump_custom.cpp
@@ -73,6 +73,13 @@ public:
         END_HIDE_OUTPUT();
     }
 
+    void close_dump()
+    {
+        BEGIN_HIDE_OUTPUT();
+        command("undump id");
+        END_HIDE_OUTPUT();
+    }
+
     void generate_text_and_binary_dump(std::string text_file, std::string binary_file,
                                        std::string fields, std::string dump_modify_options,
                                        int ntimesteps)
@@ -330,6 +337,7 @@ TEST_F(DumpCustomTest, rerun)
     ASSERT_FILE_EXISTS(dump_file);
     ASSERT_EQ(count_lines(dump_file), 82);
     continue_dump(1);
+    close_dump();
     lmp->output->thermo->evaluate_keyword("pe", &pe_2);
     ASSERT_FILE_EXISTS(dump_file);
     ASSERT_EQ(count_lines(dump_file), 123);
@@ -338,6 +346,7 @@ TEST_F(DumpCustomTest, rerun)
     });
     lmp->output->thermo->evaluate_keyword("pe", &pe_rerun);
     ASSERT_DOUBLE_EQ(pe_1, pe_rerun);
+    
     HIDE_OUTPUT([&] {
         command(fmt::format("rerun {} first 2 last 2 every 1 post yes dump x y z", dump_file));
     });
@@ -346,6 +355,35 @@ TEST_F(DumpCustomTest, rerun)
     delete_file(dump_file);
 }
 
+TEST_F(DumpCustomTest, rerun_bin)
+{
+    auto dump_file = binary_dump_filename("rerun");
+    auto fields    = "id type xs ys zs";
+
+    HIDE_OUTPUT([&] {
+        command("fix 1 all nve");
+    });
+    generate_dump(dump_file, fields, "", 1);
+    double pe_1, pe_2, pe_rerun;
+    lmp->output->thermo->evaluate_keyword("pe", &pe_1);
+    ASSERT_FILE_EXISTS(dump_file);
+    continue_dump(1);
+    close_dump();
+    lmp->output->thermo->evaluate_keyword("pe", &pe_2);
+    ASSERT_FILE_EXISTS(dump_file);
+    HIDE_OUTPUT([&] {
+        command(fmt::format("rerun {} first 1 last 1 every 1 post no dump x y z", dump_file));
+    });
+    lmp->output->thermo->evaluate_keyword("pe", &pe_rerun);
+    ASSERT_NEAR(pe_1, pe_rerun,1.0e-14);
+    HIDE_OUTPUT([&] {
+        command(fmt::format("rerun {} first 2 last 2 every 1 post yes dump x y z", dump_file));
+    });
+    lmp->output->thermo->evaluate_keyword("pe", &pe_rerun);
+    ASSERT_NEAR(pe_2, pe_rerun,1.0e-14);
+    delete_file(dump_file);
+}
+
 int main(int argc, char **argv)
 {
     MPI_Init(&argc, &argv);
diff --git a/unittest/formats/test_dump_netcdf.cpp b/unittest/formats/test_dump_netcdf.cpp
new file mode 100644
index 0000000000..8b4110b352
--- /dev/null
+++ b/unittest/formats/test_dump_netcdf.cpp
@@ -0,0 +1,412 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "../testing/core.h"
+#include "../testing/systems/melt.h"
+#include "../testing/utils.h"
+#include "fmt/format.h"
+#include "library.h"
+#include "output.h"
+#include "thermo.h"
+#include "utils.h"
+#include "version.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include <algorithm>
+#include <string>
+
+using ::testing::Eq;
+
+char *NCDUMP_BINARY = nullptr;
+bool verbose        = false;
+
+class DumpNetCDFTest : public MeltTest {
+    std::string dump_style = "netcdf";
+
+public:
+    void set_style(const std::string &new_style) { dump_style = new_style; }
+
+    void enable_triclinic()
+    {
+        BEGIN_HIDE_OUTPUT();
+        command("change_box all triclinic");
+        END_HIDE_OUTPUT();
+    }
+
+    std::string dump_filename(std::string ident)
+    {
+        return fmt::format("dump_{}_{}.nc", dump_style, ident);
+    }
+
+    void generate_dump(std::string dump_file, std::string fields, std::string dump_modify_options,
+                       int ntimesteps)
+    {
+        BEGIN_HIDE_OUTPUT();
+        command(fmt::format("dump id all {} 1 {} {}", dump_style, dump_file, fields));
+
+        if (!dump_modify_options.empty()) {
+            command(fmt::format("dump_modify id {}", dump_modify_options));
+        }
+
+        command(fmt::format("run {} post no", ntimesteps));
+        END_HIDE_OUTPUT();
+    }
+
+    void continue_dump(int ntimesteps)
+    {
+        BEGIN_HIDE_OUTPUT();
+        command(fmt::format("run {} pre no post no", ntimesteps));
+        END_HIDE_OUTPUT();
+    }
+
+    void close_dump()
+    {
+        BEGIN_HIDE_OUTPUT();
+        command("undump id");
+        END_HIDE_OUTPUT();
+    }
+
+    std::string convert_binary_to_text(std::string binary_file)
+    {
+        BEGIN_HIDE_OUTPUT();
+        std::string cmdline = fmt::format("{0} {1} > {1}.txt", NCDUMP_BINARY, binary_file);
+        system(cmdline.c_str());
+        END_HIDE_OUTPUT();
+        return fmt::format("{}.txt", binary_file);
+    }
+};
+
+TEST_F(DumpNetCDFTest, run0_plain)
+{
+    if (!lammps_has_style(lmp, "dump", "netcdf")) GTEST_SKIP();
+    auto dump_file = dump_filename("run0");
+    auto fields    = "id type proc procp1 mass x y z ix iy iz xu yu zu vx vy vz fx fy fz";
+    set_style("netcdf");
+    generate_dump(dump_file, fields, "", 0);
+
+    ASSERT_FILE_EXISTS(dump_file);
+    if (NCDUMP_BINARY) {
+        auto converted_file = convert_binary_to_text(dump_file);
+        auto lines          = read_lines(converted_file);
+        auto header         = utils::split_words(lines[0]);
+        ASSERT_EQ(lines.size(), 233);
+        ASSERT_THAT(header[0], Eq("netcdf"));
+        ASSERT_THAT(header[1] + ".nc", Eq(dump_file));
+
+        // check dimensions section
+        auto section = std::find(lines.begin(), lines.end(), "dimensions:");
+        for (auto line = ++section; line < lines.end(); ++line) {
+            auto words = utils::split_words(*line);
+            if ((words.size() < 1) || (words[0] == "variables:")) break;
+            if (words[0] == "atom") ASSERT_THAT(words[2], Eq("32"));
+            if (words[0] == "label") ASSERT_THAT(words[2], Eq("10"));
+            if (words[0] == "Voigt") ASSERT_THAT(words[2], Eq("6"));
+            if (words[0] == "spatial") ASSERT_THAT(words[2], Eq("3"));
+        }
+
+        // check variables section
+        section = std::find(lines.begin(), lines.end(), "variables:");
+        for (auto line = ++section; line < lines.end(); ++line) {
+            auto words = utils::split_words(*line);
+            if ((words.size() < 2) || (words[0] == "data:")) break;
+            if (words[0] == "time:units") ASSERT_THAT(words[2], Eq("lj"));
+            if (words[0] == "time:scale_factor") ASSERT_THAT(words[2], Eq("0.005f"));
+            if (words[0] == "cell_origin:units") ASSERT_THAT(words[2], Eq("lj"));
+            if (words[0] == "cell_angles:units") ASSERT_THAT(words[2], Eq("degree"));
+            if (words[1] == "id(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "type(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "proc(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "procp1(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "mass(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "ix(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "iy(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "iz(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[0] == ":Conventions") ASSERT_THAT(words[2], Eq("AMBER"));
+            if (words[0] == ":ConventionVersion") ASSERT_THAT(words[2], Eq("1.0"));
+            if (words[0] == ":program") ASSERT_THAT(words[2], Eq("LAMMPS"));
+            if (words[0] == ":programVersion") ASSERT_THAT(words[2], Eq(LAMMPS_VERSION));
+        }
+
+        // check data section
+        section = std::find(lines.begin(), lines.end(), "data:");
+        for (auto line = ++section; line < lines.end(); ++line) {
+            auto words = utils::split_words(*line);
+            if (words.size() > 0) {
+                if (words[0] == "spatial") ASSERT_THAT(words[2], Eq("xyz"));
+                if (words[0] == "cell_spatial") ASSERT_THAT(words[2], Eq("abc"));
+                if (words[0] == "cell_origin") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0,"));
+                    ASSERT_THAT(words[1], Eq("0,"));
+                    ASSERT_THAT(words[2], Eq("0"));
+                }
+                if (words[0] == "cell_lengths") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("3.359192,"));
+                    ASSERT_THAT(words[1], Eq("3.359192,"));
+                    ASSERT_THAT(words[2], Eq("3.359192"));
+                }
+                if (words[0] == "cell_angles") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("90,"));
+                    ASSERT_THAT(words[1], Eq("90,"));
+                    ASSERT_THAT(words[2], Eq("90"));
+                }
+                if (words[0] == "id") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("1,"));
+                    ASSERT_THAT(words[1], Eq("2,"));
+                    ASSERT_THAT(words[2], Eq("3,"));
+                    ASSERT_THAT(words[3], Eq("4,"));
+                    ASSERT_THAT(words[4], Eq("5,"));
+                    ASSERT_THAT(words[5], Eq("6,"));
+                    ASSERT_THAT(words[6], Eq("7,"));
+                    ASSERT_THAT(words[7], Eq("8,"));
+                    ASSERT_THAT(words[8], Eq("9,"));
+                    ASSERT_THAT(words[9], Eq("10,"));
+                }
+                if (words[0] == "mass") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("1,"));
+                    ASSERT_THAT(words[1], Eq("1,"));
+                    ASSERT_THAT(words[2], Eq("1,"));
+                    ASSERT_THAT(words[3], Eq("1,"));
+                    ASSERT_THAT(words[4], Eq("1,"));
+                    ASSERT_THAT(words[5], Eq("1,"));
+                    ASSERT_THAT(words[6], Eq("1,"));
+                    ASSERT_THAT(words[7], Eq("1,"));
+                    ASSERT_THAT(words[8], Eq("1,"));
+                    ASSERT_THAT(words[9], Eq("1,"));
+                }
+                if (words[0] == "coordinates") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0,"));
+                    ASSERT_THAT(words[1], Eq("0,"));
+                    ASSERT_THAT(words[2], Eq("0,"));
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0.8397981,"));
+                    ASSERT_THAT(words[1], Eq("0.8397981,"));
+                    ASSERT_THAT(words[2], Eq("0,"));
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0.8397981,"));
+                    ASSERT_THAT(words[1], Eq("0,"));
+                    ASSERT_THAT(words[2], Eq("0.8397981,"));
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0,"));
+                    ASSERT_THAT(words[1], Eq("0.8397981,"));
+                    ASSERT_THAT(words[2], Eq("0.8397981,"));
+                }
+                if (words[0] == "ix") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0,"));
+                    ASSERT_THAT(words[1], Eq("0,"));
+                    ASSERT_THAT(words[2], Eq("0,"));
+                    ASSERT_THAT(words[3], Eq("0,"));
+                    ASSERT_THAT(words[4], Eq("0,"));
+                    ASSERT_THAT(words[5], Eq("0,"));
+                    ASSERT_THAT(words[6], Eq("0,"));
+                    ASSERT_THAT(words[7], Eq("0,"));
+                    ASSERT_THAT(words[8], Eq("0,"));
+                    ASSERT_THAT(words[9], Eq("0,"));
+                }
+            }
+        }
+        delete_file(converted_file);
+    }
+    delete_file(dump_file);
+}
+
+TEST_F(DumpNetCDFTest, run0_mpi)
+{
+    if (!lammps_has_style(lmp, "dump", "netcdf/mpiio")) GTEST_SKIP();
+    auto dump_file = dump_filename("mpi0");
+    auto fields    = "id type proc procp1 mass x y z ix iy iz xu yu zu vx vy vz fx fy fz";
+    set_style("netcdf/mpiio");
+    generate_dump(dump_file, fields, "", 0);
+
+    ASSERT_FILE_EXISTS(dump_file);
+    if (NCDUMP_BINARY) {
+        auto converted_file = convert_binary_to_text(dump_file);
+        auto lines          = read_lines(converted_file);
+        auto header         = utils::split_words(lines[0]);
+        ASSERT_EQ(lines.size(), 234);
+        ASSERT_THAT(header[0], Eq("netcdf"));
+        ASSERT_THAT(header[1] + ".nc", Eq(dump_file));
+
+        // check dimensions section
+        auto section = std::find(lines.begin(), lines.end(), "dimensions:");
+        for (auto line = ++section; line < lines.end(); ++line) {
+            auto words = utils::split_words(*line);
+            if ((words.size() < 1) || (words[0] == "variables:")) break;
+            if (words[0] == "atom") ASSERT_THAT(words[2], Eq("32"));
+            if (words[0] == "label") ASSERT_THAT(words[2], Eq("10"));
+            if (words[0] == "Voigt") ASSERT_THAT(words[2], Eq("6"));
+            if (words[0] == "spatial") ASSERT_THAT(words[2], Eq("3"));
+        }
+
+        // check variables section
+        section = std::find(lines.begin(), lines.end(), "variables:");
+        for (auto line = ++section; line < lines.end(); ++line) {
+            auto words = utils::split_words(*line);
+            if ((words.size() < 2) || (words[0] == "data:")) break;
+            if (words[0] == "time:units") ASSERT_THAT(words[2], Eq("lj"));
+            if (words[0] == "time:scale_factor") ASSERT_THAT(words[2], Eq("0.005f"));
+            if (words[0] == "cell_origin:units") ASSERT_THAT(words[2], Eq("lj"));
+            if (words[0] == "cell_angles:units") ASSERT_THAT(words[2], Eq("degree"));
+            if (words[1] == "id(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "type(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "proc(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "procp1(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "mass(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "ix(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "iy(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[1] == "iz(frame,") ASSERT_THAT(words[2], Eq("atom)"));
+            if (words[0] == ":Conventions") ASSERT_THAT(words[2], Eq("AMBER"));
+            if (words[0] == ":ConventionVersion") ASSERT_THAT(words[2], Eq("1.0"));
+            if (words[0] == ":program") ASSERT_THAT(words[2], Eq("LAMMPS"));
+            if (words[0] == ":programVersion") ASSERT_THAT(words[2], Eq(LAMMPS_VERSION));
+        }
+
+        // check data section
+        section = std::find(lines.begin(), lines.end(), "data:");
+        for (auto line = ++section; line < lines.end(); ++line) {
+            auto words = utils::split_words(*line);
+            if (words.size() > 0) {
+                if (words[0] == "spatial") ASSERT_THAT(words[2], Eq("xyz"));
+                if (words[0] == "cell_spatial") ASSERT_THAT(words[2], Eq("abc"));
+                if (words[0] == "cell_origin") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0,"));
+                    ASSERT_THAT(words[1], Eq("0,"));
+                    ASSERT_THAT(words[2], Eq("0"));
+                }
+                if (words[0] == "cell_lengths") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("3.359192,"));
+                    ASSERT_THAT(words[1], Eq("3.359192,"));
+                    ASSERT_THAT(words[2], Eq("3.359192"));
+                }
+                if (words[0] == "cell_angles") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("90,"));
+                    ASSERT_THAT(words[1], Eq("90,"));
+                    ASSERT_THAT(words[2], Eq("90"));
+                }
+                if (words[0] == "id") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("1,"));
+                    ASSERT_THAT(words[1], Eq("2,"));
+                    ASSERT_THAT(words[2], Eq("3,"));
+                    ASSERT_THAT(words[3], Eq("4,"));
+                    ASSERT_THAT(words[4], Eq("5,"));
+                    ASSERT_THAT(words[5], Eq("6,"));
+                    ASSERT_THAT(words[6], Eq("7,"));
+                    ASSERT_THAT(words[7], Eq("8,"));
+                    ASSERT_THAT(words[8], Eq("9,"));
+                    ASSERT_THAT(words[9], Eq("10,"));
+                }
+                if (words[0] == "mass") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("1,"));
+                    ASSERT_THAT(words[1], Eq("1,"));
+                    ASSERT_THAT(words[2], Eq("1,"));
+                    ASSERT_THAT(words[3], Eq("1,"));
+                    ASSERT_THAT(words[4], Eq("1,"));
+                    ASSERT_THAT(words[5], Eq("1,"));
+                    ASSERT_THAT(words[6], Eq("1,"));
+                    ASSERT_THAT(words[7], Eq("1,"));
+                    ASSERT_THAT(words[8], Eq("1,"));
+                    ASSERT_THAT(words[9], Eq("1,"));
+                }
+                if (words[0] == "coordinates") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0,"));
+                    ASSERT_THAT(words[1], Eq("0,"));
+                    ASSERT_THAT(words[2], Eq("0,"));
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0.8397981,"));
+                    ASSERT_THAT(words[1], Eq("0.8397981,"));
+                    ASSERT_THAT(words[2], Eq("0,"));
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0.8397981,"));
+                    ASSERT_THAT(words[1], Eq("0,"));
+                    ASSERT_THAT(words[2], Eq("0.8397981,"));
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0,"));
+                    ASSERT_THAT(words[1], Eq("0.8397981,"));
+                    ASSERT_THAT(words[2], Eq("0.8397981,"));
+                }
+                if (words[0] == "ix") {
+                    ++line;
+                    words = utils::split_words(*line);
+                    ASSERT_THAT(words[0], Eq("0,"));
+                    ASSERT_THAT(words[1], Eq("0,"));
+                    ASSERT_THAT(words[2], Eq("0,"));
+                    ASSERT_THAT(words[3], Eq("0,"));
+                    ASSERT_THAT(words[4], Eq("0,"));
+                    ASSERT_THAT(words[5], Eq("0,"));
+                    ASSERT_THAT(words[6], Eq("0,"));
+                    ASSERT_THAT(words[7], Eq("0,"));
+                    ASSERT_THAT(words[8], Eq("0,"));
+                    ASSERT_THAT(words[9], Eq("0,"));
+                }
+            }
+        }
+        delete_file(converted_file);
+    }
+    delete_file(dump_file);
+}
+
+int main(int argc, char **argv)
+{
+    MPI_Init(&argc, &argv);
+    ::testing::InitGoogleMock(&argc, argv);
+
+    // handle arguments passed via environment variable
+    if (const char *var = getenv("TEST_ARGS")) {
+        std::vector<std::string> env = utils::split_words(var);
+        for (auto arg : env) {
+            if (arg == "-v") {
+                verbose = true;
+            }
+        }
+    }
+
+    NCDUMP_BINARY = getenv("NCDUMP_BINARY");
+
+    if ((argc > 1) && (strcmp(argv[1], "-v") == 0)) verbose = true;
+
+    int rv = RUN_ALL_TESTS();
+    MPI_Finalize();
+    return rv;
+}
diff --git a/unittest/formats/test_input_convert.cpp b/unittest/formats/test_input_convert.cpp
index 0ff6878b13..5930824735 100644
--- a/unittest/formats/test_input_convert.cpp
+++ b/unittest/formats/test_input_convert.cpp
@@ -49,6 +49,15 @@ TEST_F(InputConvertTest, logical)
     EXPECT_EQ(utils::logical(FLERR, "off", false, lmp), 0);
     EXPECT_EQ(utils::logical(FLERR, "0", false, lmp), 0);
 
+    EXPECT_EQ(utils::logical(FLERR, std::string("yes"), false, lmp), 1);
+    EXPECT_EQ(utils::logical(FLERR, std::string("true"), false, lmp), 1);
+    EXPECT_EQ(utils::logical(FLERR, std::string("on"), false, lmp), 1);
+    EXPECT_EQ(utils::logical(FLERR, std::string("1"), false, lmp), 1);
+    EXPECT_EQ(utils::logical(FLERR, std::string("no"), false, lmp), 0);
+    EXPECT_EQ(utils::logical(FLERR, std::string("false"), false, lmp), 0);
+    EXPECT_EQ(utils::logical(FLERR, std::string("off"), false, lmp), 0);
+    EXPECT_EQ(utils::logical(FLERR, std::string("0"), false, lmp), 0);
+
     TEST_FAILURE(".*ERROR: Expected boolean parameter instead of.*",
                  utils::logical(FLERR, "YES", false, lmp););
     TEST_FAILURE(".*ERROR: Expected boolean parameter instead of.*",
@@ -94,6 +103,15 @@ TEST_F(InputConvertTest, numeric)
     EXPECT_DOUBLE_EQ(utils::numeric(FLERR, "10000000000", false, lmp), 1e10);
     EXPECT_DOUBLE_EQ(utils::numeric(FLERR, "2.56E+3", false, lmp), 2560);
 
+    EXPECT_DOUBLE_EQ(utils::numeric(FLERR, std::string("0"), false, lmp), 0);
+    EXPECT_DOUBLE_EQ(utils::numeric(FLERR, std::string("0.1"), false, lmp), 0.1);
+    EXPECT_DOUBLE_EQ(utils::numeric(FLERR, std::string("-.232"), false, lmp), -0.232);
+    EXPECT_DOUBLE_EQ(utils::numeric(FLERR, std::string(".2e5"), false, lmp), 20000.0);
+    EXPECT_DOUBLE_EQ(utils::numeric(FLERR, std::string("2.5e-10"), false, lmp), 2.5e-10);
+    EXPECT_DOUBLE_EQ(utils::numeric(FLERR, std::string("+0.3"), false, lmp), 0.3);
+    EXPECT_DOUBLE_EQ(utils::numeric(FLERR, std::string("10000000000"), false, lmp), 1e10);
+    EXPECT_DOUBLE_EQ(utils::numeric(FLERR, std::string("2.56E+3"), false, lmp), 2560);
+
     TEST_FAILURE(".*ERROR: Expected floating point.*", utils::numeric(FLERR, "yay", false, lmp););
     TEST_FAILURE(".*ERROR: Expected floating point.*", utils::numeric(FLERR, "", false, lmp););
     TEST_FAILURE(".*ERROR: Expected floating point.*", utils::numeric(FLERR, nullptr, false, lmp););
@@ -110,6 +128,13 @@ TEST_F(InputConvertTest, inumeric)
     EXPECT_EQ(utils::inumeric(FLERR, "-0", false, lmp), 0);
     EXPECT_EQ(utils::inumeric(FLERR, "0100", false, lmp), 100);
 
+    EXPECT_EQ(utils::inumeric(FLERR, std::string("0"), false, lmp), 0);
+    EXPECT_EQ(utils::inumeric(FLERR, std::string("-1"), false, lmp), -1);
+    EXPECT_EQ(utils::inumeric(FLERR, std::string("10000"), false, lmp), 10000);
+    EXPECT_EQ(utils::inumeric(FLERR, std::string("-532410"), false, lmp), -532410);
+    EXPECT_EQ(utils::inumeric(FLERR, std::string("-0"), false, lmp), 0);
+    EXPECT_EQ(utils::inumeric(FLERR, std::string("0100"), false, lmp), 100);
+
     TEST_FAILURE(".*ERROR: Expected integer.*", utils::inumeric(FLERR, "yay", false, lmp););
     TEST_FAILURE(".*ERROR: Expected integer.*", utils::inumeric(FLERR, "0.1", false, lmp););
     TEST_FAILURE(".*ERROR: Expected integer.*", utils::inumeric(FLERR, "1.1", false, lmp););
@@ -128,6 +153,13 @@ TEST_F(InputConvertTest, bnumeric)
     EXPECT_EQ(utils::bnumeric(FLERR, "-0", false, lmp), 0);
     EXPECT_EQ(utils::bnumeric(FLERR, "0100", false, lmp), 100);
 
+    EXPECT_EQ(utils::bnumeric(FLERR, std::string("0"), false, lmp), 0);
+    EXPECT_EQ(utils::bnumeric(FLERR, std::string("-1"), false, lmp), -1);
+    EXPECT_EQ(utils::bnumeric(FLERR, std::string("10000"), false, lmp), 10000);
+    EXPECT_EQ(utils::bnumeric(FLERR, std::string("-532410"), false, lmp), -532410);
+    EXPECT_EQ(utils::bnumeric(FLERR, std::string("-0"), false, lmp), 0);
+    EXPECT_EQ(utils::bnumeric(FLERR, std::string("0100"), false, lmp), 100);
+
     TEST_FAILURE(".*ERROR: Expected integer.*", utils::bnumeric(FLERR, "yay", false, lmp););
     TEST_FAILURE(".*ERROR: Expected integer.*", utils::bnumeric(FLERR, "0.1", false, lmp););
     TEST_FAILURE(".*ERROR: Expected integer.*", utils::bnumeric(FLERR, "1.1", false, lmp););
@@ -146,6 +178,13 @@ TEST_F(InputConvertTest, tnumeric)
     EXPECT_EQ(utils::tnumeric(FLERR, "-0", false, lmp), 0);
     EXPECT_EQ(utils::tnumeric(FLERR, "0100", false, lmp), 100);
 
+    EXPECT_EQ(utils::tnumeric(FLERR, std::string("0"), false, lmp), 0);
+    EXPECT_EQ(utils::tnumeric(FLERR, std::string("-1"), false, lmp), -1);
+    EXPECT_EQ(utils::tnumeric(FLERR, std::string("10000"), false, lmp), 10000);
+    EXPECT_EQ(utils::tnumeric(FLERR, std::string("-532410"), false, lmp), -532410);
+    EXPECT_EQ(utils::tnumeric(FLERR, std::string("-0"), false, lmp), 0);
+    EXPECT_EQ(utils::tnumeric(FLERR, std::string("0100"), false, lmp), 100);
+
     TEST_FAILURE(".*ERROR: Expected integer.*", utils::tnumeric(FLERR, "yay", false, lmp););
     TEST_FAILURE(".*ERROR: Expected integer.*", utils::tnumeric(FLERR, "0.1", false, lmp););
     TEST_FAILURE(".*ERROR: Expected integer.*", utils::tnumeric(FLERR, "1.1", false, lmp););
diff --git a/unittest/formats/test_text_file_reader.cpp b/unittest/formats/test_text_file_reader.cpp
index f0f695ae54..6fcc21fb33 100644
--- a/unittest/formats/test_text_file_reader.cpp
+++ b/unittest/formats/test_text_file_reader.cpp
@@ -35,8 +35,8 @@ class TextFileReaderTest : public ::testing::Test {
 protected:
     void TearDown() override
     {
-        unlink("text_reader_one.file");
-        unlink("text_reader_two.file");
+        platform::unlink("text_reader_one.file");
+        platform::unlink("text_reader_two.file");
     }
 
     void test_files()
@@ -65,16 +65,22 @@ TEST_F(TextFileReaderTest, nofile)
                  FileReaderException);
 }
 
+// this test cannot work on windows due to its non unix-like permission system
+
+#if !defined(_WIN32)
 TEST_F(TextFileReaderTest, permissions)
 {
+    platform::unlink("text_reader_noperms.file");
     FILE *fp = fopen("text_reader_noperms.file", "w");
+    ASSERT_NE(fp,nullptr);
     fputs("word\n", fp);
     fclose(fp);
     chmod("text_reader_noperms.file", 0);
     ASSERT_THROW({ TextFileReader reader("text_reader_noperms.file", "test"); },
                  FileReaderException);
-    unlink("text_reader_noperms.file");
+    platform::unlink("text_reader_noperms.file");
 }
+#endif
 
 TEST_F(TextFileReaderTest, nofp)
 {
diff --git a/unittest/fortran/CMakeLists.txt b/unittest/fortran/CMakeLists.txt
index fc116c3c40..6e7e165018 100644
--- a/unittest/fortran/CMakeLists.txt
+++ b/unittest/fortran/CMakeLists.txt
@@ -25,11 +25,11 @@ if(CMAKE_Fortran_COMPILER)
   add_library(flammps STATIC ${LAMMPS_FORTRAN_MODULE})
 
   add_executable(test_fortran_create wrap_create.cpp test_fortran_create.f90)
-  target_link_libraries(test_fortran_create PRIVATE flammps lammps MPI::MPI_Fortran GTest::GTest GTest::GTestMain)
+  target_link_libraries(test_fortran_create PRIVATE flammps lammps MPI::MPI_Fortran GTest::GTestMain)
   add_test(FortranOpen test_fortran_create)
 
   add_executable(test_fortran_commands wrap_commands.cpp test_fortran_commands.f90)
-  target_link_libraries(test_fortran_commands PRIVATE flammps lammps MPI::MPI_Fortran GTest::GTest GTest::GTestMain)
+  target_link_libraries(test_fortran_commands PRIVATE flammps lammps MPI::MPI_Fortran GTest::GTestMain)
   add_test(FortranCommands test_fortran_commands)
 else()
   message(STATUS "Skipping Tests for the LAMMPS Fortran Module: no Fortran compiler")
diff --git a/unittest/python/CMakeLists.txt b/unittest/python/CMakeLists.txt
index 5d0aad2f54..f61a9c61ab 100644
--- a/unittest/python/CMakeLists.txt
+++ b/unittest/python/CMakeLists.txt
@@ -21,7 +21,7 @@ else()
 endif()
 
 add_executable(test_python_package test_python_package.cpp)
-target_link_libraries(test_python_package PRIVATE lammps GTest::GMock GTest::GTest)
+target_link_libraries(test_python_package PRIVATE lammps GTest::GMock)
 target_compile_definitions(test_python_package PRIVATE -DTEST_INPUT_FOLDER=${TEST_INPUT_FOLDER})
 # this requires CMake 3.12. don't care to add backward compatibility for this.
 if(Python3_Development_FOUND)
@@ -34,12 +34,18 @@ set_tests_properties(PythonPackage PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${L
 if(Python_EXECUTABLE)
   # prepare to augment the environment so that the LAMMPS python module and the shared library is found.
   set(PYTHON_TEST_ENVIRONMENT PYTHONPATH=${LAMMPS_PYTHON_DIR}:$ENV{PYTHONPATH})
+  get_property(BUILD_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+  if(BUILD_IS_MULTI_CONFIG)
+    set(LAMMPS_LIB_PATH ${CMAKE_BINARY_DIR}/$<CONFIG>)
+  else()
+    set(LAMMPS_LIB_PATH ${CMAKE_BINARY_DIR})
+  endif()
   list(APPEND PYTHON_TEST_ENVIRONMENT "LAMMPS_POTENTIALS=${LAMMPS_POTENTIALS_DIR}")
   list(APPEND PYTHON_TEST_ENVIRONMENT "TEST_INPUT_DIR=${CMAKE_CURRENT_SOURCE_DIR}")
   if(APPLE)
-    list(APPEND PYTHON_TEST_ENVIRONMENT "DYLD_LIBRARY_PATH=${CMAKE_BINARY_DIR}:$ENV{DYLD_LIBRARY_PATH};LAMMPS_CMAKE_CACHE=${CMAKE_BINARY_DIR}/CMakeCache.txt")
+    list(APPEND PYTHON_TEST_ENVIRONMENT "DYLD_LIBRARY_PATH=${LAMMPS_LIB_PATH}:$ENV{DYLD_LIBRARY_PATH};LAMMPS_CMAKE_CACHE=${CMAKE_BINARY_DIR}/CMakeCache.txt")
   else()
-    list(APPEND PYTHON_TEST_ENVIRONMENT "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}:$ENV{LD_LIBRARY_PATH};LAMMPS_CMAKE_CACHE=${CMAKE_BINARY_DIR}/CMakeCache.txt")
+    list(APPEND PYTHON_TEST_ENVIRONMENT "LD_LIBRARY_PATH=${LAMMPS_LIB_PATH}:$ENV{LD_LIBRARY_PATH};LAMMPS_CMAKE_CACHE=${CMAKE_BINARY_DIR}/CMakeCache.txt")
   endif()
   if(LAMMPS_MACHINE)
     # convert from '_machine' to 'machine'
diff --git a/unittest/utils/CMakeLists.txt b/unittest/utils/CMakeLists.txt
index 9f708861cc..28486048c4 100644
--- a/unittest/utils/CMakeLists.txt
+++ b/unittest/utils/CMakeLists.txt
@@ -1,21 +1,21 @@
 add_executable(test_tokenizer test_tokenizer.cpp)
-target_link_libraries(test_tokenizer PRIVATE lammps GTest::GMockMain GTest::GMock GTest::GTest)
+target_link_libraries(test_tokenizer PRIVATE lammps GTest::GMockMain)
 add_test(Tokenizer test_tokenizer)
 
 add_executable(test_mempool test_mempool.cpp)
-target_link_libraries(test_mempool PRIVATE lammps GTest::GMockMain GTest::GMock GTest::GTest)
+target_link_libraries(test_mempool PRIVATE lammps GTest::GMockMain)
 add_test(MemPool test_mempool)
 
 add_executable(test_argutils test_argutils.cpp)
-target_link_libraries(test_argutils PRIVATE lammps GTest::GMockMain GTest::GMock GTest::GTest)
+target_link_libraries(test_argutils PRIVATE lammps GTest::GMockMain)
 add_test(ArgUtils test_argutils)
 
 add_executable(test_utils test_utils.cpp)
-target_link_libraries(test_utils PRIVATE lammps GTest::GMockMain GTest::GMock GTest::GTest)
+target_link_libraries(test_utils PRIVATE lammps GTest::GMockMain)
 add_test(Utils test_utils)
 
 add_executable(test_platform test_platform.cpp)
-target_link_libraries(test_platform PRIVATE lammps GTest::GMockMain GTest::GMock GTest::GTest)
+target_link_libraries(test_platform PRIVATE lammps GTest::GMockMain)
 add_test(Platform test_platform)
 
 set_tests_properties(Utils Platform PROPERTIES
@@ -34,7 +34,7 @@ if(BUILD_SHARED_LIBS)
 endif()
 
 add_executable(test_fmtlib test_fmtlib.cpp)
-target_link_libraries(test_fmtlib PRIVATE lammps GTest::GMockMain GTest::GMock GTest::GTest)
+target_link_libraries(test_fmtlib PRIVATE lammps GTest::GMockMain)
 add_test(FmtLib test_fmtlib)
 
 add_executable(test_math_eigen_impl test_math_eigen_impl.cpp)
diff --git a/unittest/utils/test_math_eigen_impl.cpp b/unittest/utils/test_math_eigen_impl.cpp
index 895a35080c..47ca8d9cca 100644
--- a/unittest/utils/test_math_eigen_impl.cpp
+++ b/unittest/utils/test_math_eigen_impl.cpp
@@ -48,7 +48,7 @@ inline static bool SimilarVec(Vector a, Vector b, int n, Scalar eps = 1.0e-06,
                               Scalar ratio = 1.0e-06, Scalar ratio_denom = 1.0)
 {
     for (int i = 0; i < n; i++)
-        if (not Similar(a[i], b[i], eps, ratio, ratio_denom)) return false;
+        if (! Similar(a[i], b[i], eps, ratio, ratio_denom)) return false;
     return true;
 }
 
@@ -61,7 +61,7 @@ inline static bool SimilarVecUnsigned(Vector a, Vector b, int n, Scalar eps = 1.
         return true;
     else {
         for (int i = 0; i < n; i++)
-            if (not Similar(a[i], -b[i], eps, ratio, ratio_denom)) return false;
+            if (! Similar(a[i], -b[i], eps, ratio, ratio_denom)) return false;
         return true;
     }
 }
diff --git a/unittest/utils/test_platform.cpp b/unittest/utils/test_platform.cpp
index 0f39534c31..ace546ba90 100644
--- a/unittest/utils/test_platform.cpp
+++ b/unittest/utils/test_platform.cpp
@@ -37,7 +37,7 @@ TEST(Platform, clock)
     ASSERT_GT(ct_used, 1e-4);
 }
 
-TEST(Platform, putenv)
+TEST(Platform, putenv_unsetenv)
 {
     const char *var = getenv("UNITTEST_VAR1");
     ASSERT_EQ(var, nullptr);
@@ -65,6 +65,14 @@ TEST(Platform, putenv)
     ASSERT_THAT(var, StrEq("one=two"));
 
     ASSERT_EQ(platform::putenv(""), -1);
+
+    ASSERT_EQ(platform::unsetenv(""), -1);
+    ASSERT_EQ(platform::unsetenv("UNITTEST_VAR3=two"), -1);
+    var    = getenv("UNITTEST_VAR1");
+    ASSERT_NE(var, nullptr);
+    ASSERT_EQ(platform::unsetenv("UNITTEST_VAR1"), 0);
+    var    = getenv("UNITTEST_VAR1");
+    ASSERT_EQ(var, nullptr);
 }
 
 TEST(Platform, list_pathenv)