diff --git a/.github/workflows/style-check.yml b/.github/workflows/style-check.yml
index 7be2c4fc46..e3567140fb 100644
--- a/.github/workflows/style-check.yml
+++ b/.github/workflows/style-check.yml
@@ -35,3 +35,4 @@ jobs:
          make check-permissions
          make check-homepage
          make check-errordocs
+         make check-fmtlib
diff --git a/.github/workflows/unittest-arm64.yml b/.github/workflows/unittest-arm64.yml
new file mode 100644
index 0000000000..094c5fb0c1
--- /dev/null
+++ b/.github/workflows/unittest-arm64.yml
@@ -0,0 +1,81 @@
+# GitHub action to build LAMMPS on Linux with ARM64 and run standard unit tests
+name: "Unittest for Linux on ARM64"
+
+on:
+  push:
+    branches: [develop]
+
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{github.event_name == 'pull_request'}}
+
+jobs:
+  build:
+    name: Linux ARM64 Unit Test
+    if: ${{ github.repository == 'lammps/lammps' }}
+    runs-on: ubuntu-22.04-arm
+    env:
+      CCACHE_DIR: ${{ github.workspace }}/.ccache
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 2
+
+    - name: Install extra packages
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y ccache \
+                                libeigen3-dev \
+                                libcurl4-openssl-dev \
+                                mold \
+                                ninja-build \
+                                python3-dev
+
+    - name: Create Build Environment
+      run: mkdir build
+
+    - name: Set up ccache
+      uses: actions/cache@v4
+      with:
+        path: ${{ env.CCACHE_DIR }}
+        key: linux-unit-ccache-${{ github.sha }}
+        restore-keys: linux-unit-ccache-
+
+    - name: Building LAMMPS via CMake
+      shell: bash
+      run: |
+        ccache -z
+        python3 -m venv linuxenv
+        source linuxenv/bin/activate
+        python3 -m pip install numpy
+        python3 -m pip install pyyaml
+        cmake -S cmake -B build \
+              -C cmake/presets/gcc.cmake \
+              -C cmake/presets/most.cmake \
+              -D CMAKE_CXX_COMPILER_LAUNCHER=ccache \
+              -D CMAKE_C_COMPILER_LAUNCHER=ccache \
+              -D BUILD_SHARED_LIBS=on \
+              -D DOWNLOAD_POTENTIALS=off \
+              -D ENABLE_TESTING=on \
+              -D MLIAP_ENABLE_ACE=on \
+              -D MLIAP_ENABLE_PYTHON=off \
+              -D PKG_MANIFOLD=on \
+              -D PKG_ML-PACE=on \
+              -D PKG_ML-RANN=on \
+              -D PKG_RHEO=on \
+              -D PKG_PTM=on \
+              -D PKG_PYTHON=on \
+              -D PKG_QTB=on \
+              -D PKG_SMTBQ=on \
+              -G Ninja
+        cmake --build build
+        ccache -s
+
+    - name: Run Tests
+      working-directory: build
+      shell: bash
+      run: ctest -V -LE unstable
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index ce7b9f30f9..ff0d69e316 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -98,21 +98,24 @@ check_for_autogen_files(${LAMMPS_SOURCE_DIR})
 #####################################################################
 include(CheckIncludeFileCXX)
 
-# set required compiler flags, apply checks, and compiler/CPU arch specific optimizations
+# set required compiler flags and compiler/CPU arch specific optimizations
 if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-  # Intel classic compilers version 19 are broken and fail to compile the embedded fmtlib
-  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 20.0)
-    message(ERROR "Intel classic compiler version ${CMAKE_CXX_COMPILER_VERSION} is too old")
-  endif()
-
   if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qrestrict")
     endif()
-    set(CMAKE_TUNE_DEFAULT "/QxHost")
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.3 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.4)
+      set(CMAKE_TUNE_DEFAULT "/QxCOMMON-AVX512")
+    else()
+      set(CMAKE_TUNE_DEFAULT "/QxHost")
+    endif()
   else()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")
-    set(CMAKE_TUNE_DEFAULT "-xHost -fp-model fast=2 -no-prec-div -qoverride-limits -diag-disable=10441 -diag-disable=11074 -diag-disable=11076 -diag-disable=2196")
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.3 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.4)
+      set(CMAKE_TUNE_DEFAULT "-xCOMMON-AVX512")
+    else()
+      set(CMAKE_TUNE_DEFAULT "-xHost -fp-model fast=2 -no-prec-div -qoverride-limits -diag-disable=10441 -diag-disable=11074 -diag-disable=11076 -diag-disable=2196")
+    endif()
   endif()
 endif()
 
diff --git a/cmake/Modules/Packages/INTEL.cmake b/cmake/Modules/Packages/INTEL.cmake
index e6755bf23b..6fb1c57e8a 100644
--- a/cmake/Modules/Packages/INTEL.cmake
+++ b/cmake/Modules/Packages/INTEL.cmake
@@ -72,6 +72,10 @@ if(INTEL_ARCH STREQUAL "KNL")
   if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
     message(FATAL_ERROR "Must use Intel compiler with INTEL for KNL architecture")
   endif()
+  message(WARNING, "Support for Intel Xeon Phi accelerators and Knight's Landing CPUs "
+          "will be removed from LAMMPS in Summer 2025 due to lack of available machines "
+          "in labs and HPC centers and removed support in recent compilers "
+          "Please contact developers@lammps.org if you have any concerns about this step.")
   set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} -xHost -qopenmp -qoffload")
   set(MIC_OPTIONS "-qoffload-option,mic,compiler,\"-fp-model fast=2 -mGLOB_default_function_attrs=\\\"gather_scatter_loop_unroll=4\\\"\"")
   target_compile_options(lammps PRIVATE -xMIC-AVX512 -qoffload -fno-alias -ansi-alias -restrict -qoverride-limits ${MIC_OPTIONS})
diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake
index 2fa5a449fb..2731b0df14 100644
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@@ -117,7 +117,6 @@ set(KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/kokkos.cpp
                        ${KOKKOS_PKG_SOURCES_DIR}/atom_vec_kokkos.cpp
                        ${KOKKOS_PKG_SOURCES_DIR}/comm_kokkos.cpp
                        ${KOKKOS_PKG_SOURCES_DIR}/comm_tiled_kokkos.cpp
-                       ${KOKKOS_PKG_SOURCES_DIR}/group_kokkos.cpp
                        ${KOKKOS_PKG_SOURCES_DIR}/min_kokkos.cpp
                        ${KOKKOS_PKG_SOURCES_DIR}/min_linesearch_kokkos.cpp
                        ${KOKKOS_PKG_SOURCES_DIR}/neighbor_kokkos.cpp
diff --git a/doc/src/Commands_compute.rst b/doc/src/Commands_compute.rst
index fd68ce3e39..7c73583a4f 100644
--- a/doc/src/Commands_compute.rst
+++ b/doc/src/Commands_compute.rst
@@ -58,6 +58,7 @@ KOKKOS, o = OPENMP, t = OPT.
    * :doc:`fep/ta <compute_fep_ta>`
    * :doc:`force/tally <compute_tally>`
    * :doc:`fragment/atom <compute_cluster_atom>`
+   * :doc:`gaussian/grid/local (k) <compute_gaussian_grid_local>`
    * :doc:`global/atom <compute_global_atom>`
    * :doc:`group/group <compute_group_group>`
    * :doc:`gyration <compute_gyration>`
@@ -140,8 +141,8 @@ KOKKOS, o = OPENMP, t = OPT.
    * :doc:`smd/vol <compute_smd_vol>`
    * :doc:`snap <compute_sna_atom>`
    * :doc:`sna/atom <compute_sna_atom>`
-   * :doc:`sna/grid <compute_sna_atom>`
-   * :doc:`sna/grid/local <compute_sna_atom>`
+   * :doc:`sna/grid (k) <compute_sna_atom>`
+   * :doc:`sna/grid/local (k) <compute_sna_atom>`
    * :doc:`snad/atom <compute_sna_atom>`
    * :doc:`snav/atom <compute_sna_atom>`
    * :doc:`sph/e/atom <compute_sph_e_atom>`
diff --git a/doc/src/Developer_code_design.rst b/doc/src/Developer_code_design.rst
index 974266ec7f..02c0e7731b 100644
--- a/doc/src/Developer_code_design.rst
+++ b/doc/src/Developer_code_design.rst
@@ -300,18 +300,19 @@ Formatting with the {fmt} library
 
 The LAMMPS source code includes a copy of the `{fmt} library
 <https://fmt.dev>`_, which is preferred over formatting with the
-"printf()" family of functions.  The primary reason is that it allows
-a typesafe default format for any type of supported data.  This is
+"printf()" family of functions.  The primary reason is that it allows a
+typesafe default format for any type of supported data.  This is
 particularly useful for formatting integers of a given size (32-bit or
-64-bit) which may require different format strings depending on
-compile time settings or compilers/operating systems.  Furthermore,
-{fmt} gives better performance, has more functionality, a familiar
-formatting syntax that has similarities to ``format()`` in Python, and
-provides a facility that can be used to integrate format strings and a
-variable number of arguments into custom functions in a much simpler
-way than the varargs mechanism of the C library.  Finally, {fmt} has
-been included into the C++20 language standard, so changes to adopt it
-are future-proof.
+64-bit) which may require different format strings depending on compile
+time settings or compilers/operating systems.  Furthermore, {fmt} gives
+better performance, has more functionality, a familiar formatting syntax
+that has similarities to ``format()`` in Python, and provides a facility
+that can be used to integrate format strings and a variable number of
+arguments into custom functions in a much simpler way than the varargs
+mechanism of the C library.  Finally, {fmt} has been included into the
+C++20 language standard as ``std::format()``, so changes to adopt it are
+future-proof, for as long as they are not using any extensions that are
+not (yet) included into C++.
 
 Formatted strings are frequently created by calling the
 ``fmt::format()`` function, which will return a string as a
@@ -319,11 +320,13 @@ Formatted strings are frequently created by calling the
 ``printf()``, the {fmt} library uses ``{}`` to embed format descriptors.
 In the simplest case, no additional characters are needed, as {fmt} will
 choose the default format based on the data type of the argument.
-Otherwise, the ``fmt::print()`` function may be used instead of
-``printf()`` or ``fprintf()``.  In addition, several LAMMPS output
-functions, that originally accepted a single string as argument have
-been overloaded to accept a format string with optional arguments as
-well (e.g., ``Error::all()``, ``Error::one()``, ``utils::logmesg()``).
+Otherwise, the :cpp:func:`utils::print() <LAMMPS_NS::utils::print>`
+function may be used instead of ``printf()`` or ``fprintf()``.  In
+addition, several LAMMPS output functions, that originally accepted a
+single string as argument have been overloaded to accept a format string
+with optional arguments as well (e.g., ``Error::all()``,
+``Error::one()``, :cpp:func:`utils::logmesg()
+<LAMMPS_NS::utils::logmesg>`).
 
 Summary of the {fmt} format syntax
 ==================================
diff --git a/doc/src/Developer_notes.rst b/doc/src/Developer_notes.rst
index af26b4b913..2d136055a4 100644
--- a/doc/src/Developer_notes.rst
+++ b/doc/src/Developer_notes.rst
@@ -7,13 +7,7 @@ typically document what a variable stores, what a small section of
 code does, or what a function does and its input/outputs.  The topics
 on this page are intended to document code functionality at a higher level.
 
-Available topics are:
-
-- `Reading and parsing of text and text files`_
-- `Requesting and accessing neighbor lists`_
-- `Choosing between a custom atom style, fix property/atom, and fix STORE/ATOM`_
-- `Fix contributions to instantaneous energy, virial, and cumulative energy`_
-- `KSpace PPPM FFT grids`_
+.. contents::
 
 ----
 
@@ -218,6 +212,146 @@ command:
 
    neighbor->add_request(this, "delete_atoms", NeighConst::REQ_FULL);
 
+
+Errors, warnings, and informational messages
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+LAMMPS has specialized functionality to handle errors (which should
+terminate LAMMPS), warning messages (which should indicate possible
+problems *without* terminating LAMMPS), and informational text for
+messages about the progress and chosen settings.  We *strongly*
+encourage using these facilities and to *stay away* from using
+``printf()`` or ``fprintf()`` or ``std::cout`` or ``std::cerr`` and
+calling ``MPI_Abort()`` or ``exit()`` directly.  Warnings and
+informational messages should be printed only on MPI rank 0 to avoid
+flooding the output when running in parallel with many MPI processes.
+
+**Errors**
+
+When LAMMPS encounters an error, for example a syntax error in the
+input, then a suitable error message should be printed giving a brief,
+one line remark about the reason and then call either ``Error::all()``
+or ``Error::one()``.  ``Error::all()`` must be called when the failing
+code path is executed by *all* MPI processes and the error condition
+will appear for *all* MPI processes the same.  If desired, each MPI
+process may set a flag to either 0 or 1 and then MPI_Allreduce()
+searching for the maximum can be used to determine if there was an error
+on *any* of the MPI processes and make this information available to
+*all*.  ``Error::one()`` in contrast needs to be called when only one or
+a few MPI processes execute the code path or can have the error
+condition.  ``Error::all()`` is generally the preferred option.
+
+Calling these functions does not abort LAMMPS directly, but rather
+throws either a ``LAMMPSException`` (from ``Error::all()``) or a
+``LAMMPSAbortException`` (from ``Error::one()``).  These exceptions are
+caught by the LAMMPS ``main()`` program and then handled accordingly.
+The reason for this approach is to support applications, especially
+graphical applications like :ref:`LAMMPS-GUI <lammps_gui>`, that are
+linked to the LAMMPS library and have a mechanism to avoid that an error
+in LAMMPS terminates the application. By catching the exceptions, the
+application can delete the failing LAMMPS class instance and create a
+new one to try again.  In a similar fashion, the :doc:`LAMMPS Python
+module <Python_module>` checks for this and then re-throws corresponding
+Python exception, which in turn can be caught by the calling Python
+code.
+
+There are multiple "signatures" that can be called:
+
+- ``Error::all(FLERR, "Error message")``: this will abort LAMMPS with
+  the error message "Error message", followed by the last line of input
+  that was read and processed before the error condition happened.
+
+- ``Error::all(FLERR, Error::NOLASTLINE, "Error message")``: this is the
+  same as before but without the last line of input.  This is preferred
+  for errors that would happen *during* a :doc:`run <run>` or
+  :doc:`minimization <minimize>`, since showing the "run" or "minimize"
+  command would be the last line, but is unrelated to the error.
+
+- ``Error::all(FLERR, idx, "Error message")``: this is for argument
+  parsing where "idx" is the index (starting at 0) of the argument for a
+  LAMMPS command that is causing the failure (use -1 for the command
+  itself).  The output may also include the last input line *before* and
+  *after*, if they differ due to substituting variables.  A textual
+  indicator is pointing to the specific word that failed.  Using the
+  constant ``Error::NOPOINTER`` in place of the *idx* argument will
+  suppress the marker and then the behavior is like the *idx* argument
+  is not provided.
+
+FLERR is a macro containing the filename and line where the Error class
+is called and that information is appended to the error message.  This
+allows to quickly find the relevant source code causing the error.  For
+all three signatures, the single string "Error message" may be replaced
+with a format string using '{}' placeholders and followed by a variable
+number of arguments, one for each placeholder. This format string and
+the arguments are then handed for formatting to the `{fmt} library
+<https://fmt.dev>`_ (which is bundled with LAMMPS) and thus allow
+processing similar to the "format()" functionality in Python.
+
+.. note::
+
+   For commands like :doc:`fix ave/time <fix_ave_time>` that accept
+   wildcard arguments, the :cpp:func:`utils::expand_args` function
+   may be passed as an optional argument where the function will provide
+   a map to the original arguments from the expanded argument indices.
+
+For complex errors, that can have multiple causes and which cannot be
+explained in a single line, you can append to the error message, the
+string created by :cpp:func:`utils::errorurl`, which then provides a
+URL pointing to a paragraph of the :doc:`Errors_details` that
+corresponds to the number provided. Example:
+
+.. code-block:: c++
+
+   error->all(FLERR, "Unknown identifier in data file: {}{}", keyword, utils::errorurl(1));
+
+This will output something like this:
+
+.. parsed-literal::
+
+   ERROR: Unknown identifier in data file: Massess
+   For more information see https://docs.lammps.org/err0001 (src/read_data.cpp:1482)
+   Last input line: read_data       data.peptide
+
+Where the URL points to the first paragraph with explanations on
+the :doc:`Errors_details` page in the manual.
+
+**Warnings**
+
+To print warnings, the ``Errors::warning()`` function should be used.
+It also requires the FLERR macros as first argument to easily identify
+the location of the warning in the source code.  Same as with the error
+functions above, the function has two variants: one just taking a single
+string as final argument and a second that uses the `{fmt} library
+<https://fmt.dev>`_ to make it similar to, say, ``fprintf()``.  One
+motivation to use this function is that it will output warnings with
+always the same capitalization of the leading "WARNING" string.  A
+second is that it has a built in rate limiter.  After a given number (by
+default 100), that can be set via the :doc:`thermo_modify command
+<thermo_modify>` no more warnings are printed.  Also, warnings are
+written consistently to both screen and logfile or not, depending on the
+settings for :ref:`screen <screen>` or :doc:`logfile <log>` output.
+
+.. note::
+
+   Unlike ``Error::all()``, the warning function will produce output on
+   *every* MPI process, so it typically would be prefixed with an if
+   statement testing for ``comm->me == 0``, i.e. limiting output to MPI
+   rank 0.
+
+**Informational messages**
+
+Finally, for informational message LAMMPS has the
+:cpp:func:`utils::logmesg() convenience function
+<LAMMPS_NS::utils::logmesg>`.  It also uses the `{fmt} library
+<https://fmt.dev>`_ to support using a format string followed by a
+matching number of arguments.  It will output the resulting formatted
+text to both, the screen and the logfile and will honor the
+corresponding settings about whether this output is active and to which
+file it should be send.  Same as for ``Error::warning()``, it would
+produce output for every MPI process and thus should usually be called
+only on MPI rank 0 to avoid flooding the output when running with many
+parallel processes.
+
 Choosing between a custom atom style, fix property/atom, and fix STORE/ATOM
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/doc/src/Developer_utils.rst b/doc/src/Developer_utils.rst
index b7f62d5364..866945fc88 100644
--- a/doc/src/Developer_utils.rst
+++ b/doc/src/Developer_utils.rst
@@ -133,6 +133,9 @@ and parsing files or arguments.
 .. doxygenfunction:: trim_comment
    :project: progguide
 
+.. doxygenfunction:: strcompress
+   :project: progguide
+
 .. doxygenfunction:: strip_style_suffix
    :project: progguide
 
@@ -166,6 +169,9 @@ and parsing files or arguments.
 .. doxygenfunction:: split_lines
    :project: progguide
 
+.. doxygenfunction:: strsame
+   :project: progguide
+
 .. doxygenfunction:: strmatch
    :project: progguide
 
@@ -232,12 +238,21 @@ Convenience functions
 .. doxygenfunction:: logmesg(LAMMPS *lmp, const std::string &mesg)
    :project: progguide
 
+.. doxygenfunction:: print(FILE *fp, const std::string &format, Args&&... args)
+   :project: progguide
+
+.. doxygenfunction:: print(FILE *fp, const std::string &mesg)
+   :project: progguide
+
 .. doxygenfunction:: errorurl
    :project: progguide
 
 .. doxygenfunction:: missing_cmd_args
    :project: progguide
 
+.. doxygenfunction:: point_to_error
+   :project: progguide
+
 .. doxygenfunction:: flush_buffers(LAMMPS *lmp)
    :project: progguide
 
diff --git a/doc/src/Developer_write_fix.rst b/doc/src/Developer_write_fix.rst
index afa569b05d..1b578823bd 100644
--- a/doc/src/Developer_write_fix.rst
+++ b/doc/src/Developer_write_fix.rst
@@ -96,8 +96,8 @@ Here the we specify which methods of the fix should be called during
      MPI_Allreduce(localAvgVel, globalAvgVel, 4, MPI_DOUBLE, MPI_SUM, world);
      scale3(1.0 / globalAvgVel[3], globalAvgVel);
      if ((comm->me == 0) && screen) {
-       fmt::print(screen,"{}, {}, {}\n",
-                  globalAvgVel[0], globalAvgVel[1], globalAvgVel[2]);
+       utils::print(screen, "{}, {}, {}\n",
+                    globalAvgVel[0], globalAvgVel[1], globalAvgVel[2]);
      }
    }
 
diff --git a/doc/src/Run_output.rst b/doc/src/Run_output.rst
index 2025bf5321..28ed891765 100644
--- a/doc/src/Run_output.rst
+++ b/doc/src/Run_output.rst
@@ -178,3 +178,64 @@ with and without the communication and a Gflop rate is computed.  The
 3d rate is with communication; the 1d rate is without (just the 1d
 FFTs).  Thus you can estimate what fraction of your FFT time was spent
 in communication, roughly 75% in the example above.
+
+Error message output
+====================
+
+Depending on the error function arguments when it is called in the
+source code, there will be one to four lines of error output.
+
+A single line
+^^^^^^^^^^^^^
+
+The line starts with "ERROR: ", followed by the error message and
+information about the location in the source where the error function
+was called in parenthesis on the right (here: line 131 of the file
+src/fix_print.cpp). Example:
+
+.. parsed-literal::
+
+   ERROR: Fix print timestep variable nevery returned a bad timestep: 9900 (src/fix_print.cpp:131)
+
+Two lines
+^^^^^^^^^
+
+In addition to the single line output, also the last line of the input
+will be repeated.  If a command is spread over multiple lines in the
+input using the continuation character '&', then the error will print
+the entire concatenated line.  For readability all whitespace is
+compressed to single blanks.  Example:
+
+.. parsed-literal::
+
+   ERROR: Unrecognized fix style 'printf' (src/modify.cpp:924)
+   Last input line: fix 0 all printf v_nevery "Step: $(step) ${step}"
+
+Three lines
+^^^^^^^^^^^
+
+In addition to the two line output from above, a third line is added
+that uses caret character markers '^' to indicate which "word" in the
+input failed.  Example:
+
+.. parsed-literal::
+
+   ERROR: Illegal fix print nevery value -100; must be > 0 (src/fix_print.cpp:41)
+   Last input line: fix 0 all print -100 "Step: $(step) ${stepx}"
+                                    ^^^^
+
+Four lines
+^^^^^^^^^^
+
+The three line output is expanded to four lines, if the the input is
+modified through input pre-processing, e.g. when substituting
+variables. Now the last command is printed once in the original form and
+a second time after substitutions are applied.  The caret character
+markers '^' are applied to the second version.  Example:
+
+.. parsed-literal::
+
+   ERROR: Illegal fix print nevery value -100; must be > 0 (src/fix_print.cpp:41)
+   Last input line: fix 0 all print ${nevery} 'Step: $(step) ${step}'
+   --> parsed line: fix 0 all print -100 "Step: $(step) ${step}"
+                                    ^^^^
diff --git a/doc/src/Speed_kokkos.rst b/doc/src/Speed_kokkos.rst
index 9f8dcf8340..f8379949a4 100644
--- a/doc/src/Speed_kokkos.rst
+++ b/doc/src/Speed_kokkos.rst
@@ -67,6 +67,14 @@ version 23 November 2023 and Kokkos version 4.2.
    To build with Kokkos support for AMD GPUs, the AMD ROCm toolkit
    software version 5.2.0 or later must be installed on your system.
 
+.. admonition:: Intel Data Center GPU support
+   :class: note
+
+   Support for Kokkos with Intel Data Center GPU accelerators (formerly
+   known under the code name "Ponte Vecchio") in LAMMPS is still a work
+   in progress.  Only a subset of the functionality works correctly.
+   Please contact the LAMMPS developers if you run into problems.
+
 .. admonition:: CUDA and MPI library compatibility
    :class: note
 
@@ -80,13 +88,15 @@ version 23 November 2023 and Kokkos version 4.2.
    LAMMPS command-line or by using the command :doc:`package kokkos
    gpu/aware off <package>` in the input file.
 
-.. admonition:: Intel Data Center GPU support
+.. admonition:: Using multiple MPI ranks per GPU
    :class: note
 
-   Support for Kokkos with Intel Data Center GPU accelerators (formerly
-   known under the code name "Ponte Vecchio") in LAMMPS is still a work
-   in progress.  Only a subset of the functionality works correctly.
-   Please contact the LAMMPS developers if you run into problems.
+   Unlike with the GPU package, there are limited benefits from using
+   multiple MPI processes per GPU with KOKKOS.  But when doing this it
+   is **required** to enable CUDA MPS (`Multi-Process Service :: GPU
+   Deployment and Management Documentation
+   <https://docs.nvidia.com/deploy/mps/index.html>`_ ) to get acceptable
+   performance.
 
 Building LAMMPS with the KOKKOS package
 """""""""""""""""""""""""""""""""""""""
@@ -365,13 +375,13 @@ one or more nodes, each with two GPUs:
 
 .. note::
 
-   When using a GPU, you will achieve the best performance if your
-   input script does not use fix or compute styles which are not yet
+   When using a GPU, you will achieve the best performance if your input
+   script does not use fix or compute styles which are not yet
    Kokkos-enabled. This allows data to stay on the GPU for multiple
    timesteps, without being copied back to the host CPU. Invoking a
-   non-Kokkos fix or compute, or performing I/O for
-   :doc:`thermo <thermo_style>` or :doc:`dump <dump>` output will cause data
-   to be copied back to the CPU incurring a performance penalty.
+   non-Kokkos fix or compute, or performing I/O for :doc:`thermo
+   <thermo_style>` or :doc:`dump <dump>` output will cause data to be
+   copied back to the CPU incurring a performance penalty.
 
 .. note::
 
@@ -379,6 +389,56 @@ one or more nodes, each with two GPUs:
    kspace, etc., you must set the environment variable ``CUDA_LAUNCH_BLOCKING=1``.
    However, this will reduce performance and is not recommended for production runs.
 
+Troubleshooting segmentation faults on GPUs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As noted above, KOKKOS by default assumes that the MPI library is
+GPU-aware.  This is not always the case and can lead to segmentation
+faults when using more than one MPI process.  Normally, LAMMPS will
+print a warning like "*Turning off GPU-aware MPI since it is not
+detected*", or an error message like "*Kokkos with GPU-enabled backend
+assumes GPU-aware MPI is available*", OR a **segmentation fault**.  To
+confirm that a segmentation fault is caused by this, you can turn off
+the GPU-aware assumption via the :doc:`package kokkos command <package>`
+or the corresponding command-line flag.
+
+If you still get a segmentation fault, despite running with only one MPI
+process or using the command-line flag to turn off expecting a GPU-aware
+MPI library, then using the CMake compile setting
+``-DKokkos_ENABLE_DEBUG=on`` or adding ``KOKKOS_DEBUG=yes`` to your
+machine makefile for building with traditional make will generate useful
+output that can be passed to the LAMMPS developers for further
+debugging.
+
+Troubleshooting memory allocation on GPUs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+`Kokkos Tools <https://github.com/kokkos/kokkos-tools/>`_ provides a set
+of lightweight profiling and debugging utilities, which interface with
+instrumentation hooks (eg. `space-time-stack
+<https://github.com/kokkos/kokkos-tools/tree/develop/profiling/space-time-stack>`_)
+built directly into the Kokkos runtime.  After compiling a dynamic LAMMPS
+library, you then have to set the environment variable ``KOKKOS_TOOLS_LIBS``
+before executing your LAMMPS Kokkos run. Example:
+
+.. code-block:: bash
+
+    export KOKKOS_TOOLS_LIBS=${HOME}/kokkos-tools/src/tools/memory-events/kp_memory_event.so
+    mpirun -np 4 lmp_kokkos_cuda_openmpi -in in.lj -k on g 4 -sf kk
+
+Starting with the NVIDIA Pascal GPU architecture, CUDA supports
+`"Unified Virtual Memory" (UVM)
+<https://developer.nvidia.com/blog/unified-memory-cuda-beginners/>`_
+which enables allocating more memory than a GPU possesses by also using
+memory on the host CPU and then CUDA will transparently move data
+between CPU and GPU as needed.  The resulting LAMMPS performance depends
+on `memory access pattern, data residency, and GPU memory
+oversubscription
+<https://developer.nvidia.com/blog/improving-gpu-memory-oversubscription-performance/>`_
+. The CMake option ``-DKokkos_ENABLE_CUDA_UVM=on`` or the makefile
+setting ``KOKKOS_CUDA_OPTIONS=enable_lambda,force_uvm`` enables using
+:ref:`UVM with Kokkos <kokkos>` when compiling LAMMPS.
+
 Run with the KOKKOS package by editing an input script
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/doc/src/compute.rst b/doc/src/compute.rst
index 082f93a6c4..9a8a1734fb 100644
--- a/doc/src/compute.rst
+++ b/doc/src/compute.rst
@@ -236,6 +236,7 @@ The individual style names on the :doc:`Commands compute <Commands_compute>` pag
 * :doc:`fep/ta <compute_fep_ta>` - compute free energies for a test area perturbation
 * :doc:`force/tally <compute_tally>` - force between two groups of atoms via the tally callback mechanism
 * :doc:`fragment/atom <compute_cluster_atom>` - fragment ID for each atom
+* :doc:`gaussian/grid/local <compute_gaussian_grid_local>` - local array of Gaussian atomic contributions on a regular grid
 * :doc:`global/atom <compute_global_atom>` - assign global values to each atom from arrays of global values
 * :doc:`group/group <compute_group_group>` - energy/force between two groups of atoms
 * :doc:`gyration <compute_gyration>` - radius of gyration of group of atoms
diff --git a/doc/src/compute_gaussian_grid_local.rst b/doc/src/compute_gaussian_grid_local.rst
new file mode 100644
index 0000000000..4ae99e7b55
--- /dev/null
+++ b/doc/src/compute_gaussian_grid_local.rst
@@ -0,0 +1,97 @@
+.. index:: compute gaussian/grid/local
+.. index:: compute gaussian/grid/local/kk
+
+compute gaussian/grid/local command
+===================================
+
+Accelerator Variants: *gaussian/grid/local/kk*
+
+Syntax
+""""""
+
+.. code-block:: LAMMPS
+
+   compute ID group-ID gaussian/grid/local grid nx ny nz rcutfac  R_1 R_2 ... sigma_1 sigma_2
+
+* ID, group-ID are documented in :doc:`compute <compute>` command
+* gaussian/grid/local = style name of this compute command
+* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
+* *rcutfac* = scale factor applied to all cutoff radii (positive real)
+* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
+* *sigma_1, sigma_2,...* = Gaussian widths, one for each type (distance units)
+
+Examples
+""""""""
+
+.. code-block:: LAMMPS
+
+    compute mygrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4
+
+Description
+"""""""""""
+
+Define a computation that calculates a Gaussian representation of the ionic
+structure. This representation is used for the efficient evaluation
+of quantities related to the structure factor in a grid-based workflow,
+such as the ML-DFT workflow MALA :ref:`(Ellis) <Ellis2021b>`, for which it was originally
+implemented. Usage of the workflow is described in a separate publication :ref:`(Fiedler) <Fiedler2023>`.
+
+For each LAMMPS type, a separate sum of Gaussians is calculated, using
+a separate Gaussian broadening per type. The computation
+is always performed on the numerical grid, no atom-based version of this
+compute exists. The Gaussian representation can only be executed in a local
+fashion, thus the output array only contains rows for grid points
+that are local to the processor subdomain. The layout of the grid is the same
+as for the see :doc:`sna/grid/local <compute_sna_atom>` command.
+
+Namely, the array contains one row for each of the
+local grid points, looping over the global index *ix* fastest,
+then *iy*, and *iz* slowest.  Each row of the array contains
+the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the values of the Gaussians
+(one floating point number per type per grid point).
+
+----------
+
+
+.. include:: accel_styles.rst
+
+
+
+----------
+
+Output info
+"""""""""""
+
+Compute *gaussian/grid/local* evaluates a local array.
+The array contains one row for each of the
+local grid points, looping over the global index *ix* fastest,
+then *iy*, and *iz* slowest.  The array contains math :math:`ntypes+6` columns,
+where *ntypes* is the number of LAMMPS types. The first three columns are
+the global indexes *ix*, *iy*, and *iz*, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the *ntypes* columns
+containing the values of the Gaussians for each type.
+
+Restrictions
+""""""""""""
+
+These computes are part of the ML-SNAP package.  They are only enabled
+if LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.
+
+Related commands
+""""""""""""""""
+
+:doc:`compute sna/grid/local <compute_sna_atom>`
+
+----------
+
+.. _Ellis2021b:
+
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_
+
+.. _Fiedler2023:
+
+**(Fiedler)** Fiedler, Modine, Schmerler, Vogel, Popoola, Thompson, Rajamanickam, and Cangi,
+`npj Comp. Mater., 9, 115 (2023) <https://doi.org/10.1038/s41524-023-01070-z>`_
+
diff --git a/doc/src/compute_sna_atom.rst b/doc/src/compute_sna_atom.rst
index 179c362dc6..2572093499 100644
--- a/doc/src/compute_sna_atom.rst
+++ b/doc/src/compute_sna_atom.rst
@@ -3,7 +3,9 @@
 .. index:: compute snav/atom
 .. index:: compute snap
 .. index:: compute sna/grid
+.. index:: compute sna/grid/kk
 .. index:: compute sna/grid/local
+.. index:: compute sna/grid/local/kk
 
 compute sna/atom command
 ========================
@@ -20,9 +22,14 @@ compute snap command
 compute sna/grid command
 ========================
 
+compute sna/grid/kk command
+===========================
+
 compute sna/grid/local command
 ==============================
 
+Accelerator Variants: *sna/grid/local/kk*
+
 Syntax
 """"""
 
@@ -33,17 +40,17 @@ Syntax
    compute ID group-ID snav/atom rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
    compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
    compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid/local nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid/local grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
 
 * ID, group-ID are documented in :doc:`compute <compute>` command
 * sna/atom = style name of this compute command
-* rcutfac = scale factor applied to all cutoff radii (positive real)
-* rfac0 = parameter in distance to angle conversion (0 < rcutfac < 1)
-* twojmax = band limit for bispectrum components (non-negative integer)
-* R_1, R_2,... = list of cutoff radii, one for each type (distance units)
-* w_1, w_2,... = list of neighbor weights, one for each type
-* nx, ny, nz = number of grid points in x, y, and z directions (positive integer)
+* *rcutfac* = scale factor applied to all cutoff radii (positive real)
+* *rfac0* = parameter in distance to angle conversion (0 < rcutfac < 1)
+* *twojmax* = band limit for bispectrum components (non-negative integer)
+* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
+* *w_1, w_2,...* = list of neighbor weights, one for each type
+* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
 * zero or more keyword/value pairs may be appended
 * keyword = *rmin0* or *switchflag* or *bzeroflag* or *quadraticflag* or *chem* or *bnormflag* or *wselfallflag* or *bikflag* or *switchinnerflag* or *sinner* or *dinner* or *dgradflag* or *nnn* or *wmode* or *delta*
 
@@ -103,7 +110,7 @@ Examples
    compute snap all snap 1.4 0.95 6 2.0 1.0
    compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 chem 2 0 1
    compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 switchinnerflag 1 sinner 1.35 1.6 dinner 0.25 0.3
-   compute bgrid all sna/grid/local 200 200 200 1.4 0.95 6 2.0 1.0
+   compute bgrid all sna/grid/local grid 200 200 200 1.4 0.95 6 2.0 1.0
    compute bnnn all sna/atom 9.0 0.99363 8 0.5 1.0 rmin0 0.0 nnn 24 wmode 1 delta 0.2
 
 Description
@@ -252,7 +259,8 @@ for finite-temperature Kohn-Sham density functional theory (:ref:`Ellis
 et al. <Ellis2021>`) Neighbor atoms not in the group do not contribute
 to the bispectrum components of the grid points. The distance cutoff
 :math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom
-*i'*.
+*i'*. Both computes can be hardware accelerated with Kokkos by using the
+*sna/grid/kk* and *sna/grid/local/kk* commands, respectively.
 
 Compute *sna/grid* calculates a global array containing bispectrum
 components for a regular grid of points.
@@ -463,6 +471,12 @@ fluctuations in the resulting local atomic environment fingerprint.  The
 detailed formalism is given in the paper by Lafourcade et
 al. :ref:`(Lafourcade) <Lafourcade2023_2>`.
 
+----------
+
+
+.. include:: accel_styles.rst
+
+
 ----------
 
 Output info
@@ -654,7 +668,7 @@ of Angular Momentum, World Scientific, Singapore (1987).
 
 .. _Ellis2021:
 
-**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam,  Phys Rev B, 104, 035120, (2021)
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_
 
 .. _Lafourcade2023_2:
 
diff --git a/doc/src/delete_bonds.rst b/doc/src/delete_bonds.rst
index e03c4b3ac7..e6825ded33 100644
--- a/doc/src/delete_bonds.rst
+++ b/doc/src/delete_bonds.rst
@@ -62,6 +62,18 @@ For all styles, by default, an interaction is only turned off (or on)
 if all the atoms involved are in the specified group.  See the *any*
 keyword to change the behavior.
 
+.. admonition:: Possible errors caused by using *delete_bonds*
+   :class: warning
+
+   Since this command by default only *turns off* bonded interactions,
+   their definitions are still present and subject to the limitations
+   due to LAMMPS' domain decomposition based parallelization.  That is,
+   when a bond is turned off, the two constituent atoms may move apart
+   and may reach a distance where they can lead to a "bond atoms missing"
+   error and crash the simulation.  Adding the *remove* keyword (see
+   below) is required to fully remove those interactions and prevent
+   the error.
+
 Several of the styles (\ *atom*, *bond*, *angle*, *dihedral*, *improper*\ )
 take a *type* as an argument.  The specified *type* can be a
 :doc:`type label <Howto_type_labels>`.  Otherwise, the type should be an
@@ -98,15 +110,18 @@ of all interactions in the specified group is simply reported.  This
 is useful for diagnostic purposes if bonds have been turned off by a
 bond-breaking potential during a previous run.
 
-The default behavior of the delete_bonds command is to turn off
-interactions by toggling their type to a negative value, but not to
-permanently remove the interaction.  For example, a bond_type of 2 is set to
-:math:`-2.`  The neighbor list creation routines will not include such an
-interaction in their interaction lists.  The default is also to not
-alter the list of 1--2, 1--3, or 1--4 neighbors computed by the
-:doc:`special_bonds <special_bonds>` command and used to weight pairwise
-force and energy calculations.  This means that pairwise computations
-will proceed as if the bond (or angle, etc.) were still turned on.
+.. admonition:: Impact on special_bonds processing and exclusions
+   :class: note
+
+   The default behavior of the delete_bonds command is to turn off
+   interactions by toggling their type to a negative value, but not to
+   permanently remove the interaction.  For example, a bond_type of 2 is set to
+   :math:`-2.`  The neighbor list creation routines will not include such an
+   interaction in their interaction lists.  The default is also to not
+   alter the list of 1--2, 1--3, or 1--4 neighbors computed by the
+   :doc:`special_bonds <special_bonds>` command and used to weight pairwise
+   force and energy calculations.  This means that pairwise computations
+   will proceed as if the bond (or angle, etc.) were still turned on.
 
 Several keywords can be appended to the argument list to alter the
 default behaviors.
@@ -138,9 +153,11 @@ operation, after (optional) removal.  It re-computes the pairwise 1--2,
 turned-off bonds the same as turned-on.  Thus, turned-off bonds must
 be removed if you wish to change the weighting list.
 
-Note that the choice of *remove* and *special* options affects how
-1--2, 1--3, 1--4 pairwise interactions will be computed across bonds that
-have been modified by the delete_bonds command.
+.. note::
+
+   The choice of *remove* and *special* options affects how 1--2,
+   1--3, 1--4 pairwise interactions will be computed across bonds
+   that have been modified by the delete_bonds command.
 
 Restrictions
 """"""""""""
diff --git a/doc/src/pair_hybrid.rst b/doc/src/pair_hybrid.rst
index 617b0c4372..93e5621736 100644
--- a/doc/src/pair_hybrid.rst
+++ b/doc/src/pair_hybrid.rst
@@ -70,6 +70,12 @@ Examples
    pair_coeff 1 1 lj/cut 1.0 1.0 2.5
    pair_coeff 1 1 morse 1.0 1.0 1.0 2.5
 
+   variable peratom1 atom 1/(1+exp(-$k*vx^2)
+   variable peratom2 atom 1-v_peratom1
+   pair_style hybrid/scaled v_peratom1 lj/cut 2.5 v_peratom2 morse 2.5
+   pair_coeff 1 1 lj/cut 1.0 1.0 2.5
+   pair_coeff 1 1 morse 1.0 1.0 1.0 2.5
+
 Description
 """""""""""
 
@@ -78,7 +84,7 @@ styles enable the use of multiple pair styles in one simulation.  With
 the *hybrid* style, exactly one pair style is assigned to each pair of
 atom types.  With the *hybrid/overlay* and *hybrid/scaled* styles, one
 or more pair styles can be assigned to each pair of atom types.  With
-the hybrid/molecular style, pair styles are assigned to either intra-
+the *hybrid/molecular* style, pair styles are assigned to either intra-
 or inter-molecular interactions.
 
 The assignment of pair styles to type pairs is made via the
@@ -114,16 +120,26 @@ restrictions discussed below.
 
 If the *hybrid/scaled* style is used instead of *hybrid/overlay*,
 contributions from sub-styles are weighted by their scale factors, which
-may be fractional or even negative.  Furthermore the scale factors may
-be variables that may change during a simulation.  This enables
+may be fractional or even negative.  Furthermore the scale factor for
+each sub-style may a constant, an *equal* style variable, or an *atom*
+style variable. Variable scale factors may change during the simulation.
+Different sub-styles may use different scale factor styles.
+In the case of a sub-style scale factor that is an *atom* style variable,
+the force contribution to each atom from that sub-style is weighted
+by the value of the variable for that atom, while the contribution
+from that sub-style to the global potential energy is zero.
+All other contributions to the per-atom energy, per-atom
+virial, and global virial (if not obtained from forces)
+from that sub-style are zero.
+This enables
 switching smoothly between two different pair styles or two different
 parameter sets during a run in a similar fashion as could be done
 with :doc:`fix adapt <fix_adapt>` or :doc:`fix alchemy <fix_alchemy>`.
-
 All pair styles that will be used are listed as "sub-styles" following
 the *hybrid* or *hybrid/overlay* keyword, in any order.  In case of the
 *hybrid/scaled* pair style, each sub-style is prefixed with a scale
-factor.  The scale factor is either a floating point number or an equal
+factor.  The scale factor is either a floating point number or an
+*equal* or *atom*
 style (or equivalent) variable.  Each sub-style's name is followed by
 its usual arguments, as illustrated in the examples above.  See the doc
 pages of the individual pair styles for a listing and explanation of the
@@ -374,7 +390,7 @@ between all atoms of types 1,3,4 will be computed by that potential.
 Pair_style hybrid allows interactions between type pairs 2-2, 1-2,
 2-3, 2-4 to be specified for computation by other pair styles.  You
 could even add a second interaction for 1-1 to be computed by another
-pair style, assuming pair_style hybrid/overlay is used.
+pair style, assuming pair_style *hybrid/overlay* is used.
 
 But you should not, as a general rule, attempt to exclude the many-body
 interactions for some subset of the type pairs within the set of 1,3,4
@@ -414,7 +430,7 @@ passed to the Tersoff potential, which means it would compute no
 3-body interactions containing both type 1 and 2 atoms.
 
 Here is another example to use 2 many-body potentials together in an
-overlapping manner using hybrid/overlay.  Imagine you have CNT (C atoms)
+overlapping manner using *hybrid/overlay*.  Imagine you have CNT (C atoms)
 on a Si surface.  You want to use Tersoff for Si/Si and Si/C
 interactions, and AIREBO for C/C interactions.  Si atoms are type 1; C
 atoms are type 2.  Something like this will work:
diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt
index b59d7ae24a..45e102b007 100644
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@@ -1237,6 +1237,7 @@ fp
 fphi
 fPIC
 fplo
+fprintf
 Fqq
 Fraige
 framerate
@@ -3381,6 +3382,7 @@ Schilfgarde
 Schimansky
 Schiotz
 Schlitter
+Schmerler
 Schmid
 Schnieders
 Schoen
@@ -4043,6 +4045,7 @@ VMDARCH
 VMDHOME
 vn
 Voigt
+Vogel
 volfactor
 Volkov
 Volpe
diff --git a/examples/snap/README.md b/examples/snap/README.md
index 305f920ae8..1df24acf1f 100644
--- a/examples/snap/README.md
+++ b/examples/snap/README.md
@@ -9,5 +9,11 @@ in.snap.Mo_Chen                   # SNAP linear Mo potential
 in.snap.compute                   # SNAP compute for training a linear model
 in.snap.compute.quadratic         # SNAP compute for training a quadratic model
 in.snap.scale.Ni_Zuo_JCPA2020     # SNAP linear Ni potential with thermodynamic integration (fix adapt scale)
+in.C_SNAP                         # SNAP carbon potential
 
 compute_snap_dgrad.py             # SNAP compute with dgradflag (dBi/dRj) for training a non-linear model
+
+in.snap.grid                      # SNAP descriptors on a grid
+in.snap.grid.triclinic            # SNAP descriptors on a grid, triclinic
+in.gaussian.grid                  # Gaussian descriptors on a grid
+
diff --git a/examples/snap/in.gaussian.grid b/examples/snap/in.gaussian.grid
new file mode 100644
index 0000000000..48aeec1632
--- /dev/null
+++ b/examples/snap/in.gaussian.grid
@@ -0,0 +1,68 @@
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+# 
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable 	ny equal ${nrep}
+variable 	nz equal ${nrep}
+
+boundary	p p p
+
+lattice		custom $a &
+		a1 1 0 0 &
+		a2 0 1 0  &
+		a3 0 0 1 &
+		basis 0 0 0 &
+		basis 0.5 0.5 0.5 &
+
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+create_box	2 box
+create_atoms	1 box basis 1 1 basis 2 2
+
+mass 		* 180.88
+
+# define atom compute and grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string &
+		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+		
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_coeff      * *
+
+# define atom and grid computes
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} &
+	 	${gaussian_options}
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
diff --git a/examples/snap/in.grid.snap b/examples/snap/in.snap.grid
similarity index 100%
rename from examples/snap/in.grid.snap
rename to examples/snap/in.snap.grid
diff --git a/examples/snap/in.grid.tri b/examples/snap/in.snap.grid.triclinic
similarity index 99%
rename from examples/snap/in.grid.tri
rename to examples/snap/in.snap.grid.triclinic
index 95a14f3bb4..59063f576e 100644
--- a/examples/snap/in.grid.tri
+++ b/examples/snap/in.snap.grid.triclinic
@@ -47,7 +47,6 @@ lattice		custom $a &
 		basis 0.0 0.0 0.5 &
 		spacing 1 1 1
 
-box 		tilt large
 region		box prism 0 ${nx} 0 ${ny} 0 ${nz} ${ny} ${nz} ${nz}
 create_box	1 box
 create_atoms	1 box
diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.1 b/examples/snap/log.10Dec24.gaussian.grid.g++.1
new file mode 100644
index 0000000000..b158ac07d0
--- /dev/null
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.1
@@ -0,0 +1,129 @@
+LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
+  using 1 OpenMP thread(s) per MPI task
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+#
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable       	nx equal 1
+variable 	ny equal ${nrep}
+variable 	ny equal 1
+variable 	nz equal ${nrep}
+variable 	nz equal 1
+
+boundary	p p p
+
+lattice		custom $a 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+lattice		custom 3.316 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+Lattice spacing in x,y,z = 3.316 3.316 3.316
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+region		box block 0 1 0 ${ny} 0 ${nz}
+region		box block 0 1 0 1 0 ${nz}
+region		box block 0 1 0 1 0 1
+create_box	2 box
+Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  1 by 1 by 1 MPI processor grid
+create_atoms	1 box basis 1 1 basis 2 2
+Created 2 atoms
+  using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  create_atoms CPU = 0.001 seconds
+
+mass 		* 180.88
+
+# define atom compute and grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 0.2
+
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_style      zero 4.67637
+pair_coeff      * *
+
+# define atom and grid computes
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	4.67637 0.5 0.5 0.1355 0.2
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
+WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 6.67637
+  ghost atom cutoff = 6.67637
+  binsize = 3.338185, bins = 1 1 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes
+   Step         c_val1         c_val2    
+         0   25.521859      7.9367045    
+Loop time of 1.088e-06 on 1 procs for 0 steps with 2 atoms
+
+183.8% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 1.088e-06  |            |       |100.00
+
+Nlocal:              2 ave           2 max           2 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:            339 ave         339 max         339 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:             64 ave          64 max          64 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 64
+Ave neighs/atom = 32
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:00
diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.4 b/examples/snap/log.10Dec24.gaussian.grid.g++.4
new file mode 100644
index 0000000000..54cc842bc7
--- /dev/null
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.4
@@ -0,0 +1,130 @@
+LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
+  using 1 OpenMP thread(s) per MPI task
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+#
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable       	nx equal 1
+variable 	ny equal ${nrep}
+variable 	ny equal 1
+variable 	nz equal ${nrep}
+variable 	nz equal 1
+
+boundary	p p p
+
+lattice		custom $a 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+lattice		custom 3.316 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+Lattice spacing in x,y,z = 3.316 3.316 3.316
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+region		box block 0 1 0 ${ny} 0 ${nz}
+region		box block 0 1 0 1 0 ${nz}
+region		box block 0 1 0 1 0 1
+create_box	2 box
+Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  1 by 2 by 2 MPI processor grid
+create_atoms	1 box basis 1 1 basis 2 2
+Created 2 atoms
+  using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  create_atoms CPU = 0.001 seconds
+
+mass 		* 180.88
+
+# define atom compute and grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 0.2
+
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_style      zero 4.67637
+pair_coeff      * *
+
+# define atom and grid computes
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	4.67637 0.5 0.5 0.1355 0.2
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
+WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 6.67637
+  ghost atom cutoff = 6.67637
+  binsize = 3.338185, bins = 1 1 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+WARNING: Proc sub-domain size < neighbor skin, could lead to lost atoms (src/domain.cpp:1202)
+Per MPI rank memory allocation (min/avg/max) = 3.522 | 3.523 | 3.524 Mbytes
+   Step         c_val1         c_val2    
+         0   25.521859      7.9367045    
+Loop time of 2.238e-06 on 4 procs for 0 steps with 2 atoms
+
+89.4% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 2.238e-06  |            |       |100.00
+
+Nlocal:            0.5 ave           1 max           0 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Nghost:          274.5 ave         275 max         274 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Neighs:             16 ave          40 max           0 min
+Histogram: 2 0 0 0 0 0 1 0 0 1
+
+Total # of neighbors = 64
+Ave neighs/atom = 32
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:00
diff --git a/src/.gitignore b/src/.gitignore
index 6a3365af9b..07a77f4aba 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -252,6 +252,8 @@
 /*rheo*.cpp
 /*rheo*.h
 
+/compute_gaussian_grid_local.cpp
+/compute_gaussian_grid_local.h
 /compute_grid.cpp
 /compute_grid.h
 /compute_grid_local.cpp
diff --git a/src/AMOEBA/fix_amoeba_pitorsion.cpp b/src/AMOEBA/fix_amoeba_pitorsion.cpp
index 33af4a3c31..352e559d6b 100644
--- a/src/AMOEBA/fix_amoeba_pitorsion.cpp
+++ b/src/AMOEBA/fix_amoeba_pitorsion.cpp
@@ -773,9 +773,9 @@ bigint FixAmoebaPiTorsion::read_data_skip_lines(char *keyword)
 
 void FixAmoebaPiTorsion::write_data_header(FILE *fp, int mth)
 {
-  if (mth == 0) fmt::print(fp,"{} pitorsions\n",npitorsions);
+  if (mth == 0) utils::print(fp,"{} pitorsions\n",npitorsions);
   else if (mth == 1)
-    fmt::print(fp, "{} pitorsion types\n",npitorsion_types);
+    utils::print(fp, "{} pitorsion types\n",npitorsion_types);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/BODY/body_nparticle.cpp b/src/BODY/body_nparticle.cpp
index 4a34f31f54..14841e6054 100644
--- a/src/BODY/body_nparticle.cpp
+++ b/src/BODY/body_nparticle.cpp
@@ -261,22 +261,22 @@ int BodyNparticle::write_data_body(FILE *fp, double *buf)
 
   // atomID ninteger ndouble
 
-  fmt::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
+  utils::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
   m += 3;
 
   const int nsub = (int) ubuf(buf[m++]).i;
-  fmt::print(fp,"{}\n",nsub);
+  utils::print(fp,"{}\n",nsub);
 
   // inertia
 
-  fmt::print(fp,"{} {} {} {} {} {}\n",
+  utils::print(fp,"{} {} {} {} {} {}\n",
              buf[m+0],buf[m+1],buf[m+2],buf[m+3],buf[m+4],buf[m+5]);
   m += 6;
 
   // nsub vertices
 
   for (int i = 0; i < nsub; i++) {
-    fmt::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
+    utils::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
     m += 3;
   }
 
diff --git a/src/BODY/body_rounded_polygon.cpp b/src/BODY/body_rounded_polygon.cpp
index 5de0654d25..366db6264f 100644
--- a/src/BODY/body_rounded_polygon.cpp
+++ b/src/BODY/body_rounded_polygon.cpp
@@ -398,27 +398,27 @@ int BodyRoundedPolygon::write_data_body(FILE *fp, double *buf)
 
   // atomID ninteger ndouble
 
-  fmt::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
+  utils::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
   m += 3;
 
   const int nsub = (int) ubuf(buf[m++]).i;
-  fmt::print(fp,"{}\n",nsub);
+  utils::print(fp,"{}\n",nsub);
 
   // inertia
 
-  fmt::print(fp,"{} {} {} {} {} {}\n",
+  utils::print(fp,"{} {} {} {} {} {}\n",
              buf[m+0],buf[m+1],buf[m+2],buf[m+3],buf[m+4],buf[m+5]);
   m += 6;
 
   // nsub vertices
 
   for (int i = 0; i < nsub; i++, m+=3)
-    fmt::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
+    utils::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
 
   // rounded diameter
 
   double diameter = buf[m++];
-  fmt::print(fp,"{}\n",diameter);
+  utils::print(fp,"{}\n",diameter);
 
   return m;
 }
diff --git a/src/BODY/body_rounded_polyhedron.cpp b/src/BODY/body_rounded_polyhedron.cpp
index f34a212087..bd16dac96c 100644
--- a/src/BODY/body_rounded_polyhedron.cpp
+++ b/src/BODY/body_rounded_polyhedron.cpp
@@ -476,7 +476,7 @@ int BodyRoundedPolyhedron::write_data_body(FILE *fp, double *buf)
 
   // atomID ninteger ndouble
 
-  fmt::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
+  utils::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
   m += 3;
 
   // nvert, nedge, nface
@@ -484,27 +484,27 @@ int BodyRoundedPolyhedron::write_data_body(FILE *fp, double *buf)
   const int nsub = (int) ubuf(buf[m++]).i;
   const int nedge = (int) ubuf(buf[m++]).i;
   const int nface = (int) ubuf(buf[m++]).i;
-  fmt::print(fp,"{} {} {}\n",nsub,nedge,nface);
+  utils::print(fp,"{} {} {}\n",nsub,nedge,nface);
 
   // inertia
 
-  fmt::print(fp,"{} {} {} {} {} {}\n",
+  utils::print(fp,"{} {} {} {} {} {}\n",
              buf[m+0],buf[m+1],buf[m+2],buf[m+3],buf[m+4],buf[m+5]);
   m += 6;
 
   // nsub vertices
 
   for (int i = 0; i < nsub; i++, m+=3)
-    fmt::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
+    utils::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
 
   // nedge 2-tuples and nface 4-tuples
   // unless nsub = 1 or 2
 
   if (nsub > 2) {
     for (int i = 0; i < nedge; i++, m+=2)
-      fmt::print(fp,"{} {}\n",static_cast<int> (buf[m]),static_cast<int> (buf[m+1]));
+      utils::print(fp,"{} {}\n",static_cast<int> (buf[m]),static_cast<int> (buf[m+1]));
     for (int i = 0; i < nface; i++, m+=4)
-      fmt::print(fp,"{} {} {} {}\n",
+      utils::print(fp,"{} {} {} {}\n",
                  static_cast<int> (buf[m]),static_cast<int> (buf[m+1]),
                  static_cast<int> (buf[m+2]),static_cast<int> (buf[m+3]));
   }
@@ -512,7 +512,7 @@ int BodyRoundedPolyhedron::write_data_body(FILE *fp, double *buf)
   // rounded diameter
 
   double diameter = buf[m++];
-  fmt::print(fp,"{}\n",diameter);
+  utils::print(fp,"{}\n",diameter);
 
   return m;
 }
diff --git a/src/BODY/compute_temp_body.cpp b/src/BODY/compute_temp_body.cpp
index 39b2518600..920dd9db00 100644
--- a/src/BODY/compute_temp_body.cpp
+++ b/src/BODY/compute_temp_body.cpp
@@ -57,6 +57,7 @@ ComputeTempBody::ComputeTempBody(LAMMPS *lmp, int narg, char **arg) :
     if (strcmp(arg[iarg],"bias") == 0) {
       if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "compute temp/body bias", error);
       tempbias = 1;
+      delete[] id_bias;
       id_bias = utils::strdup(arg[iarg+1]);
       iarg += 2;
     } else if (strcmp(arg[iarg],"dof") == 0) {
diff --git a/src/BROWNIAN/fix_brownian_base.cpp b/src/BROWNIAN/fix_brownian_base.cpp
index 508ce4d1c6..6b00ec2985 100644
--- a/src/BROWNIAN/fix_brownian_base.cpp
+++ b/src/BROWNIAN/fix_brownian_base.cpp
@@ -33,8 +33,9 @@ using namespace LAMMPS_NS;
 using namespace FixConst;
 
 /* ---------------------------------------------------------------------- */
-
-FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
+FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) :
+    Fix(lmp, narg, arg), gamma_t_inv(nullptr), gamma_r_inv(nullptr), gamma_t_invsqrt(nullptr),
+    gamma_r_invsqrt(nullptr), dipole_body(nullptr), rng(nullptr)
 {
   time_integrate = 1;
 
@@ -47,18 +48,18 @@ FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, n
   planar_rot_flag = 0;
   g2 = 0.0;
 
-  if (narg < 5) error->all(FLERR, "Illegal fix brownian command.");
+  if (narg < 5) utils::missing_cmd_args(FLERR, "fix brownian", error);
 
   temp = utils::numeric(FLERR, arg[3], false, lmp);
-  if (temp <= 0) error->all(FLERR, "Fix brownian temp must be > 0.");
+  if (temp <= 0) error->all(FLERR, "Fix brownian temp must be > 0.0");
 
   seed = utils::inumeric(FLERR, arg[4], false, lmp);
-  if (seed <= 0) error->all(FLERR, "Fix brownian seed must be > 0.");
+  if (seed <= 0) error->all(FLERR, "Fix brownian seed must be > 0");
 
   int iarg = 5;
   while (iarg < narg) {
     if (strcmp(arg[iarg], "rng") == 0) {
-      if (narg == iarg + 1) error->all(FLERR, "Illegal fix brownian command.");
+      if (narg < iarg + 1) utils::missing_cmd_args(FLERR, "fix brownian rng", error);
       if (strcmp(arg[iarg + 1], "uniform") == 0) {
         noise_flag = 1;
       } else if (strcmp(arg[iarg + 1], "gaussian") == 0) {
@@ -67,13 +68,14 @@ FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, n
       } else if (strcmp(arg[iarg + 1], "none") == 0) {
         noise_flag = 0;
       } else {
-        error->all(FLERR, "Illegal fix brownian command.");
+        error->all(FLERR, "Unknown fix brownian rng keyword {}", arg[iarg + 1]);
       }
       iarg = iarg + 2;
     } else if (strcmp(arg[iarg], "dipole") == 0) {
-      if (narg == iarg + 3) error->all(FLERR, "Illegal fix brownian command.");
+      if (narg < iarg + 3) utils::missing_cmd_args(FLERR, "fix brownian dipole", error);
 
       dipole_flag = 1;
+      delete[] dipole_body;
       dipole_body = new double[3];
 
       dipole_body[0] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
@@ -82,9 +84,11 @@ FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, n
       iarg = iarg + 4;
 
     } else if (strcmp(arg[iarg], "gamma_t_eigen") == 0) {
-      if (narg == iarg + 3) error->all(FLERR, "Illegal fix brownian command.");
+      if (narg < iarg + 3) utils::missing_cmd_args(FLERR, "fix brownian gamma_t_eigen", error);
 
       gamma_t_eigen_flag = 1;
+      delete[] gamma_t_inv;
+      delete[] gamma_t_invsqrt;
       gamma_t_inv = new double[3];
       gamma_t_invsqrt = new double[3];
       gamma_t_inv[0] = 1. / utils::numeric(FLERR, arg[iarg + 1], false, lmp);
@@ -111,6 +115,8 @@ FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, n
       if (narg == iarg + 3) error->all(FLERR, "Illegal fix brownian command.");
 
       gamma_r_eigen_flag = 1;
+      delete[] gamma_r_inv;
+      delete[] gamma_r_invsqrt;
       gamma_r_inv = new double[3];
       gamma_r_invsqrt = new double[3];
 
diff --git a/src/DPD-REACT/fix_rx.cpp b/src/DPD-REACT/fix_rx.cpp
index fe7538bd10..71a90607e5 100644
--- a/src/DPD-REACT/fix_rx.cpp
+++ b/src/DPD-REACT/fix_rx.cpp
@@ -119,7 +119,7 @@ FixRX::FixRX(LAMMPS *lmp, int narg, char **arg) :
                     + " expected \"sparse\" or \"dense\"\n");
 
     if (comm->me == 0 && Verbosity > 1)
-      error->message(FLERR, fmt::format("FixRX: matrix format is {}",word));
+      error->message(FLERR, fmt::format("FixRX: matrix format is {}", word));
   }
 
   // Determine the ODE solver/stepper strategy in arg[6].
@@ -157,7 +157,7 @@ FixRX::FixRX(LAMMPS *lmp, int narg, char **arg) :
     minSteps = utils::inumeric(FLERR,arg[iarg++],false,lmp);
 
     if (comm->me == 0 && Verbosity > 1)
-      error->message(FLERR,fmt::format("FixRX: RK4 numSteps= {}", minSteps));
+      error->message(FLERR, fmt::format("FixRX: RK4 numSteps= {}", minSteps));
   } else if (odeIntegrationFlag == ODE_LAMMPS_RK4 && narg>8) {
     error->all(FLERR,"Illegal fix rx command.  Too many arguments for RK4 solver.");
   } else if (odeIntegrationFlag == ODE_LAMMPS_RKF45) {
@@ -307,12 +307,19 @@ void FixRX::post_constructor()
   id_fix_species = utils::strdup(std::string(id)+"_SPECIES");
   id_fix_species_old = utils::strdup(std::string(id)+"_SPECIES_OLD");
 
-  const std::string fmtstr = "{} {} property/atom ";
-  auto newcmd1 = fmt::format(fmtstr,id_fix_species,group->names[igroup]);
-  auto newcmd2 = fmt::format(fmtstr,id_fix_species_old,group->names[igroup]);
+  std::string newcmd1 = id_fix_species;
+  newcmd1 += " ";
+  newcmd1 += group->names[igroup];
+  newcmd1 += " property/atom ";
+
+  std::string newcmd2 = id_fix_species_old;
+  newcmd2 += " ";
+  newcmd2 += group->names[igroup];
+  newcmd2 += " property/atom ";
+
   for (int ii=0; ii<nspecies; ii++) {
-    newcmd1 += fmt::format(" d_{}",tmpspecies[ii]);
-    newcmd2 += fmt::format(" d_{}Old",tmpspecies[ii]);
+    newcmd1 += fmt::format(" d_{}", tmpspecies[ii]);
+    newcmd2 += fmt::format(" d_{}Old", tmpspecies[ii]);
   }
   newcmd1 += " ghost yes";
   newcmd2 += " ghost yes";
diff --git a/src/ELECTRODE/fix_electrode_conp.cpp b/src/ELECTRODE/fix_electrode_conp.cpp
index a50984fe17..a64eb5f71d 100644
--- a/src/ELECTRODE/fix_electrode_conp.cpp
+++ b/src/ELECTRODE/fix_electrode_conp.cpp
@@ -1363,10 +1363,10 @@ int FixElectrodeConp::setmask()
 void FixElectrodeConp::write_to_file(FILE *file, const std::vector<tagint> &tags,
                                      const std::vector<std::vector<double>> &mat)
 {
-  for (const auto &t : tags) fmt::print(file, "{:20}", t);
+  for (const auto &t : tags) utils::print(file, "{:20}", t);
   fputs("\n", file);
   for (const auto &vec : mat) {
-    for (const auto &x : vec) fmt::print(file, "{:20.11e}", x);
+    for (const auto &x : vec) utils::print(file, "{:20.11e}", x);
     fputs("\n", file);
   }
 }
diff --git a/src/EXTRA-COMMAND/group2ndx.cpp b/src/EXTRA-COMMAND/group2ndx.cpp
index 56bf848923..e19d16f0f5 100644
--- a/src/EXTRA-COMMAND/group2ndx.cpp
+++ b/src/EXTRA-COMMAND/group2ndx.cpp
@@ -93,7 +93,7 @@ void Group2Ndx::write_group(FILE *fp, int gid)
     if (gid == 0) {
       fputs("[ System ]\n", fp);
     } else {
-      fmt::print(fp, "[ {} ]\n", group->names[gid]);
+      utils::print(fp, "[ {} ]\n", group->names[gid]);
     }
     width = log10((double) atom->natoms) + 2;
     cols = 80 / width;
@@ -142,7 +142,7 @@ void Group2Ndx::write_group(FILE *fp, int gid)
   if (fp) {
     int i, j;
     for (i = 0, j = 0; i < gcount; ++i) {
-      fmt::print(fp, "{:>{}}", recvlist[i], width);
+      utils::print(fp, "{:>{}}", recvlist[i], width);
       ++j;
       if (j == cols) {
         fputs("\n", fp);
diff --git a/src/EXTRA-COMPUTE/compute_adf.cpp b/src/EXTRA-COMPUTE/compute_adf.cpp
index 20b1749fa9..108e03e371 100644
--- a/src/EXTRA-COMPUTE/compute_adf.cpp
+++ b/src/EXTRA-COMPUTE/compute_adf.cpp
@@ -39,6 +39,8 @@ using MathConst::RAD2DEG;
 
 enum { DEGREE, RADIAN, COSINE };
 
+static constexpr double BIG = 1.0e20;
+
 /* ----------------------------------------------------------------------
    compute angular distribution functions for I, J, K atoms
  ---------------------------------------------------------------------- */
@@ -133,15 +135,15 @@ ComputeADF::ComputeADF(LAMMPS *lmp, int narg, char **arg) :
       utils::bounds(FLERR,arg[iarg+1],1,atom->ntypes,jlo[m],jhi[m],error);
       utils::bounds(FLERR,arg[iarg+2],1,atom->ntypes,klo[m],khi[m],error);
       if ((ilo[m] > ihi[m]) || (jlo[m] > jhi[m]) || (klo[m] > khi[m]))
-        error->all(FLERR,"Illegal compute adf command");
+        error->all(FLERR,"Illegal compute adf command index range");
       rcutinnerj[m] = utils::numeric(FLERR,arg[iarg+3],false,lmp);
       rcutouterj[m] = utils::numeric(FLERR,arg[iarg+4],false,lmp);
       if (rcutinnerj[m] < 0.0 || rcutinnerj[m] >= rcutouterj[m])
-        error->all(FLERR,"Illegal compute adf command");
+        error->all(FLERR,"Illegal compute adf command j-cutoff");
       rcutinnerk[m] = utils::numeric(FLERR,arg[iarg+5],false,lmp);
       rcutouterk[m] = utils::numeric(FLERR,arg[iarg+6],false,lmp);
       if (rcutinnerk[m] < 0.0 || rcutinnerk[m] >= rcutouterk[m])
-        error->all(FLERR,"Illegal compute adf command");
+        error->all(FLERR,"Illegal compute adf command k-cutoff");
       iarg += nargsperadf;
     }
   }
@@ -290,8 +292,8 @@ void ComputeADF::init()
     double skin = neighbor->skin;
     mycutneigh = maxouter + skin;
     if (mycutneigh > comm->cutghostuser)
-      error->all(FLERR,"Compute adf outer cutoff exceeds ghost atom range - "
-                 "use comm_modify cutoff command");
+      error->all(FLERR,"Compute adf outer cutoff {} exceeds ghost atom range {} - "
+                 "use comm_modify cutoff command", mycutneigh, comm->cutghostuser);
   }
 
   // assign ordinate values to 1st column of output array
@@ -328,6 +330,7 @@ void ComputeADF::init()
   if (mycutneigh > 0.0) {
     if ((neighbor->style == Neighbor::MULTI) || (neighbor->style == Neighbor::MULTI_OLD))
       error->all(FLERR, "Compute adf with custom cutoffs requires neighbor style 'bin' or 'nsq'");
+
     req->set_cutoff(mycutneigh);
   }
 }
diff --git a/src/EXTRA-DUMP/dump_yaml.cpp b/src/EXTRA-DUMP/dump_yaml.cpp
index 6c21c24f77..c26b0591e2 100644
--- a/src/EXTRA-DUMP/dump_yaml.cpp
+++ b/src/EXTRA-DUMP/dump_yaml.cpp
@@ -94,31 +94,31 @@ void DumpYAML::write_header(bigint ndump)
 
   if (comm->me == 0) {
     const std::string boundary(boundstr);
-    fmt::print(fp, "---\ncreator: LAMMPS\ntimestep: {}\n", update->ntimestep);
-    if (unit_flag) fmt::print(fp, "units: {}\n", update->unit_style);
-    if (time_flag) fmt::print(fp, "time: {:.16g}\n", compute_time());
+    utils::print(fp, "---\ncreator: LAMMPS\ntimestep: {}\n", update->ntimestep);
+    if (unit_flag) utils::print(fp, "units: {}\n", update->unit_style);
+    if (time_flag) utils::print(fp, "time: {:.16g}\n", compute_time());
 
-    fmt::print(fp, "natoms: {}\n", ndump);
+    utils::print(fp, "natoms: {}\n", ndump);
     fputs("boundary: [ ", fp);
     for (const auto &bflag : boundary) {
       if (bflag == ' ') continue;
-      fmt::print(fp, "{}, ", bflag);
+      utils::print(fp, "{}, ", bflag);
     }
     fputs("]\n", fp);
 
-    if (thermo) fmt::print(fp, thermo_data);
+    if (thermo) utils::print(fp, thermo_data);
 
-    fmt::print(fp, "box:\n  - [ {}, {} ]\n", boxxlo, boxxhi);
-    fmt::print(fp, "  - [ {}, {} ]\n", boxylo, boxyhi);
-    fmt::print(fp, "  - [ {}, {} ]\n", boxzlo, boxzhi);
-    if (domain->triclinic) fmt::print(fp, "  - [ {}, {}, {} ]\n", boxxy, boxxz, boxyz);
+    utils::print(fp, "box:\n  - [ {}, {} ]\n", boxxlo, boxxhi);
+    utils::print(fp, "  - [ {}, {} ]\n", boxylo, boxyhi);
+    utils::print(fp, "  - [ {}, {} ]\n", boxzlo, boxzhi);
+    if (domain->triclinic) utils::print(fp, "  - [ {}, {}, {} ]\n", boxxy, boxxz, boxyz);
 
-    fmt::print(fp, "keywords: [ ");
+    utils::print(fp, "keywords: [ ");
     for (const auto &item : utils::split_words(columns)) {
       if (item.find_first_of(special_chars) == std::string::npos)
-        fmt::print(fp, "{}, ", item);
+        utils::print(fp, "{}, ", item);
       else
-        fmt::print(fp, "'{}', ", item);
+        utils::print(fp, "'{}', ", item);
     }
     fputs(" ]\ndata:\n", fp);
   } else    // reset so that the remainder of the output is not multi-proc
diff --git a/src/EXTRA-FIX/fix_ave_correlate_long.cpp b/src/EXTRA-FIX/fix_ave_correlate_long.cpp
index abb1ad87de..557e416275 100644
--- a/src/EXTRA-FIX/fix_ave_correlate_long.cpp
+++ b/src/EXTRA-FIX/fix_ave_correlate_long.cpp
@@ -489,7 +489,7 @@ void FixAveCorrelateLong::end_of_step()
   if (fp && comm->me == 0) {
     clearerr(fp);
     if (overwrite) (void) platform::fseek(fp,filepos);
-    fmt::print(fp,"# Timestep: {}\n", ntimestep);
+    utils::print(fp,"# Timestep: {}\n", ntimestep);
     for (unsigned int i=0; i < npcorr; ++i) {
       fprintf(fp, "%lg ", t[i]*update->dt*nevery);
       for (int j=0; j < npair; ++j) {
diff --git a/src/EXTRA-FIX/fix_tmd.cpp b/src/EXTRA-FIX/fix_tmd.cpp
index 242efcf41c..aac069dc9d 100644
--- a/src/EXTRA-FIX/fix_tmd.cpp
+++ b/src/EXTRA-FIX/fix_tmd.cpp
@@ -270,7 +270,7 @@ void FixTMD::initial_integrate(int /*vflag*/)
     work_lambda += lambda*(rho_target - rho_old);
     if (!(update->ntimestep % nfileevery) &&
         (previous_stat != update->ntimestep)) {
-      fmt::print(fp, "{} {} {} {} {} {} {} {}\n", update->ntimestep,rho_target,rho_old,
+      utils::print(fp, "{} {} {} {} {} {} {} {}\n", update->ntimestep,rho_target,rho_old,
                  gamma_back,gamma_forward,lambda,work_lambda,work_analytical);
       fflush(fp);
       previous_stat = update->ntimestep;
diff --git a/src/EXTRA-FIX/fix_ttm.cpp b/src/EXTRA-FIX/fix_ttm.cpp
index f5f3aa457e..1609d6285a 100644
--- a/src/EXTRA-FIX/fix_ttm.cpp
+++ b/src/EXTRA-FIX/fix_ttm.cpp
@@ -523,7 +523,7 @@ void FixTTM::write_electron_temperatures(const std::string &filename)
   FILE *fp = fopen(filename.c_str(),"w");
   if (!fp) error->one(FLERR,"Fix ttm could not open output file {}: {}",
                       filename,utils::getsyserror());
-  fmt::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature on "
+  utils::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature on "
              "{}x{}x{} grid at step {} - created by fix {}\n", utils::current_date(),
              update->unit_style, nxgrid, nygrid, nzgrid, update->ntimestep, style);
 
diff --git a/src/EXTRA-FIX/fix_ttm_grid.cpp b/src/EXTRA-FIX/fix_ttm_grid.cpp
index d9c6f13e2f..49a186a3ee 100644
--- a/src/EXTRA-FIX/fix_ttm_grid.cpp
+++ b/src/EXTRA-FIX/fix_ttm_grid.cpp
@@ -411,7 +411,7 @@ void FixTTMGrid::write_restart_file(const char *file)
     if (fpout == nullptr)
       error->one(FLERR,"Cannot open fix ttm/grid restart file {}: {}",outfile,utils::getsyserror());
 
-    fmt::print(fpout,"# DATE: {} UNITS: {} COMMENT: "
+    utils::print(fpout,"# DATE: {} UNITS: {} COMMENT: "
                "Electron temperature on {}x{}x{} grid at step {} - "
                "created by fix {}\n",
                utils::current_date(),update->unit_style,
diff --git a/src/EXTRA-FIX/fix_ttm_mod.cpp b/src/EXTRA-FIX/fix_ttm_mod.cpp
index 335acdd853..6d6e9c0638 100644
--- a/src/EXTRA-FIX/fix_ttm_mod.cpp
+++ b/src/EXTRA-FIX/fix_ttm_mod.cpp
@@ -74,14 +74,13 @@ static constexpr double SHIFT = 0.0;
 /* ---------------------------------------------------------------------- */
 
 FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg),
-  random(nullptr), gfactor1(nullptr), gfactor2(nullptr), ratio(nullptr), flangevin(nullptr),
-  T_electron(nullptr), T_electron_old(nullptr), net_energy_transfer(nullptr),
-  net_energy_transfer_all(nullptr)
+    Fix(lmp, narg, arg), infile(nullptr), outfile(nullptr), random(nullptr), gfactor1(nullptr),
+    gfactor2(nullptr), ratio(nullptr), flangevin(nullptr), T_electron(nullptr),
+    T_electron_old(nullptr), net_energy_transfer(nullptr), net_energy_transfer_all(nullptr)
 {
   if (lmp->citeme) lmp->citeme->add(cite_fix_ttm_mod);
 
-  if (narg < 8) error->all(FLERR,"Illegal fix ttm/mod command");
+  if (narg < 8) utils::missing_cmd_args(FLERR, "fix ttm/mod", error);
 
   vector_flag = 1;
   size_vector = 2;
@@ -103,27 +102,29 @@ FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
   int iarg = 8;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"set") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ttm/mod command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ttm/mod set", error);
       tinit = utils::numeric(FLERR,arg[iarg+1],false,lmp);
       if (tinit <= 0.0)
         error->all(FLERR,"Fix ttm/mod initial temperature must be > 0.0");
       iarg += 2;
     } else if (strcmp(arg[iarg],"infile") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ttm/mod command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ttm/mod infile", error);
+      delete[] infile;
       infile = utils::strdup(arg[iarg+1]);
       iarg += 2;
     } else if (strcmp(arg[iarg],"outfile") == 0) {
-      if (iarg+3 > narg) error->all(FLERR,"Illegal fix ttm/mod command");
+      if (iarg+3 > narg) utils::missing_cmd_args(FLERR, "fix ttm/mod outfile", error);
+      delete[] outfile;
       outevery = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
       outfile = utils::strdup(arg[iarg+2]);
       iarg += 3;
-    } else error->all(FLERR,"Illegal fix ttm/mod command");
+    } else error->all(FLERR,"Unknown fix ttm/mod keyword {}", arg[iarg]);
   }
 
   // error check
 
   if (seed <= 0)
-    error->all(FLERR,"Invalid random number seed in fix ttm/mod command");
+    error->all(FLERR,"Invalid random number seed {} in fix ttm/mod command", seed);
   if (nxgrid <= 0 || nygrid <= 0 || nzgrid <= 0)
     error->all(FLERR,"Fix ttm/mod grid sizes must be > 0");
 
@@ -152,7 +153,8 @@ FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
   if (v_0 < 0.0) error->all(FLERR,"Fix ttm/mod v_0 must be >= 0.0");
   if (ionic_density <= 0.0) error->all(FLERR,"Fix ttm/mod ionic_density must be > 0.0");
   if (surface_l < 0) error->all(FLERR,"Surface coordinates must be >= 0");
-  if (surface_l >= surface_r) error->all(FLERR, "Left surface coordinate must be less than right surface coordinate");
+  if (surface_l >= surface_r)
+    error->all(FLERR, "Left surface coordinate must be less than right surface coordinate");
 
   // initialize Marsaglia RNG with processor-unique seed
 
@@ -168,10 +170,8 @@ FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
   memory->create(T_electron_old,nzgrid,nygrid,nxgrid,"ttm/mod:T_electron_old");
   memory->create(T_electron_first,nzgrid,nygrid,nxgrid,"ttm/mod:T_electron_first");
   memory->create(T_electron,nzgrid,nygrid,nxgrid,"ttm/mod:T_electron");
-  memory->create(net_energy_transfer,nzgrid,nygrid,nxgrid,
-                 "ttm/mod:net_energy_transfer");
-  memory->create(net_energy_transfer_all,nzgrid,nygrid,nxgrid,
-                 "ttm/mod:net_energy_transfer_all");
+  memory->create(net_energy_transfer,nzgrid,nygrid,nxgrid,"ttm/mod:net_energy_transfer");
+  memory->create(net_energy_transfer_all,nzgrid,nygrid,nxgrid,"ttm/mod:net_energy_transfer_all");
 
   flangevin = nullptr;
   grow_arrays(atom->nmax);
@@ -628,7 +628,7 @@ void FixTTMMod::write_electron_temperatures(const std::string &filename)
   FILE *fp = fopen(filename.c_str(),"w");
   if (!fp) error->one(FLERR,"Fix ttm/mod could not open output file {}: {}",
                       filename, utils::getsyserror());
-  fmt::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature "
+  utils::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature "
              "{}x{}x{} grid at step {}. Created by fix {}\n", utils::current_date(),
              update->unit_style, nxgrid, nygrid, nzgrid, update->ntimestep, style);
 
diff --git a/src/EXTRA-MOLECULE/angle_gaussian.cpp b/src/EXTRA-MOLECULE/angle_gaussian.cpp
index a5d469559f..004fdd15ab 100644
--- a/src/EXTRA-MOLECULE/angle_gaussian.cpp
+++ b/src/EXTRA-MOLECULE/angle_gaussian.cpp
@@ -29,7 +29,7 @@ using namespace LAMMPS_NS;
 using namespace MathConst;
 
 static constexpr double SMALL = 0.001;
-static constexpr double SMALLG = 2.0e-308;
+static constexpr double SMALLG = 2.3e-308;
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/EXTRA-MOLECULE/bond_gaussian.cpp b/src/EXTRA-MOLECULE/bond_gaussian.cpp
index 2ed9e06799..deb37042ad 100644
--- a/src/EXTRA-MOLECULE/bond_gaussian.cpp
+++ b/src/EXTRA-MOLECULE/bond_gaussian.cpp
@@ -27,7 +27,7 @@
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
-static constexpr double SMALL = 2.0e-308;
+static constexpr double SMALL = 2.3e-308;
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/EXTRA-PAIR/pair_dispersion_d3.cpp b/src/EXTRA-PAIR/pair_dispersion_d3.cpp
index 6e2b3c1337..ae1ec4dc25 100644
--- a/src/EXTRA-PAIR/pair_dispersion_d3.cpp
+++ b/src/EXTRA-PAIR/pair_dispersion_d3.cpp
@@ -62,7 +62,9 @@ static constexpr double autoev = 27.21140795;    // atomic units (Hartree) to eV
    Constructor (Required)
 ------------------------------------------------------------------------- */
 
-PairDispersionD3::PairDispersionD3(LAMMPS *lmp) : Pair(lmp)
+PairDispersionD3::PairDispersionD3(LAMMPS *lmp) :
+    Pair(lmp), r2r4(nullptr), rcov(nullptr), mxci(nullptr), r0ab(nullptr), c6ab(nullptr),
+    cn(nullptr), dc6(nullptr)
 {
   nmax = 0;
   comm_forward = 2;
@@ -72,6 +74,8 @@ PairDispersionD3::PairDispersionD3(LAMMPS *lmp) : Pair(lmp)
   manybody_flag = 1;
   one_coeff = 1;
   single_enable = 0;
+
+  s6 = s8 = s18 = rs6 = rs8 = rs18 = a1 = a2 = alpha = alpha6 = alpha8 = 0.0;
 }
 
 /* ----------------------------------------------------------------------
@@ -481,8 +485,6 @@ void PairDispersionD3::compute(int eflag, int vflag)
     int jnum = numneigh[i];
     int *jlist = firstneigh[i];
 
-    // fprintf(stderr, "> i, type[i], CN[i], C6[i,i] :  %d, %d, %f, %f\n", atom->tag[i], type[i], cn[i], get_dC6(type[i],type[i],cn[i],cn[i])[0]/(autoev*pow(autoang,6)));
-
     for (int jj = 0; jj < jnum; jj++) {
 
       int j = jlist[jj];
@@ -513,6 +515,7 @@ void PairDispersionD3::compute(int eflag, int vflag)
 
         double t6, t8, damp6, damp8, e6, e8;
         double tmp6, tmp8, fpair1, fpair2, fpair;
+        t6 = t8 = e6 = e8 = evdwl = fpair = fpair1 = fpair2 = 0.0;
 
         switch (dampingCode) {
           case 1: {    // zero
@@ -1134,8 +1137,8 @@ void PairDispersionD3::set_funcpar(std::string &functional_name)
           a1 = 0.3065;
           s8 = 0.9147;
           a2 = 5.0570;
-          break;
           s6 = 0.64;
+          break;
         case 17:
           a1 = 0.0000;
           s8 = 0.2130;
diff --git a/src/EXTRA-PAIR/pair_dispersion_d3.h b/src/EXTRA-PAIR/pair_dispersion_d3.h
index 58575c8a2e..32d1fadec1 100644
--- a/src/EXTRA-PAIR/pair_dispersion_d3.h
+++ b/src/EXTRA-PAIR/pair_dispersion_d3.h
@@ -27,12 +27,10 @@ namespace LAMMPS_NS {
 class PairDispersionD3 : public Pair {
 
  public:
-
   PairDispersionD3(class LAMMPS *);
   ~PairDispersionD3() override;
 
   void compute(int, int) override;
-
   void settings(int, char **) override;
   void coeff(int, char **) override;
   void init_style() override;
@@ -45,45 +43,39 @@ class PairDispersionD3 : public Pair {
   void unpack_reverse_comm(int, int *, double *) override;
 
  protected:
-
   int nmax;
-  double evdwl;
 
-  double rthr;                          // R^2 distance to cutoff for D3_calculation
-  double cn_thr;                        // R^2 distance to cutoff for CN_calculation
+  double rthr;      // R^2 distance to cutoff for D3_calculation
+  double cn_thr;    // R^2 distance to cutoff for CN_calculation
 
-  std::string damping_type;             // damping function type
-  double s6, s8, s18, rs6, rs8, rs18;   // XC parameters
+  std::string damping_type;              // damping function type
+  double s6, s8, s18, rs6, rs8, rs18;    // XC parameters
   double a1, a2, alpha, alpha6, alpha8;
 
-  double* r2r4 = nullptr;               // scale r4/r2 values of the atoms by sqrt(Z)
-  double* rcov = nullptr;               // covalent radii
-  int* mxci = nullptr;                  // How large the grid for c6 interpolation
+  double *r2r4;        // scale r4/r2 values of the atoms by sqrt(Z)
+  double *rcov;        // covalent radii
+  int *mxci;           // How large the grid for c6 interpolation
+  double **r0ab;       // cut-off radii for all element pairs
+  double *****c6ab;    // C6 for all element pairs
+  double *cn;          // Coordination numbers
+  double *dc6;         // dC6i(iat) saves dE_dsp/dCN(iat)
 
-  double** r0ab = nullptr;              // cut-off radii for all element pairs
-  double***** c6ab = nullptr;           // C6 for all element pairs
-
-  double* cn = nullptr;                 // Coordination numbers
-  double* dc6 = nullptr;                // dC6i(iat) saves dE_dsp/dCN(iat)
-
-  int communicationStage;               // communication stage
+  int communicationStage;    // communication stage
 
   void allocate();
-  virtual void set_funcpar(std::string&);
+  virtual void set_funcpar(std::string &);
 
   void calc_coordination_number();
 
-  int find_atomic_number(std::string&);
-  std::vector<int> is_int_in_array(int*, int, int);
+  int find_atomic_number(std::string &);
+  std::vector<int> is_int_in_array(int *, int, int);
 
-  void read_r0ab(int*, int);
-  void set_limit_in_pars_array(int&, int&, int&, int&);
-  void read_c6ab(int*, int);
+  void read_r0ab(int *, int);
+  void set_limit_in_pars_array(int &, int &, int &, int &);
+  void read_c6ab(int *, int);
 
-  double* get_dC6(int, int, double, double);
+  double *get_dC6(int, int, double, double);
 };
-
 }    // namespace LAMMPS_NS
-
 #endif
 #endif
diff --git a/src/GRANULAR/fix_wall_gran.cpp b/src/GRANULAR/fix_wall_gran.cpp
index 3336a8a4d7..4832f07849 100644
--- a/src/GRANULAR/fix_wall_gran.cpp
+++ b/src/GRANULAR/fix_wall_gran.cpp
@@ -163,6 +163,7 @@ FixWallGran::FixWallGran(LAMMPS *lmp, int narg, char **arg) :
   } else if (strcmp(arg[iarg],"region") == 0) {
     if (narg < iarg+2) error->all(FLERR,"Illegal fix wall/gran command");
     wallstyle = REGION;
+    delete[] idregion;
     idregion = utils::strdup(arg[iarg+1]);
     iarg += 2;
     // This option is only compatible with fix wall/gran/region
@@ -205,6 +206,7 @@ FixWallGran::FixWallGran(LAMMPS *lmp, int narg, char **arg) :
     } else if (strcmp(arg[iarg],"temperature") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal fix wall/gran command");
       if (utils::strmatch(arg[iarg+1], "^v_")) {
+        delete[] tstr;
         tstr = utils::strdup(arg[iarg+1] + 2);
       } else {
         Twall = utils::numeric(FLERR,arg[iarg+1],false,lmp);
diff --git a/src/INTEL/verlet_lrt_intel.cpp b/src/INTEL/verlet_lrt_intel.cpp
index 9df17d8cef..3e18295461 100644
--- a/src/INTEL/verlet_lrt_intel.cpp
+++ b/src/INTEL/verlet_lrt_intel.cpp
@@ -94,7 +94,7 @@ void VerletLRTIntel::setup(int flag)
   if (comm->me == 0 && screen) {
     fputs("Setting up VerletLRTIntel run ...\n",screen);
     if (flag) {
-      fmt::print(screen,"  Unit style    : {}\n"
+      utils::print(screen,"  Unit style    : {}\n"
                         "  Current step  : {}\n"
                         "  Time step     : {}\n",
                  update->unit_style,update->ntimestep,update->dt);
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 1ec3646de2..d34d5eb9ee 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -119,6 +119,14 @@ action compute_composition_atom_kokkos.cpp compute_composition_atom.cpp
 action compute_composition_atom_kokkos.h compute_composition_atom.h
 action compute_orientorder_atom_kokkos.cpp
 action compute_orientorder_atom_kokkos.h
+action compute_sna_grid_kokkos.cpp compute_sna_grid.cpp
+action compute_sna_grid_kokkos.h compute_sna_grid.h
+action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp
+action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp
+action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h
+action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp
+action compute_gaussian_grid_local_kokkos.cpp compute_gaussian_grid_local.cpp
+action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local.h
 action compute_temp_deform_kokkos.cpp
 action compute_temp_deform_kokkos.h
 action compute_temp_kokkos.cpp
@@ -230,7 +238,6 @@ action fix_wall_region_kokkos.cpp
 action fix_wall_region_kokkos.h
 action grid3d_kokkos.cpp fft3d.h
 action grid3d_kokkos.h fft3d.h
-action group_kokkos.cpp
 action group_kokkos.h
 action improper_class2_kokkos.cpp improper_class2.cpp
 action improper_class2_kokkos.h improper_class2.h
diff --git a/src/KOKKOS/angle_harmonic_kokkos.cpp b/src/KOKKOS/angle_harmonic_kokkos.cpp
index 2b3c283732..26c70a2760 100644
--- a/src/KOKKOS/angle_harmonic_kokkos.cpp
+++ b/src/KOKKOS/angle_harmonic_kokkos.cpp
@@ -72,14 +72,14 @@ void AngleHarmonicKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   // reallocate per-atom arrays if necessary
 
   if (eflag_atom) {
-    if(k_eatom.extent(0) < maxeatom) {
+    if ((int)k_eatom.extent(0) < maxeatom) {
     memoryKK->destroy_kokkos(k_eatom,eatom);
     memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"angle:eatom");
     d_eatom = k_eatom.template view<DeviceType>();
     } else Kokkos::deep_copy(d_eatom,0.0);
   }
   if (vflag_atom) {
-    if(k_vatom.extent(0) < maxvatom) {
+    if ((int)k_vatom.extent(0) < maxvatom) {
     memoryKK->destroy_kokkos(k_vatom,vatom);
     memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"angle:vatom");
     d_vatom = k_vatom.template view<DeviceType>();
diff --git a/src/KOKKOS/angle_hybrid_kokkos.cpp b/src/KOKKOS/angle_hybrid_kokkos.cpp
index 06b2845545..cbdf7bd942 100644
--- a/src/KOKKOS/angle_hybrid_kokkos.cpp
+++ b/src/KOKKOS/angle_hybrid_kokkos.cpp
@@ -76,7 +76,7 @@ void AngleHybridKokkos::compute(int eflag, int vflag)
 
     Kokkos::parallel_for(nanglelist_orig,LAMMPS_LAMBDA(int i) {
       const int m = d_map[d_anglelist_orig(i,3)];
-      if (m >= 0) Kokkos::atomic_increment(&d_nanglelist[m]);
+      if (m >= 0) Kokkos::atomic_inc(&d_nanglelist[m]);
     });
 
     k_nanglelist.modify_device();
@@ -87,7 +87,7 @@ void AngleHybridKokkos::compute(int eflag, int vflag)
       if (h_nanglelist[m] > maxangle_all)
         maxangle_all = h_nanglelist[m] + EXTRA;
 
-    if (k_anglelist.d_view.extent(1) < maxangle_all)
+    if ((int)k_anglelist.d_view.extent(1) < maxangle_all)
       MemKK::realloc_kokkos(k_anglelist, "angle_hybrid:anglelist", nstyles, maxangle_all, 4);
     auto d_anglelist = k_anglelist.d_view;
 
diff --git a/src/KOKKOS/bond_harmonic_kokkos.cpp b/src/KOKKOS/bond_harmonic_kokkos.cpp
index 7e12400c9b..488b461bc2 100644
--- a/src/KOKKOS/bond_harmonic_kokkos.cpp
+++ b/src/KOKKOS/bond_harmonic_kokkos.cpp
@@ -67,14 +67,14 @@ void BondHarmonicKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   // reallocate per-atom arrays if necessary
 
   if (eflag_atom) {
-    if (k_eatom.extent(0) < maxeatom) {
+    if ((int)k_eatom.extent(0) < maxeatom) {
       memoryKK->destroy_kokkos(k_eatom,eatom);
       memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"improper:eatom");
       d_eatom = k_eatom.template view<KKDeviceType>();
     } else Kokkos::deep_copy(d_eatom,0.0);
   }
   if (vflag_atom) {
-    if (k_vatom.extent(0) < maxvatom) {
+    if ((int)k_vatom.extent(0) < maxvatom) {
       memoryKK->destroy_kokkos(k_vatom,vatom);
       memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"improper:vatom");
       d_vatom = k_vatom.template view<KKDeviceType>();
diff --git a/src/KOKKOS/bond_hybrid_kokkos.cpp b/src/KOKKOS/bond_hybrid_kokkos.cpp
index db247c7100..4fa3abff4f 100644
--- a/src/KOKKOS/bond_hybrid_kokkos.cpp
+++ b/src/KOKKOS/bond_hybrid_kokkos.cpp
@@ -76,7 +76,7 @@ void BondHybridKokkos::compute(int eflag, int vflag)
 
     Kokkos::parallel_for(nbondlist_orig,LAMMPS_LAMBDA(int i) {
       const int m = d_map[d_bondlist_orig(i,2)];
-      if (m >= 0) Kokkos::atomic_increment(&d_nbondlist[m]);
+      if (m >= 0) Kokkos::atomic_inc(&d_nbondlist[m]);
     });
 
     k_nbondlist.modify_device();
@@ -87,7 +87,7 @@ void BondHybridKokkos::compute(int eflag, int vflag)
       if (h_nbondlist[m] > maxbond_all)
         maxbond_all = h_nbondlist[m] + EXTRA;
 
-    if (k_bondlist.d_view.extent(1) < maxbond_all)
+    if ((int)k_bondlist.d_view.extent(1) < maxbond_all)
       MemKK::realloc_kokkos(k_bondlist, "bond_hybrid:bondlist", nstyles, maxbond_all, 3);
     auto d_bondlist = k_bondlist.d_view;
 
diff --git a/src/KOKKOS/comm_tiled_kokkos.cpp b/src/KOKKOS/comm_tiled_kokkos.cpp
index afddc079f4..7222ed4fb2 100644
--- a/src/KOKKOS/comm_tiled_kokkos.cpp
+++ b/src/KOKKOS/comm_tiled_kokkos.cpp
@@ -37,6 +37,8 @@ static constexpr int BUFEXTRA = 1000;
 CommTiledKokkos::CommTiledKokkos(LAMMPS *_lmp) : CommTiled(_lmp)
 {
   sendlist = nullptr;
+  maxsendlist = nullptr;
+  nprocmaxtot = 0;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -49,6 +51,8 @@ CommTiledKokkos::CommTiledKokkos(LAMMPS *_lmp) : CommTiled(_lmp)
 CommTiledKokkos::CommTiledKokkos(LAMMPS *_lmp, Comm *oldcomm) : CommTiled(_lmp,oldcomm)
 {
   sendlist = nullptr;
+  maxsendlist = nullptr;
+  nprocmaxtot = 0;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -56,7 +60,9 @@ CommTiledKokkos::CommTiledKokkos(LAMMPS *_lmp, Comm *oldcomm) : CommTiled(_lmp,o
 CommTiledKokkos::~CommTiledKokkos()
 {
   memoryKK->destroy_kokkos(k_sendlist,sendlist);
+  memory->destroy(maxsendlist);
   sendlist = nullptr;
+  maxsendlist = nullptr;
   buf_send = nullptr;
   buf_recv = nullptr;
 }
@@ -657,12 +663,11 @@ void CommTiledKokkos::grow_list(int iswap, int iwhich, int n)
   k_sendlist.sync<LMPHostType>();
   k_sendlist.modify<LMPHostType>();
 
-  if (size > (int)k_sendlist.extent(2)) {
-    memoryKK->grow_kokkos(k_sendlist,sendlist,maxswap,maxsend,size,"comm:sendlist");
+  memoryKK->grow_kokkos(k_sendlist,sendlist,maxswap,nprocmaxtot,size,"comm:sendlist");
 
-    for (int i = 0; i < maxswap; i++)
-      maxsendlist[iswap][iwhich] = size;
-  }
+  for (int i = 0; i < maxswap; i++)
+    for (int j = 0; j < nprocmaxtot; j++)
+      maxsendlist[i][j] = size;
 }
 
 /* ----------------------------------------------------------------------
@@ -692,24 +697,23 @@ void CommTiledKokkos::grow_swap_send(int i, int n, int /*nold*/)
   memory->destroy(sendbox_multiold[i]);
   memory->create(sendbox_multiold[i],n,atom->ntypes+1,6,"comm:sendbox_multiold");
 
-  delete [] maxsendlist[i];
-  maxsendlist[i] = new int[n];
-
-  for (int j = 0; j < n; j++)
-    maxsendlist[i][j] = BUFMIN;
-
-  if (sendlist && !k_sendlist.d_view.data()) {
-    for (int ii = 0; ii < maxswap; ii++) {
-      if (sendlist[ii]) {
-        for (int jj = 0; jj < nprocmax[ii]; jj++)
-          memory->destroy(sendlist[ii][jj]);
-        delete [] sendlist[ii];
-      }
-    }
+  if (sendlist && !k_sendlist.h_view.data()) {
     delete [] sendlist;
+    delete [] maxsendlist;
+
+    sendlist = nullptr;
+    maxsendlist = nullptr;
   } else {
     memoryKK->destroy_kokkos(k_sendlist,sendlist);
+    memory->destroy(maxsendlist);
   }
 
-  memoryKK->create_kokkos(k_sendlist,sendlist,maxswap,n,BUFMIN,"comm:sendlist");
+  nprocmaxtot = MAX(nprocmaxtot,n);
+
+  memoryKK->create_kokkos(k_sendlist,sendlist,maxswap,nprocmaxtot,BUFMIN,"comm:sendlist");
+  memory->create(maxsendlist,maxswap,nprocmaxtot,"comm:maxsendlist");
+
+  for (int i = 0; i < maxswap; i++)
+    for (int j = 0; j < nprocmaxtot; j++)
+      maxsendlist[i][j] = BUFMIN;
 }
diff --git a/src/KOKKOS/comm_tiled_kokkos.h b/src/KOKKOS/comm_tiled_kokkos.h
index ef226489c8..67036e0a2f 100644
--- a/src/KOKKOS/comm_tiled_kokkos.h
+++ b/src/KOKKOS/comm_tiled_kokkos.h
@@ -64,18 +64,17 @@ class CommTiledKokkos : public CommTiled {
   template<class DeviceType> void reverse_comm_device();
 
  protected:
+  int nprocmaxtot;
 
   DAT::tdual_int_3d k_sendlist;
-  //DAT::tdual_int_scalar k_total_send;
   DAT::tdual_xfloat_2d k_buf_send,k_buf_recv;
-  //DAT::tdual_int_scalar k_count;
 
-  void grow_send(int, int) override;
-  void grow_recv(int, int flag = 0) override;
+  void grow_send(int, int) override;             // reallocate send buffer
+  void grow_recv(int, int flag = 0) override;    // free/allocate recv buffer
   void grow_send_kokkos(int, int, ExecutionSpace space = Host);
   void grow_recv_kokkos(int, int, ExecutionSpace space = Host);
-  void grow_list(int, int, int) override;
-  void grow_swap_send(int, int, int) override;     // grow swap arrays for send and recv
+  void grow_list(int, int, int) override;        // reallocate sendlist for one swap/proc
+  void grow_swap_send(int, int, int) override;   // grow swap arrays for send and recv
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
new file mode 100644
index 0000000000..cfd7e5a582
--- /dev/null
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@@ -0,0 +1,327 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Drew Rohskopf (SNL)
+------------------------------------------------------------------------- */
+
+#include "compute_gaussian_grid_local_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "pair.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) :
+  ComputeGaussianGridLocal(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq[i][j]; //cutsq_tmp;
+      k_cutsq.template modify<LMPHostType>();
+    }
+  }
+  // Set up element lists
+  int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",n);
+  MemKK::realloc_kokkos(d_sigmaelem,"ComputeSNAGridKokkos::sigmaelem",n+1);
+  MemKK::realloc_kokkos(d_prefacelem,"ComputeSNAGridKokkos::prefacelem",n+1);
+  MemKK::realloc_kokkos(d_argfacelem,"ComputeSNAGridKokkos::argfacelem",n+1);
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem);
+  auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem);
+  auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // start from index 1 because of how compute sna/grid is
+  for (int i = 1; i <= atom->ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_sigmaelem(i-1) = sigmaelem[i];
+    h_prefacelem(i-1) = prefacelem[i];
+    h_argfacelem(i-1) = argfacelem[i];
+  }
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_sigmaelem,h_sigmaelem);
+  Kokkos::deep_copy(d_prefacelem, h_prefacelem);
+  Kokkos::deep_copy(d_argfacelem, h_argfacelem);
+  Kokkos::deep_copy(d_map,h_map);
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeGaussianGridLocalKokkos<DeviceType>::~ComputeGaussianGridLocalKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_alocal,alocal);
+  //gridlocal_allocated = 0;
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::setup()
+{
+
+  ComputeGridLocal::setup();
+
+  // allocate arrays
+  memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
+  array_local = alocal;
+  d_alocal = k_alocal.template view<DeviceType>();
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::init()
+{
+  ComputeGaussianGridLocal::init();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
+{
+  if (host_flag) {
+    return;
+  }
+
+  invoked_local = update->ntimestep;
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
+  // number of atoms.
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  // printf(">>> total_range: %d\n", total_range);
+  chunksize = 32768; // 100*32768
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+
+  int vector_length_default = 1;
+  int team_size_default = 1;
+  if (!host_flag)
+    team_size_default = 1; // cost will increase with increasing team size //32;//max_neighs;
+
+  if (triclinic){
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];
+    h5 = domain->h[5];
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+    //Neigh
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagComputeGaussianGridLocalNeigh>(chunk_size,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh> policy_neigh(chunk_size,team_size,vector_length);
+      Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+  } // end while
+
+  copymode = 0;
+
+  k_alocal.template modify<DeviceType>();
+  k_alocal.template sync<LMPHostType>();
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianGridLocalNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const
+{
+  const int ii = team.league_rank();
+
+  if (ii >= chunk_size) return;
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  //int igrid = iz * (nx * ny) + iy * nx + ix;
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  // multiply grid integers by grid spacing delx, dely, delz
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // Zeroing out the components, which are filled as a sum.
+  for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
+    d_alocal(igrid, icol) = 0.0;
+  }
+
+  // Fill grid info columns
+  d_alocal(igrid, 0) = ix;
+  d_alocal(igrid, 1) = iy;
+  d_alocal(igrid, 2) = iz;
+  d_alocal(igrid, 3) = xtmp;
+  d_alocal(igrid, 4) = ytmp;
+  d_alocal(igrid, 5) = ztmp;
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+  const int itype = 1;
+  int ielem = 0;
+  ielem = d_map[itype];
+  const double radi = d_radelem[ielem];
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+
+  // Looping over ntotal for now.
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    if (rsq < rnd_cutsq(jtype, jtype) ) {
+      int icol = size_local_cols_base + jtype - 1;
+      d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1));
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   check max team size
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<class TagStyle>
+void ComputeGaussianGridLocalKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+namespace LAMMPS_NS {
+template class ComputeGaussianGridLocalKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeGaussianGridLocalKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
new file mode 100644
index 0000000000..34e12bc4af
--- /dev/null
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
@@ -0,0 +1,96 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(gaussian/grid/local/kk,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
+ComputeStyle(gaussian/grid/local/kk/device,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
+ComputeStyle(gaussian/grid/local/kk/host,ComputeGaussianGridLocalKokkos<LMPHostType>);
+// clang-format on
+
+#else
+
+#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
+#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
+
+#include "compute_gaussian_grid_local.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+// clang-format off
+struct TagComputeGaussianGridLocalNeigh{};
+// clang-format on
+
+template <class DeviceType> class ComputeGaussianGridLocalKokkos : public ComputeGaussianGridLocal {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  // Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+#endif
+
+  ComputeGaussianGridLocalKokkos(class LAMMPS *, int, char **);
+  ~ComputeGaussianGridLocalKokkos() override;
+  void setup() override;
+  void init() override;
+  void compute_local() override;
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&, int);
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeGaussianGridLocalNeigh, const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const;
+
+ private:
+  Kokkos::View<double*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<double*, DeviceType> d_sigmaelem;
+  Kokkos::View<double*, DeviceType> d_prefacelem;
+  Kokkos::View<double*, DeviceType> d_argfacelem;
+  Kokkos::View<int*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<int*, DeviceType> d_map;                    // mapping from atom types to elements
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+
+  int max_neighs, inum, chunk_size, chunk_offset;
+  int host_flag;
+  int total_range; // total number of loop iterations in grid
+  int xlen, ylen, zlen;
+  int chunksize;
+  int ntotal;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+
+  DAT::tdual_float_2d k_alocal;
+  typename AT::t_float_2d d_alocal;
+
+  // triclinic vars
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp
new file mode 100644
index 0000000000..197234cf1d
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp
@@ -0,0 +1,25 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_kokkos.h"
+#include "compute_sna_grid_kokkos_impl.h"
+
+namespace LAMMPS_NS {
+
+template class ComputeSNAGridKokkosDevice<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridKokkosHost<LMPHostType>;
+#endif
+
+}
diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
new file mode 100644
index 0000000000..8a7d87acbb
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -0,0 +1,297 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice<LMPDeviceType>);
+#ifdef LMP_KOKKOS_GPU
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost<LMPHostType>);
+#else
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
+#endif
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
+
+#include "compute_sna_grid.h"
+#include "kokkos_type.h"
+#include "sna_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Routines for both the CPU and GPU backend
+
+// GPU backend only
+struct TagCSNAGridComputeNeigh{};
+struct TagCSNAGridComputeCayleyKlein{};
+struct TagCSNAGridPreUi{};
+struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
+struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
+struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+template <bool chemsnap> struct TagCSNAGridComputeZi{};
+template <bool chemsnap> struct TagCSNAGridComputeBi{};
+struct TagCSNAGridLocalFill{}; // fill the gridlocal array
+
+struct TagComputeSNAGridLoop{};
+struct TagComputeSNAGrid3D{};
+
+// CPU backend only
+struct TagComputeSNAGridLoopCPU{};
+
+//template<class DeviceType>
+template<class DeviceType, typename real_type_, int vector_length_>
+class ComputeSNAGridKokkos : public ComputeSNAGrid {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  static constexpr int vector_length = vector_length_;
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+
+  // Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+  static constexpr int tile_size_compute_ck = 2;
+  static constexpr int tile_size_pre_ui = 2;
+  static constexpr int team_size_compute_ui = 2;
+  static constexpr int tile_size_transform_ui = 2;
+  static constexpr int tile_size_compute_zi = 2;
+  static constexpr int min_blocks_compute_zi = 0; // no minimum bound
+  static constexpr int tile_size_compute_bi = 2;
+  static constexpr int tile_size_compute_yi = 2;
+  static constexpr int min_blocks_compute_yi = 0; // no minimum bound
+  static constexpr int team_size_compute_fused_deidrj = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+  static constexpr int tile_size_compute_ck = 4;
+  static constexpr int tile_size_pre_ui = 4;
+  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
+  static constexpr int tile_size_transform_ui = 4;
+  static constexpr int tile_size_compute_zi = 8;
+  static constexpr int tile_size_compute_bi = 4;
+  static constexpr int tile_size_compute_yi = 8;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+
+  // this empirically reduces perf fluctuations from compiler version to compiler version
+  static constexpr int min_blocks_compute_zi = 4;
+  static constexpr int min_blocks_compute_yi = 4;
+#endif
+
+  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
+  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
+  // and reduces the verbosity of the LaunchBound by hiding the explicit
+  // multiplication by vector_length
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
+
+  // MDRangePolicy for the 3D grid loop:
+  template <class Device, class TagComputeSNA>
+  using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
+
+  // Testing out team policies
+  template <class Device, int num_teams,  class TagComputeSNA>
+  using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+  //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNA>;
+  //using team_member = typename team_policy::member_type;
+
+  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
+  // This hides the LaunchBounds abstraction by hiding the explicit
+  // multiplication by vector length
+  template <class Device, int num_teams, class TagComputeSNA>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Helper routine that returns a CPU or a GPU policy as appropriate
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
+    return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
+                                                                 {vector_length, second_loop, chunk_size_div},
+                                                                 {vector_length, num_tiles, 1});
+  }
+
+  ComputeSNAGridKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridKokkos() override;
+
+  void setup() override;
+  void compute_array() override;
+
+  // Utility functions for teams
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&);
+
+  // operator function for example team policy
+  //KOKKOS_INLINE_FUNCTION
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLoop, const int& ) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLoopCPU, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalFill,const int& ii) const;
+
+ protected:
+
+  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
+
+  int max_neighs, chunk_size, chunk_offset;
+  int host_flag;
+  int ntotal;
+  int total_range; // total number of loop iterations in grid
+  int zlen; //= nzhi-nzlo+1;
+  int ylen; //= nyhi-nylo+1;
+  int xlen; //= nxhi-nxlo+1;
+
+  double cutsq_tmp; // temporary cutsq until we get a view
+
+  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
+  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
+  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
+  Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
+  Kokkos::View<real_type*, DeviceType> d_test;              // test view
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+  DAT::tdual_float_2d k_grid;
+  DAT::tdual_float_2d k_gridall;
+  typename AT::t_float_2d d_grid;
+  typename AT::t_float_2d d_gridall;
+
+  DAT::tdual_float_4d k_gridlocal;
+  typename AT::t_float_4d d_gridlocal;
+
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+  class DomainKokkos *domainKK;
+
+  // triclinic vars
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+
+  // Make SNAKokkos a friend
+  friend class SNAKokkos<DeviceType, real_type, vector_length>;
+};
+
+// These wrapper classes exist to make the compute style factory happy/avoid having
+// to extend the compute  style factory to support Compute classes w/an arbitrary number
+// of extra template parameters
+
+template <class DeviceType>
+class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
+
+ public:
+
+  ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
+
+  void compute_array() override;
+
+};
+
+#ifdef LMP_KOKKOS_GPU
+template <class DeviceType>
+class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
+
+ public:
+
+  ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
+
+  void compute_array() override;
+
+};
+#endif
+
+}
+
+#endif
+#endif
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
new file mode 100644
index 0000000000..665a1b67e7
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -0,0 +1,786 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
+                         Evan Weinberg (NVIDIA)
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_kokkos.h"
+#include "pair_snap_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "domain.h"
+#include "domain_kokkos.h"
+#include "sna.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include <iostream>
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+namespace LAMMPS_NS {
+
+// Constructor
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  domainKK = (DomainKokkos *) domain;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
+
+  cutsq_tmp = cutsq[1][1];
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
+      k_cutsq.template modify<LMPHostType>();
+    }
+  }
+
+   // Set up element lists
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
+  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
+  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
+  // test
+  MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements);
+
+  int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
+
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
+  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
+  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // test
+  auto h_test = Kokkos::create_mirror_view(d_test);
+  h_test(0) = 2.0;
+
+  // start from index 1 because of how compute sna/grid is
+  for (int i = 1; i <= atom->ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_wjelem(i-1) = wjelem[i];
+    if (switchinnerflag){
+      h_sinnerelem(i) = sinnerelem[i];
+      h_dinnerelem(i) = dinnerelem[i];
+    }
+  }
+
+  // In pair snap some things like `map` get allocated regardless of chem flag.
+  if (chemflag){
+    for (int i = 1; i <= atom->ntypes; i++) {
+      h_map(i) = map[i];
+    }
+  }
+
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_wjelem,h_wjelem);
+  if (switchinnerflag){
+    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
+    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
+  }
+  if (chemflag){
+    Kokkos::deep_copy(d_map,h_map);
+  }
+  Kokkos::deep_copy(d_test,h_test);
+
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
+  snaKK.grow_rij(0,0);
+  snaKK.init();
+}
+
+// Destructor
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_gridall, gridall);
+}
+
+// Setup
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
+{
+  // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
+  // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
+
+  ComputeGrid::set_grid_global();
+  ComputeGrid::set_grid_local();
+
+  // allocate arrays
+  memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
+
+  // do not use or allocate gridlocal for now
+
+  gridlocal_allocated = 0;
+  array = gridall;
+
+  d_gridlocal = k_gridlocal.template view<DeviceType>();
+  d_gridall = k_gridall.template view<DeviceType>();
+}
+
+// Compute
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
+{
+  if (host_flag) {
+    ComputeSNAGrid::compute_array();
+    return;
+  }
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
+  // number of atoms.
+
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+  snaKK.grow_rij(chunk_size, max_neighs);
+
+  // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
+  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+  if (triclinic) {
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];
+    h5 = domain->h[5];
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+
+    //ComputeNeigh
+    {
+      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
+
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
+        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    }
+
+    //ComputeCayleyKlein
+    {
+      // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
+        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
+    }
+
+    //PreUi
+    {
+      auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>(chunk_size_div, twojmax + 1);
+      Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
+    }
+
+    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
+    {
+      // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
+      // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+      const int tile_size = vector_length * (twojmax + 1);
+      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
+
+      if (chunk_size < parallel_thresh)
+      {
+        // Version with parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
+        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
+      } else {
+        // Version w/out parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal)
+        const int n_teams = chunk_size_div * max_neighs;
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+      }
+    }
+
+    //TransformUi: un-"fold" ulisttot, zero ylist
+    {
+      // Expand ulisttot_re,_im -> ulisttot
+      // Zero out ylist
+      auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>(chunk_size_div, snaKK.idxu_max);
+      Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
+    }
+
+    //Compute bispectrum
+    // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h`
+
+    //ComputeZi and Bi
+    if (nelements > 1) {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
+    } else {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
+    }
+
+    // Fill the grid array with bispectrum values
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
+      Kokkos::parallel_for(policy_fill, *this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+
+  } // end while
+
+  copymode = 0;
+
+  k_gridlocal.template modify<DeviceType>();
+  k_gridlocal.template sync<LMPHostType>();
+
+  k_gridall.template modify<DeviceType>();
+  k_gridall.template sync<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
+------------------------------------------------------------------------- */
+
+/*
+ Simple team policy functor seeing how many layers deep we can go with the parallelism.
+ */
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
+
+  // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
+  // Main difference is that we don't use the neighbor class or neighbor variables here.
+  // This is because the grid points are not atoms and therefore do not get assigned
+  // neighbors in LAMMPS.
+  // TODO: If we did make a neighborlist for each grid point, we could use current
+  //       routines and avoid having to loop over all atoms (which limits us to
+  //       natoms = max team size).
+
+  // basic quantities associated with this team:
+  // team_rank : rank of thread in this team
+  // league_rank : rank of team in this league
+  // team_size : number of threads in this team
+
+  // extract loop index
+  int ii = team.team_rank() + team.league_rank() * team.team_size();
+
+  if (ii >= chunk_size) return;
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  //const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  //const int team_rank = team.team_rank();
+  //const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  //int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  //int igrid = iz * (nx * ny) + iy * nx + ix;
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  // multiply grid integers by grid spacing delx, dely, delz
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+  //const double radi = d_radelem[ielem];
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+
+  // Looping over ntotal for now.
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    // don't include atoms that share location with grid point
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    }
+
+    if (jtype >= 0)
+      ninside++;
+  }
+
+  d_ninside(ii) = ninside;
+
+  // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
+  int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    //const int jtype = type_cache[j];
+    //if (jtype >= 0) {
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    int jtype = type(j);
+    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      snaKK.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        snaKK.element(ii,offset) = jelem;
+      else
+        snaKK.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+  Pre-compute the Cayley-Klein parameters for reuse in later routines
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_cayley_klein(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  Initialize the "ulisttot" structure with non-zero on-diagonal terms
+  and zero terms elsewhere
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  int itype = type(iatom);
+  int ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const {
+  if (iatom >= chunk_size) return;
+
+  int itype = type(iatom);
+  int ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+
+  const int itype = type(iatom);
+  const int ielem = d_map[itype];
+
+  for (int j = 0; j <= twojmax; j++)
+    snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - iatom_div * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+/* ----------------------------------------------------------------------
+  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
+  structure. Zero-initialize ylist. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (idxu >= snaKK.idxu_max) return;
+  snaKK.transform_ui(iatom, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const {
+  if (iatom >= chunk_size) return;
+  snaKK.transform_ui(iatom, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int idxu = 0; idxu < snaKK.idxu_max; idxu++)
+    snaKK.transform_ui(iatom, idxu);
+}
+
+/* ----------------------------------------------------------------------
+  Compute all elements of the Z tensor and store them into the `zlist`
+   view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjz >= snaKK.idxz_max) return;
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
+  if (iatom >= chunk_size) return;
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjz = 0; jjz < snaKK.idxz_max; jjz++)
+    snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+/* ----------------------------------------------------------------------
+  Compute the energy triple products and store in the "blist" view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjb >= snaKK.idxb_max) return;
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
+  if (iatom >= chunk_size) return;
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjb = 0; jjb < snaKK.idxb_max; jjb++)
+    snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  // int igrid = iz * (nx * ny) + iy * nx + ix;
+  // printf("ii igrid: %d %d\n", ii, igrid);
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+  d_gridall(igrid,0) = xtmp;
+  d_gridall(igrid,1) = ytmp;
+  d_gridall(igrid,2) = ztmp;
+
+  const auto idxb_max = snaKK.idxb_max;
+
+  // linear contributions
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    const auto idxb = icoeff % idxb_max;
+    const auto idx_chem = icoeff / idxb_max;
+    d_gridall(igrid,icoeff+3) = snaKK.blist(ii,idx_chem,idxb);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   utility functions
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<typename scratch_type>
+int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
+  typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
+
+  return ScratchViewType::shmem_size(values_per_team);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   routines used by template reference classes
+------------------------------------------------------------------------- */
+
+
+template<class DeviceType>
+ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
+{
+  Base::compute_array();
+}
+
+#ifdef LMP_KOKKOS_GPU
+template<class DeviceType>
+ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
+{
+  Base::compute_array();
+}
+#endif
+
+}
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
new file mode 100644
index 0000000000..3835a56bf8
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
@@ -0,0 +1,25 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_local_kokkos.h"
+#include "compute_sna_grid_local_kokkos_impl.h"
+
+namespace LAMMPS_NS {
+
+template class ComputeSNAGridLocalKokkosDevice<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridLocalKokkosHost<LMPHostType>;
+#endif
+
+}
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h
new file mode 100644
index 0000000000..2ffc050b2d
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@@ -0,0 +1,288 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid/local/kk,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
+ComputeStyle(sna/grid/local/kk/device,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
+#ifdef LMP_KOKKOS_GPU
+ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosHost<LMPHostType>);
+#else
+ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosDevice<LMPHostType>);
+#endif
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
+
+#include "compute_sna_grid_local.h"
+#include "kokkos_type.h"
+#include "sna_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Routines for both the CPU and GPU backend
+
+// GPU backend only
+struct TagCSNAGridLocalComputeNeigh{};
+struct TagCSNAGridLocalComputeCayleyKlein{};
+struct TagCSNAGridLocalPreUi{};
+struct TagCSNAGridLocalComputeUiSmall{}; // more parallelism, more divergence
+struct TagCSNAGridLocalComputeUiLarge{}; // less parallelism, no divergence
+struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+template <bool chemsnap> struct TagCSNAGridLocalComputeZi{};
+template <bool chemsnap> struct TagCSNAGridLocalComputeBi{};
+struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array
+
+struct TagComputeSNAGridLocalLoop{};
+struct TagComputeSNAGridLocal3D{};
+
+// CPU backend only
+struct TagComputeSNAGridLocalLoopCPU{};
+
+//template<class DeviceType>
+template<class DeviceType, typename real_type_, int vector_length_>
+class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  static constexpr int vector_length = vector_length_;
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+
+  // Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+  static constexpr int tile_size_compute_ck = 2;
+  static constexpr int tile_size_pre_ui = 2;
+  static constexpr int team_size_compute_ui = 2;
+  static constexpr int tile_size_transform_ui = 2;
+  static constexpr int tile_size_compute_zi = 2;
+  static constexpr int min_blocks_compute_zi = 0; // no minimum bound
+  static constexpr int tile_size_compute_bi = 2;
+  static constexpr int tile_size_compute_yi = 2;
+  static constexpr int min_blocks_compute_yi = 0; // no minimum bound
+  static constexpr int team_size_compute_fused_deidrj = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+  static constexpr int tile_size_compute_ck = 4;
+  static constexpr int tile_size_pre_ui = 4;
+  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
+  static constexpr int tile_size_transform_ui = 4;
+  static constexpr int tile_size_compute_zi = 8;
+  static constexpr int tile_size_compute_bi = 4;
+  static constexpr int tile_size_compute_yi = 8;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+
+  // this empirically reduces perf fluctuations from compiler version to compiler version
+  static constexpr int min_blocks_compute_zi = 4;
+  static constexpr int min_blocks_compute_yi = 4;
+#endif
+
+  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
+  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
+  // and reduces the verbosity of the LaunchBound by hiding the explicit
+  // multiplication by vector_length
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
+
+  // MDRangePolicy for the 3D grid loop:
+  template <class Device, class TagComputeSNA>
+  using CSNAGridLocal3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
+
+  // Testing out team policies
+  template <class Device, int num_teams,  class TagComputeSNA>
+  using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
+  // This hides the LaunchBounds abstraction by hiding the explicit
+  // multiplication by vector length
+  template <class Device, int num_teams, class TagComputeSNA>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Helper routine that returns a CPU or a GPU policy as appropriate
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
+    return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
+                                                                 {vector_length, second_loop, chunk_size_div},
+                                                                 {vector_length, num_tiles, 1});
+  }
+
+  ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridLocalKokkos() override;
+
+  void setup() override;
+  void compute_local() override;
+
+  // Utility functions for teams
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&);
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLocalLoop, const int& ) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLocalLoopCPU, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeNeigh>::member_type& team) const;
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeSNAGridLocal3D, const int& iz, const int& iy, const int& ix) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiLarge>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocal2Fill,const int& ii) const;
+
+ protected:
+
+  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
+
+  int max_neighs, chunk_size, chunk_offset;
+  int host_flag;
+  int ntotal;
+  int total_range; // total number of loop iterations in grid
+  int zlen; //= nzhi-nzlo+1;
+  int ylen; //= nyhi-nylo+1;
+  int xlen; //= nxhi-nxlo+1;
+
+  double cutsq_tmp; // temporary cutsq until we get a view
+
+  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
+  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
+  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
+  Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
+  Kokkos::View<real_type*, DeviceType> d_test;              // test view
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+
+  DAT::tdual_float_2d k_alocal;
+  typename AT::t_float_2d d_alocal;
+
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+  class DomainKokkos *domainKK;
+
+  // triclinic vars
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+
+  // Make SNAKokkos a friend
+  friend class SNAKokkos<DeviceType, real_type, vector_length>;
+};
+
+// These wrapper classes exist to make the compute style factory happy/avoid having
+// to extend the compute  style factory to support Compute classes w/an arbitrary number
+// of extra template parameters
+
+template <class DeviceType>
+class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
+
+ public:
+
+  ComputeSNAGridLocalKokkosDevice(class LAMMPS *, int, char **);
+
+  void compute_local() override;
+
+};
+
+#ifdef LMP_KOKKOS_GPU
+template <class DeviceType>
+class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
+
+ public:
+
+  ComputeSNAGridLocalKokkosHost(class LAMMPS *, int, char **);
+
+  void compute_local() override;
+
+};
+#endif
+
+}
+
+#endif
+#endif
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
new file mode 100644
index 0000000000..01bb2b427b
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -0,0 +1,783 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Andrew Rohskopf (SNL)
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_local_kokkos.h"
+#include "pair_snap_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "domain.h"
+#include "domain_kokkos.h"
+#include "sna.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include <iostream>
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+namespace LAMMPS_NS {
+
+// Constructor
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocal(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  domainKK = (DomainKokkos *) domain;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridLocalKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
+
+  cutsq_tmp = cutsq[1][1];
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
+      k_cutsq.template modify<LMPHostType>();
+    }
+  }
+
+   // Set up element lists
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridLocalKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridLocalKokkos:wjelem",nelements);
+  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridLocalKokkos:sinnerelem",nelements);
+  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridLocalKokkos:dinnerelem",nelements);
+  // test
+  MemKK::realloc_kokkos(d_test, "ComputeSNAGridLocalKokkos::test", nelements);
+
+  int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridLocalKokkos::map",n+1);
+
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
+  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
+  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // test
+  auto h_test = Kokkos::create_mirror_view(d_test);
+  h_test(0) = 2.0;
+
+  // start from index 1 because of how compute sna/grid is
+  for (int i = 1; i <= atom->ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_wjelem(i-1) = wjelem[i];
+    if (switchinnerflag){
+      h_sinnerelem(i) = sinnerelem[i];
+      h_dinnerelem(i) = dinnerelem[i];
+    }
+  }
+
+  // In pair snap some things like `map` get allocated regardless of chem flag.
+  if (chemflag){
+    for (int i = 1; i <= atom->ntypes; i++) {
+      h_map(i) = map[i];
+    }
+  }
+
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_wjelem,h_wjelem);
+  if (switchinnerflag){
+    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
+    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
+  }
+  if (chemflag){
+    Kokkos::deep_copy(d_map,h_map);
+  }
+  Kokkos::deep_copy(d_test,h_test);
+
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
+  snaKK.grow_rij(0,0);
+  snaKK.init();
+}
+
+// Destructor
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridLocalKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_alocal,alocal);
+}
+
+// Setup
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::setup()
+{
+
+  ComputeGridLocal::setup();
+
+  // allocate arrays
+  memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
+  array_local = alocal;
+  d_alocal = k_alocal.template view<DeviceType>();
+}
+
+// Compute
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_local()
+{
+  if (host_flag) {
+    ComputeSNAGridLocal::compute_array();
+    return;
+  }
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
+  // number of atoms.
+
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+  //snaKK.grow_rij(chunk_size, ntotal);
+  snaKK.grow_rij(chunk_size, max_neighs);
+
+  //chunk_size = total_range;
+
+  // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
+  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+  if (triclinic) {
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];
+    h5 = domain->h[5];
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+
+    //ComputeNeigh
+    {
+      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
+
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridLocalComputeNeigh>
+        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    }
+
+    //ComputeCayleyKlein
+    {
+      // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridLocalComputeCayleyKlein>
+        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
+    }
+
+    //PreUi
+    {
+      auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridLocalPreUi>(chunk_size_div, twojmax + 1);
+      Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
+    }
+
+    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
+    {
+      // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
+      // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+      const int tile_size = vector_length * (twojmax + 1);
+      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
+
+      if (chunk_size < parallel_thresh)
+      {
+        // Version with parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
+        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridLocalComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
+      } else {
+        // Version w/out parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal)
+        const int n_teams = chunk_size_div * max_neighs;
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridLocalComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+      }
+    }
+
+    //TransformUi: un-"fold" ulisttot, zero ylist
+    {
+      // Expand ulisttot_re,_im -> ulisttot
+      // Zero out ylist
+      auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridLocalTransformUi>(chunk_size_div, snaKK.idxu_max);
+      Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
+    }
+
+    //Compute bispectrum
+    // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h`
+
+    //ComputeZi and Bi
+    if (nelements > 1) {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
+    } else {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
+    }
+
+    // Fill the grid array with bispectrum values
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocal2Fill> policy_fill(0,chunk_size);
+      Kokkos::parallel_for(policy_fill, *this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+
+  } // end while
+
+  copymode = 0;
+
+  k_alocal.template modify<DeviceType>();
+  k_alocal.template sync<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
+------------------------------------------------------------------------- */
+
+/*
+ Simple team policy functor seeing how many layers deep we can go with the parallelism.
+ */
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeNeigh>::member_type& team) const {
+
+  // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
+  // Main difference is that we don't use the neighbor class or neighbor variables here.
+  // This is because the grid points are not atoms and therefore do not get assigned
+  // neighbors in LAMMPS.
+  // TODO: If we did make a neighborlist for each grid point, we could use current
+  //       routines and avoid having to loop over all atoms (which limits us to
+  //       natoms = max team size).
+
+  // basic quantities associated with this team:
+  // team_rank : rank of thread in this team
+  // league_rank : rank of team in this league
+  // team_size : number of threads in this team
+
+  // extract loop index
+  int ii = team.team_rank() + team.league_rank() * team.team_size();
+
+  if (ii >= chunk_size) return;
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  //const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  //const int team_rank = team.team_rank();
+  //const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  //int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  //int igrid = iz * (nx * ny) + iy * nx + ix;
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  // multiply grid integers by grid spacing delx, dely, delz
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // Zeroing out the components, which are filled as a sum.
+  for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
+    d_alocal(igrid, icol) = 0.0;
+  }
+
+  // Fill grid info columns
+  d_alocal(igrid, 0) = ix;
+  d_alocal(igrid, 1) = iy;
+  d_alocal(igrid, 2) = iz;
+  d_alocal(igrid, 3) = xtmp;
+  d_alocal(igrid, 4) = ytmp;
+  d_alocal(igrid, 5) = ztmp;
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+  //const double radi = d_radelem[ielem];
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+
+  // Looping over ntotal for now.
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    // don't include atoms that share location with grid point
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    }
+
+    if (jtype >= 0)
+      ninside++;
+  }
+
+  d_ninside(ii) = ninside;
+
+  // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
+  int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    //const int jtype = type_cache[j];
+    //if (jtype >= 0) {
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    int jtype = type(j);
+    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      snaKK.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        snaKK.element(ii,offset) = jelem;
+      else
+        snaKK.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+  Pre-compute the Cayley-Klein parameters for reuse in later routines
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_cayley_klein(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  Initialize the "ulisttot" structure with non-zero on-diagonal terms
+  and zero terms elsewhere
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  int itype = type(iatom);
+  int ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const {
+  if (iatom >= chunk_size) return;
+
+  int itype = type(iatom);
+  int ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+
+  const int itype = type(iatom);
+  const int ielem = d_map[itype];
+
+  for (int j = 0; j <= twojmax; j++)
+    snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiSmall>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiLarge>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - iatom_div * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+/* ----------------------------------------------------------------------
+  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
+  structure. Zero-initialize ylist. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (idxu >= snaKK.idxu_max) return;
+  snaKK.transform_ui(iatom, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const {
+  if (iatom >= chunk_size) return;
+  snaKK.transform_ui(iatom, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int idxu = 0; idxu < snaKK.idxu_max; idxu++)
+    snaKK.transform_ui(iatom, idxu);
+}
+
+/* ----------------------------------------------------------------------
+  Compute all elements of the Z tensor and store them into the `zlist`
+   view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjz >= snaKK.idxz_max) return;
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
+  if (iatom >= chunk_size) return;
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjz = 0; jjz < snaKK.idxz_max; jjz++)
+    snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+/* ----------------------------------------------------------------------
+  Compute the energy triple products and store in the "blist" view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjb >= snaKK.idxb_max) return;
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
+  if (iatom >= chunk_size) return;
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjb = 0; jjb < snaKK.idxb_max; jjb++)
+    snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocal2Fill, const int& ii) const {
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  // int igrid = iz * (nx * ny) + iy * nx + ix;
+  // printf("ii igrid: %d %d\n", ii, igrid);
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+
+  const auto idxb_max = snaKK.idxb_max;
+
+  // linear contributions
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    const auto idxb = icoeff % idxb_max;
+    const auto idx_chem = icoeff / idxb_max;
+    d_alocal(igrid,icoeff+6) = snaKK.blist(ii,idx_chem,idxb);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   utility functions
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<typename scratch_type>
+int ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
+  typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
+
+  return ScratchViewType::shmem_size(values_per_team);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   routines used by template reference classes
+------------------------------------------------------------------------- */
+
+
+template<class DeviceType>
+ComputeSNAGridLocalKokkosDevice<DeviceType>::ComputeSNAGridLocalKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridLocalKokkosDevice<DeviceType>::compute_local()
+{
+  Base::compute_local();
+}
+
+#ifdef LMP_KOKKOS_GPU
+template<class DeviceType>
+ComputeSNAGridLocalKokkosHost<DeviceType>::ComputeSNAGridLocalKokkosHost(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridLocalKokkosHost<DeviceType>::compute_local()
+{
+  Base::compute_local();
+}
+#endif
+
+}
diff --git a/src/KOKKOS/dihedral_harmonic_kokkos.cpp b/src/KOKKOS/dihedral_harmonic_kokkos.cpp
index 05babd69b4..8575cc1807 100644
--- a/src/KOKKOS/dihedral_harmonic_kokkos.cpp
+++ b/src/KOKKOS/dihedral_harmonic_kokkos.cpp
@@ -75,14 +75,14 @@ void DihedralHarmonicKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   // reallocate per-atom arrays if necessary
 
   if (eflag_atom) {
-    if(k_eatom.extent(0) < maxeatom) {
+    if ((int)k_eatom.extent(0) < maxeatom) {
     memoryKK->destroy_kokkos(k_eatom,eatom);
     memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"dihedral:eatom");
     d_eatom = k_eatom.view<DeviceType>();
     } else Kokkos::deep_copy(d_eatom,0.0);
   }
   if (vflag_atom) {
-    if(k_vatom.extent(0) < maxvatom) {
+    if ((int)k_vatom.extent(0) < maxvatom) {
     memoryKK->destroy_kokkos(k_vatom,vatom);
     memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"dihedral:vatom");
     d_vatom = k_vatom.view<DeviceType>();
diff --git a/src/KOKKOS/dihedral_hybrid_kokkos.cpp b/src/KOKKOS/dihedral_hybrid_kokkos.cpp
index 88dbeaf13b..60eb2dc0a7 100644
--- a/src/KOKKOS/dihedral_hybrid_kokkos.cpp
+++ b/src/KOKKOS/dihedral_hybrid_kokkos.cpp
@@ -76,7 +76,7 @@ void DihedralHybridKokkos::compute(int eflag, int vflag)
 
     Kokkos::parallel_for(ndihedrallist_orig,LAMMPS_LAMBDA(int i) {
       const int m = d_map[d_dihedrallist_orig(i,4)];
-      if (m >= 0) Kokkos::atomic_increment(&d_ndihedrallist[m]);
+      if (m >= 0) Kokkos::atomic_inc(&d_ndihedrallist[m]);
     });
 
     k_ndihedrallist.modify_device();
@@ -87,7 +87,7 @@ void DihedralHybridKokkos::compute(int eflag, int vflag)
       if (h_ndihedrallist[m] > maxdihedral_all)
         maxdihedral_all = h_ndihedrallist[m] + EXTRA;
 
-    if (k_dihedrallist.d_view.extent(1) < maxdihedral_all)
+    if ((int)k_dihedrallist.d_view.extent(1) < maxdihedral_all)
       MemKK::realloc_kokkos(k_dihedrallist, "dihedral_hybrid:dihedrallist", nstyles, maxdihedral_all, 5);
     auto d_dihedrallist = k_dihedrallist.d_view;
 
diff --git a/src/KOKKOS/fix_cmap_kokkos.cpp b/src/KOKKOS/fix_cmap_kokkos.cpp
index dd92afe9cc..b3149ba84d 100644
--- a/src/KOKKOS/fix_cmap_kokkos.cpp
+++ b/src/KOKKOS/fix_cmap_kokkos.cpp
@@ -690,7 +690,7 @@ int FixCMAPKokkos<DeviceType>::pack_exchange_kokkos(
 
   copymode = 1;
 
-  Kokkos::parallel_scan(nsend, KOKKOS_LAMBDA(const int &mysend, int &offset, const bool &final) {
+  Kokkos::parallel_scan(Kokkos::RangePolicy<DeviceType>(0,nsend), KOKKOS_LAMBDA(const int &mysend, int &offset, const bool &final) {
 
     const int i = d_exchange_sendlist(mysend);
 
@@ -782,7 +782,7 @@ void FixCMAPKokkos<DeviceType>::unpack_exchange_kokkos(
 
   copymode = 1;
 
-  Kokkos::parallel_for(nrecv, KOKKOS_LAMBDA(const int &i) {
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nrecv), KOKKOS_LAMBDA(const int &i) {
     int index = d_indices(i);
     if (index > -1) {
       int m = d_ubuf(d_buf(i)).i;
diff --git a/src/KOKKOS/fix_langevin_kokkos.cpp b/src/KOKKOS/fix_langevin_kokkos.cpp
index 546f204de6..c149ebda6a 100644
--- a/src/KOKKOS/fix_langevin_kokkos.cpp
+++ b/src/KOKKOS/fix_langevin_kokkos.cpp
@@ -39,7 +39,12 @@ enum { CONSTANT, EQUAL, ATOM };
 
 template<class DeviceType>
 FixLangevinKokkos<DeviceType>::FixLangevinKokkos(LAMMPS *lmp, int narg, char **arg) :
-  FixLangevin(lmp, narg, arg),rand_pool(seed + comm->me)
+  FixLangevin(lmp, narg, arg),
+#ifdef LMP_KOKKOS_DEBUG_RNG
+  rand_pool(seed + comm->me, lmp)
+#else
+  rand_pool(seed + comm->me)
+#endif
 {
   kokkosable = 1;
   fuse_integrate_flag = 1;
@@ -48,43 +53,42 @@ FixLangevinKokkos<DeviceType>::FixLangevinKokkos(LAMMPS *lmp, int narg, char **a
   int ntypes = atomKK->ntypes;
 
   // allocate per-type arrays for force prefactors
+  delete[] gfactor1;
+  delete[] gfactor2;
+  delete[] ratio;
   memoryKK->create_kokkos(k_gfactor1,gfactor1,ntypes+1,"langevin:gfactor1");
   memoryKK->create_kokkos(k_gfactor2,gfactor2,ntypes+1,"langevin:gfactor2");
   memoryKK->create_kokkos(k_ratio,ratio,ntypes+1,"langevin:ratio");
   d_gfactor1 = k_gfactor1.template view<DeviceType>();
-  h_gfactor1 = k_gfactor1.template view<LMPHostType>();
+  h_gfactor1 = k_gfactor1.h_view;
   d_gfactor2 = k_gfactor2.template view<DeviceType>();
-  h_gfactor2 = k_gfactor2.template view<LMPHostType>();
+  h_gfactor2 = k_gfactor2.h_view;
   d_ratio = k_ratio.template view<DeviceType>();
-  h_ratio = k_ratio.template view<LMPHostType>();
+  h_ratio = k_ratio.h_view;
 
   // optional args
   for (int i = 1; i <= ntypes; i++) ratio[i] = 1.0;
-  k_ratio.template modify<LMPHostType>();
+  k_ratio.modify_host();
 
   if (gjfflag) {
+    memory->destroy(franprev);
+    memory->destroy(lv);
     grow_arrays(atomKK->nmax);
-    atom->add_callback(Atom::GROW);
+
     // initialize franprev to zero
-    for (int i = 0; i < atomKK->nlocal; i++) {
-      franprev[i][0] = 0.0;
-      franprev[i][1] = 0.0;
-      franprev[i][2] = 0.0;
-      lv[i][0] = 0.0;
-      lv[i][1] = 0.0;
-      lv[i][2] = 0.0;
-    }
-    k_franprev.template modify<LMPHostType>();
-    k_lv.template modify<LMPHostType>();
+
+    Kokkos::deep_copy(d_franprev,0.0);
+    Kokkos::deep_copy(d_lv,0.0);
   }
+
   if (zeroflag) {
     k_fsumall = tdual_double_1d_3n("langevin:fsumall");
-    h_fsumall = k_fsumall.template view<LMPHostType>();
+    h_fsumall = k_fsumall.h_view;
     d_fsumall = k_fsumall.template view<DeviceType>();
   }
 
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read =  V_MASK | F_MASK | MASK_MASK | RMASS_MASK | TYPE_MASK;
+  datamask_read = V_MASK | F_MASK | MASK_MASK | RMASS_MASK | TYPE_MASK;
   datamask_modify = F_MASK;
 }
 
@@ -93,13 +97,21 @@ FixLangevinKokkos<DeviceType>::FixLangevinKokkos(LAMMPS *lmp, int narg, char **a
 template<class DeviceType>
 FixLangevinKokkos<DeviceType>::~FixLangevinKokkos()
 {
+  if (copymode) return;
+
   memoryKK->destroy_kokkos(k_gfactor1,gfactor1);
   memoryKK->destroy_kokkos(k_gfactor2,gfactor2);
   memoryKK->destroy_kokkos(k_ratio,ratio);
   memoryKK->destroy_kokkos(k_flangevin,flangevin);
-  if (gjfflag) memoryKK->destroy_kokkos(k_franprev,franprev);
-  if (gjfflag) memoryKK->destroy_kokkos(k_lv,lv);
+  if (gjfflag) {
+    memoryKK->destroy_kokkos(k_franprev,franprev);
+    memoryKK->destroy_kokkos(k_lv,lv);
+  }
   memoryKK->destroy_kokkos(k_tforce,tforce);
+
+#ifdef LMP_KOKKOS_DEBUG_RNG
+  rand_pool.destroy();
+#endif
 }
 
 /* ---------------------------------------------------------------------- */
@@ -118,8 +130,170 @@ void FixLangevinKokkos<DeviceType>::init()
     error->warning(FLERR,"Fix langevin gjf + kokkos is not implemented with random gaussians");
 
   // prefactors are modified in the init
-  k_gfactor1.template modify<LMPHostType>();
-  k_gfactor2.template modify<LMPHostType>();
+  k_gfactor1.modify_host();
+  k_gfactor2.modify_host();
+
+#ifdef LMP_KOKKOS_DEBUG_RNG
+  rand_pool.init(random,seed + comm->me);
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixLangevinKokkos<DeviceType>::setup(int vflag)
+{
+  if (gjfflag) {
+    double dt = update->dt;
+    double ftm2v = force->ftm2v;
+    auto v = atomKK->k_v.view<DeviceType>();
+    auto f = atomKK->k_f.view<DeviceType>();
+    auto mask = atomKK->k_mask.view<DeviceType>();
+    int nlocal = atom->nlocal;
+    auto rmass = atomKK->k_rmass.view<DeviceType>();
+    auto mass = atomKK->k_mass.view<DeviceType>();
+    auto type = atomKK->k_type.view<DeviceType>();
+    auto groupbit = this->groupbit;
+    auto gjfa = this->gjfa;
+    auto gjfsib = this->gjfsib;
+
+    if (atom->rmass) {
+      atomKK->sync(execution_space,V_MASK|F_MASK|MASK_MASK|RMASS_MASK);
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nlocal), KOKKOS_LAMBDA(const int &i) {
+        if (mask[i] & groupbit) {
+          const double dtfm = ftm2v * 0.5 * dt / rmass[i];
+          v(i,0) -= dtfm * f(i,0);
+          v(i,1) -= dtfm * f(i,1);
+          v(i,2) -= dtfm * f(i,2);
+        }
+      });
+
+      if (tbiasflag) {
+        // account for bias velocity
+        if (temperature->kokkosable) {
+          temperature->compute_scalar();
+          temperature->remove_bias_all_kk();
+        } else {
+          atomKK->sync(temperature->execution_space,temperature->datamask_read);
+          temperature->compute_scalar();
+          temperature->remove_bias_all();
+          atomKK->modified(temperature->execution_space,temperature->datamask_modify);
+          atomKK->sync(execution_space,temperature->datamask_modify);
+        }
+      }
+
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nlocal), KOKKOS_LAMBDA(const int &i) {
+        if (mask[i] & groupbit) {
+          v(i,0) /= gjfa * gjfsib * gjfsib;
+          v(i,1) /= gjfa * gjfsib * gjfsib;
+          v(i,2) /= gjfa * gjfsib * gjfsib;
+        }
+      });
+
+      if (tbiasflag) {
+        if (temperature->kokkosable) temperature->restore_bias_all();
+        else {
+          atomKK->sync(temperature->execution_space,temperature->datamask_read);
+          temperature->restore_bias_all();
+          atomKK->modified(temperature->execution_space,temperature->datamask_modify);
+          atomKK->sync(execution_space,temperature->datamask_modify);
+        }
+      }
+
+    } else {
+      atomKK->sync(execution_space,V_MASK|F_MASK|MASK_MASK|TYPE_MASK);
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nlocal), KOKKOS_LAMBDA(const int &i) {
+        if (mask[i] & groupbit) {
+          const double dtfm = ftm2v * 0.5 * dt / mass[type[i]];
+          v(i,0) -= dtfm * f(i,0);
+          v(i,1) -= dtfm * f(i,1);
+          v(i,2) -= dtfm * f(i,2);
+        }
+      });
+
+      if (tbiasflag) {
+        // account for bias velocity
+        if (temperature->kokkosable) {
+          temperature->compute_scalar();
+          temperature->remove_bias_all_kk();
+        } else {
+          atomKK->sync(temperature->execution_space,temperature->datamask_read);
+          temperature->compute_scalar();
+          temperature->remove_bias_all();
+          atomKK->modified(temperature->execution_space,temperature->datamask_modify);
+          atomKK->sync(execution_space,temperature->datamask_modify);
+        }
+      }
+
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nlocal), KOKKOS_LAMBDA(const int &i) {
+        if (mask[i] & groupbit) {
+          v(i,0) /= gjfa * gjfsib * gjfsib;
+          v(i,1) /= gjfa * gjfsib * gjfsib;
+          v(i,2) /= gjfa * gjfsib * gjfsib;
+        }
+      });
+
+      if (tbiasflag) {
+        if (temperature->kokkosable) temperature->restore_bias_all();
+        else {
+          atomKK->sync(temperature->execution_space,temperature->datamask_read);
+          temperature->restore_bias_all();
+          atomKK->modified(temperature->execution_space,temperature->datamask_modify);
+          atomKK->sync(execution_space,temperature->datamask_modify);
+        }
+      }
+
+    }
+    atomKK->modified(execution_space,V_MASK);
+  }
+
+  post_force(vflag);
+
+  if (gjfflag) {
+    double dt = update->dt;
+    double ftm2v = force->ftm2v;
+    auto f = atomKK->k_f.view<DeviceType>();
+    auto v = atomKK->k_v.view<DeviceType>();
+    auto mask = atomKK->k_mask.view<DeviceType>();
+    int nlocal = atom->nlocal;
+    auto rmass = atomKK->k_rmass.view<DeviceType>();
+    auto mass = atomKK->k_mass.view<DeviceType>();
+    auto type = atomKK->k_type.view<DeviceType>();
+    auto groupbit = this->groupbit;
+
+    k_lv.template sync<DeviceType>();
+    auto l_lv = d_lv;
+
+    if (atom->rmass) {
+      atomKK->sync(execution_space,V_MASK|F_MASK|MASK_MASK|RMASS_MASK);
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nlocal), KOKKOS_LAMBDA(const int &i) {
+        if (mask[i] & groupbit) {
+          const double dtfm = ftm2v * 0.5 * dt / rmass[i];
+          v(i,0) += dtfm * f(i,0);
+          v(i,1) += dtfm * f(i,1);
+          v(i,2) += dtfm * f(i,2);
+          l_lv(i,0) = v(i,0);
+          l_lv(i,1) = v(i,1);
+          l_lv(i,2) = v(i,2);
+        }
+      });
+    } else {
+      atomKK->sync(execution_space,V_MASK|F_MASK|MASK_MASK|TYPE_MASK);
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nlocal), KOKKOS_LAMBDA(const int &i) {
+        if (mask[i] & groupbit) {
+          const double dtfm = ftm2v * 0.5 * dt / mass[type[i]];
+          v(i,0) += dtfm * f(i,0);
+          v(i,1) += dtfm * f(i,1);
+          v(i,2) += dtfm * f(i,2);
+          l_lv(i,0) = v(i,0);
+          l_lv(i,1) = v(i,1);
+          l_lv(i,2) = v(i,2);
+        }
+      });
+    }
+    atomKK->modified(execution_space,V_MASK);
+    k_lv.template modify<DeviceType>();
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -129,10 +303,10 @@ void FixLangevinKokkos<DeviceType>::grow_arrays(int nmax)
 {
   memoryKK->grow_kokkos(k_franprev,franprev,nmax,3,"langevin:franprev");
   d_franprev = k_franprev.template view<DeviceType>();
-  h_franprev = k_franprev.template view<LMPHostType>();
+  h_franprev = k_franprev.h_view;
   memoryKK->grow_kokkos(k_lv,lv,nmax,3,"langevin:lv");
   d_lv = k_lv.template view<DeviceType>();
-  h_lv = k_lv.template view<LMPHostType>();
+  h_lv = k_lv.h_view;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -141,7 +315,6 @@ template<class DeviceType>
 void FixLangevinKokkos<DeviceType>::initial_integrate(int /*vflag*/)
 {
   atomKK->sync(execution_space,datamask_read);
-  atomKK->modified(execution_space,datamask_modify);
 
   v = atomKK->k_v.view<DeviceType>();
   f = atomKK->k_f.view<DeviceType>();
@@ -150,6 +323,8 @@ void FixLangevinKokkos<DeviceType>::initial_integrate(int /*vflag*/)
 
   FixLangevinKokkosInitialIntegrateFunctor<DeviceType> functor(this);
   Kokkos::parallel_for(nlocal,functor);
+
+  atomKK->modified(execution_space,datamask_modify);
 }
 
 template<class DeviceType>
@@ -184,6 +359,7 @@ void FixLangevinKokkos<DeviceType>::post_force(int /*vflag*/)
   rmass = atomKK->k_rmass.view<DeviceType>();
   f = atomKK->k_f.template view<DeviceType>();
   v = atomKK->k_v.template view<DeviceType>();
+  mass = atomKK->k_mass.template view<DeviceType>();
   type = atomKK->k_type.template view<DeviceType>();
   mask = atomKK->k_mask.template view<DeviceType>();
 
@@ -197,7 +373,8 @@ void FixLangevinKokkos<DeviceType>::post_force(int /*vflag*/)
   dt = update->dt;
   mvv2e = force->mvv2e;
   ftm2v = force->ftm2v;
-  fran_prop_const = sqrt(24.0*boltz/t_period/dt/mvv2e);
+  fran_prop_const = sqrt(2.0*boltz/t_period/dt/mvv2e);
+  fran_prop_const_gjf = sqrt(24.0*boltz/t_period/dt/mvv2e);
 
   compute_target(); // modifies tforce vector, hence sync here
   k_tforce.template sync<DeviceType>();
@@ -220,7 +397,7 @@ void FixLangevinKokkos<DeviceType>::post_force(int /*vflag*/)
       maxatom1 = atomKK->nmax;
       memoryKK->create_kokkos(k_flangevin,flangevin,maxatom1,3,"langevin:flangevin");
       d_flangevin = k_flangevin.template view<DeviceType>();
-      h_flangevin = k_flangevin.template view<LMPHostType>();
+      h_flangevin = k_flangevin.h_view;
     }
   }
 
@@ -550,7 +727,7 @@ void FixLangevinKokkos<DeviceType>::post_force(int /*vflag*/)
     h_fsumall(0) = fsumall[0]/count;
     h_fsumall(1) = fsumall[1]/count;
     h_fsumall(2) = fsumall[2]/count;
-    k_fsumall.template modify<LMPHostType>();
+    k_fsumall.modify_host();
     k_fsumall.template sync<DeviceType>();
     // set total force zero in parallel on the device
     FixLangevinKokkosZeroForceFunctor<DeviceType> zero_functor(this);
@@ -581,20 +758,30 @@ FSUM FixLangevinKokkos<DeviceType>::post_force_item(int i) const
 
   if (mask[i] & groupbit) {
     rand_type rand_gen = rand_pool.get_state();
+
     if (Tp_TSTYLEATOM) tsqrt_t = sqrt(d_tforce[i]);
     if (Tp_RMASS) {
       gamma1 = -rmass[i] / t_period / ftm2v;
-      gamma2 = sqrt(rmass[i]) * fran_prop_const / ftm2v;
-      gamma1 *= 1.0/d_ratio[type[i]];
+      if (Tp_GJF)
+        gamma2 = sqrt(rmass[i]) * fran_prop_const_gjf / ftm2v;
+      else
+        gamma2 = sqrt(rmass[i]) * fran_prop_const / ftm2v;
+      gamma1 *= 1.0/ratio[type[i]];
       gamma2 *= 1.0/sqrt(d_ratio[type[i]]) * tsqrt_t;
     } else {
       gamma1 = d_gfactor1[type[i]];
       gamma2 = d_gfactor2[type[i]] * tsqrt_t;
     }
 
-    fran[0] = gamma2 * (rand_gen.drand() - 0.5); //(random->uniform()-0.5);
-    fran[1] = gamma2 * (rand_gen.drand() - 0.5); //(random->uniform()-0.5);
-    fran[2] = gamma2 * (rand_gen.drand() - 0.5); //(random->uniform()-0.5);
+    if (Tp_GJF) {
+      fran[0] = gamma2 * rand_gen.normal(); //random->gaussian()
+      fran[1] = gamma2 * rand_gen.normal(); //random->gaussian()
+      fran[2] = gamma2 * rand_gen.normal(); //random->gaussian()
+    } else {
+      fran[0] = gamma2 * (rand_gen.drand() - 0.5); //(random->uniform()-0.5);
+      fran[1] = gamma2 * (rand_gen.drand() - 0.5); //(random->uniform()-0.5);
+      fran[2] = gamma2 * (rand_gen.drand() - 0.5); //(random->uniform()-0.5);
+    }
 
     if (Tp_BIAS) {
       fdrag[0] = gamma1*v(i,0);
@@ -678,7 +865,6 @@ void FixLangevinKokkos<DeviceType>::zero_force_item(int i) const
     f(i,1) -= d_fsumall[1];
     f(i,2) -= d_fsumall[2];
   }
-
 }
 
 /* ----------------------------------------------------------------------
@@ -740,7 +926,7 @@ void FixLangevinKokkos<DeviceType>::reset_dt()
         force->ftm2v;
       h_gfactor2[i] *= 1.0/sqrt(h_ratio[i]);
     }
-    k_gfactor2.template modify<LMPHostType>();
+    k_gfactor2.modify_host();
   }
 
 }
@@ -781,9 +967,15 @@ KOKKOS_INLINE_FUNCTION
 double FixLangevinKokkos<DeviceType>::compute_energy_item(int i) const
 {
   double my_energy = 0.0;
-  if (mask[i] & groupbit)
-    my_energy = d_flangevin(i,0)*v(i,0) + d_flangevin(i,1)*v(i,1) +
-      d_flangevin(i,2)*v(i,2);
+  if (mask[i] & groupbit) {
+    if (gjfflag) {
+      my_energy = d_flangevin(i,0)*d_lv(i,0) + d_flangevin(i,1)*d_lv(i,1) +
+        d_flangevin(i,2)*d_lv(i,2);
+    } else {
+      my_energy = d_flangevin(i,0)*v(i,0) + d_flangevin(i,1)*v(i,1) +
+        d_flangevin(i,2)*v(i,2);
+    }
+  }
   return my_energy;
 }
 
@@ -796,30 +988,42 @@ void FixLangevinKokkos<DeviceType>::end_of_step()
 {
   if (!tallyflag && !gjfflag) return;
 
+  dt = update->dt;
+  ftm2v = force->ftm2v;
   v = atomKK->k_v.template view<DeviceType>();
-  f = atomKK->k_f.template view<DeviceType>();
+  rmass = atomKK->k_rmass.template view<DeviceType>();
+  mass = atomKK->k_mass.template view<DeviceType>();
   mask = atomKK->k_mask.template view<DeviceType>();
-
-  atomKK->sync(execution_space,V_MASK | MASK_MASK);
   int nlocal = atomKK->nlocal;
 
   energy_onestep = 0.0;
 
+  atomKK->sync(execution_space,V_MASK | MASK_MASK);
+  if (gjfflag) k_lv.template sync<DeviceType>();
   k_flangevin.template sync<DeviceType>();
-  FixLangevinKokkosTallyEnergyFunctor<DeviceType> tally_functor(this);
-  Kokkos::parallel_reduce(nlocal,tally_functor,energy_onestep);
+
+  if (tallyflag) {
+    FixLangevinKokkosTallyEnergyFunctor<DeviceType> tally_functor(this);
+    Kokkos::parallel_reduce(nlocal,tally_functor,energy_onestep);
+  }
 
   if (gjfflag) {
     if (rmass.data()) {
+      atomKK->sync(execution_space,RMASS_MASK);
       FixLangevinKokkosEndOfStepFunctor<DeviceType,1> functor(this);
       Kokkos::parallel_for(nlocal,functor);
     } else {
+      atomKK->sync(execution_space,TYPE_MASK);
+      type = atomKK->k_type.template view<DeviceType>();
       mass = atomKK->k_mass.view<DeviceType>();
       FixLangevinKokkosEndOfStepFunctor<DeviceType,0> functor(this);
       Kokkos::parallel_for(nlocal,functor);
     }
   }
 
+  atomKK->modified(execution_space,V_MASK);
+  k_lv.template modify<DeviceType>();
+
   energy += energy_onestep*update->dt;
 }
 
@@ -828,7 +1032,7 @@ KOKKOS_INLINE_FUNCTION
 void FixLangevinKokkos<DeviceType>::end_of_step_item(int i) const {
   double tmp[3];
   if (mask[i] & groupbit) {
-    const double dtfm = force->ftm2v * 0.5 * dt / mass[type[i]];
+    const double dtfm = ftm2v * 0.5 * dt / mass[type[i]];
     tmp[0] = v(i,0);
     tmp[1] = v(i,1);
     tmp[2] = v(i,2);
@@ -841,10 +1045,10 @@ void FixLangevinKokkos<DeviceType>::end_of_step_item(int i) const {
                 dtfm * 0.5 * (gjfsib * d_flangevin(i,0) - d_franprev(i,0)) +
                 (gjfsib * gjfa * 0.5 + dt * 0.25 / t_period / gjfsib) * d_lv(i,0);
       v(i,1) = 0.5 * gjfsib * gjfsib * (v(i,1) + dtfm * f(i,1) / gjfa) +
-                dtfm * 0.5 * (gjfsib * d_flangevin(i,0) - d_franprev(i,1)) +
+                dtfm * 0.5 * (gjfsib * d_flangevin(i,1) - d_franprev(i,1)) +
                 (gjfsib * gjfa * 0.5 + dt * 0.25 / t_period / gjfsib) * d_lv(i,1);
       v(i,2) = 0.5 * gjfsib * gjfsib * (v(i,2) + dtfm * f(i,2) / gjfa) +
-                dtfm * 0.5 * (gjfsib * d_flangevin(i,0) - d_franprev(i,2)) +
+                dtfm * 0.5 * (gjfsib * d_flangevin(i,2) - d_franprev(i,2)) +
                 (gjfsib * gjfa * 0.5 + dt * 0.25 / t_period / gjfsib) * d_lv(i,2);
     }
     d_lv(i,0) = tmp[0];
@@ -859,7 +1063,7 @@ void FixLangevinKokkos<DeviceType>::end_of_step_rmass_item(int i) const
 {
   double tmp[3];
   if (mask[i] & groupbit) {
-    const double dtfm = force->ftm2v * 0.5 * dt / rmass[i];
+    const double dtfm = ftm2v * 0.5 * dt / rmass[i];
     tmp[0] = v(i,0);
     tmp[1] = v(i,1);
     tmp[2] = v(i,2);
@@ -891,6 +1095,9 @@ void FixLangevinKokkos<DeviceType>::end_of_step_rmass_item(int i) const
 template<class DeviceType>
 void FixLangevinKokkos<DeviceType>::copy_arrays(int i, int j, int /*delflag*/)
 {
+  k_franprev.sync_host();
+  k_lv.sync_host();
+
   h_franprev(j,0) = h_franprev(i,0);
   h_franprev(j,1) = h_franprev(i,1);
   h_franprev(j,2) = h_franprev(i,2);
@@ -898,8 +1105,8 @@ void FixLangevinKokkos<DeviceType>::copy_arrays(int i, int j, int /*delflag*/)
   h_lv(j,1) = h_lv(i,1);
   h_lv(j,2) = h_lv(i,2);
 
-  k_franprev.template modify<LMPHostType>();
-  k_lv.template modify<LMPHostType>();
+  k_franprev.modify_host();
+  k_lv.modify_host();
 
 }
 
@@ -924,24 +1131,6 @@ void FixLangevinKokkos<DeviceType>::sort_kokkos(Kokkos::BinSort<KeyViewType, Bin
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-void FixLangevinKokkos<DeviceType>::cleanup_copy()
-{
-  random = nullptr;
-  tstr = nullptr;
-  gfactor1 = nullptr;
-  gfactor2 = nullptr;
-  ratio = nullptr;
-  id_temp = nullptr;
-  flangevin = nullptr;
-  tforce = nullptr;
-  gjfflag = 0;
-  franprev = nullptr;
-  lv = nullptr;
-  id = style = nullptr;
-  vatom = nullptr;
-}
-
 namespace LAMMPS_NS {
 template class FixLangevinKokkos<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
diff --git a/src/KOKKOS/fix_langevin_kokkos.h b/src/KOKKOS/fix_langevin_kokkos.h
index fc25a0a748..c674060bf2 100644
--- a/src/KOKKOS/fix_langevin_kokkos.h
+++ b/src/KOKKOS/fix_langevin_kokkos.h
@@ -27,6 +27,7 @@ FixStyle(langevin/kk/host,FixLangevinKokkos<LMPHostType>);
 #include "kokkos_type.h"
 #include "kokkos_base.h"
 #include "Kokkos_Random.hpp"
+#include "rand_pool_wrap_kokkos.h"
 
 namespace LAMMPS_NS {
 
@@ -66,8 +67,8 @@ namespace LAMMPS_NS {
     FixLangevinKokkos(class LAMMPS *, int, char **);
     ~FixLangevinKokkos() override;
 
-    void cleanup_copy();
     void init() override;
+    void setup(int) override;
     void initial_integrate(int) override;
     void fused_integrate(int) override;
     void post_force(int) override;
@@ -135,13 +136,21 @@ namespace LAMMPS_NS {
     typename tdual_double_1d_3n::t_dev d_fsumall;
     typename tdual_double_1d_3n::t_host h_fsumall;
 
-    double boltz,dt,mvv2e,ftm2v,fran_prop_const;
+    double boltz,dt,mvv2e,ftm2v,fran_prop_const,fran_prop_const_gjf;
 
     void compute_target();
 
+#ifndef LMP_KOKKOS_DEBUG_RNG
     Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
     typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
 
+    //Kokkos::Random_XorShift1024_Pool<DeviceType> rand_pool;
+    //typedef typename Kokkos::Random_XorShift1024_Pool<DeviceType>::generator_type rand_type;
+#else
+    RandPoolWrap rand_pool;
+    typedef RandWrap rand_type;
+#endif
+
   };
 
   template <class DeviceType>
@@ -150,7 +159,7 @@ namespace LAMMPS_NS {
     FixLangevinKokkos<DeviceType> c;
 
   FixLangevinKokkosInitialIntegrateFunctor(FixLangevinKokkos<DeviceType>* c_ptr):
-    c(*c_ptr) {c.cleanup_copy();};
+    c(*c_ptr) {c.set_copymode(1);};
 
     KOKKOS_INLINE_FUNCTION
     void operator()(const int i) const {
@@ -168,7 +177,7 @@ namespace LAMMPS_NS {
 
     FixLangevinKokkosPostForceFunctor(FixLangevinKokkos<DeviceType>* c_ptr):
       c(*c_ptr) {}
-      ~FixLangevinKokkosPostForceFunctor() {c.cleanup_copy();}
+      ~FixLangevinKokkosPostForceFunctor() {c.set_copymode(1);}
 
       KOKKOS_INLINE_FUNCTION
       void operator()(const int i) const {
@@ -204,7 +213,7 @@ namespace LAMMPS_NS {
       FixLangevinKokkos<DeviceType> c;
 
     FixLangevinKokkosZeroForceFunctor(FixLangevinKokkos<DeviceType>* c_ptr):
-      c(*c_ptr) {c.cleanup_copy();}
+      c(*c_ptr) {c.set_copymode(1);}
 
       KOKKOS_INLINE_FUNCTION
       void operator()(const int i) const {
@@ -218,7 +227,7 @@ namespace LAMMPS_NS {
       FixLangevinKokkos<DeviceType> c;
       typedef double value_type;
     FixLangevinKokkosTallyEnergyFunctor(FixLangevinKokkos<DeviceType>* c_ptr):
-      c(*c_ptr) {c.cleanup_copy();}
+      c(*c_ptr) {c.set_copymode(1);}
 
       KOKKOS_INLINE_FUNCTION
       void operator()(const int i, value_type &energy) const {
@@ -241,7 +250,7 @@ namespace LAMMPS_NS {
     FixLangevinKokkos<DeviceType> c;
 
     FixLangevinKokkosEndOfStepFunctor(FixLangevinKokkos<DeviceType>* c_ptr):
-      c(*c_ptr) {c.cleanup_copy();}
+      c(*c_ptr) {c.set_copymode(1);}
 
     KOKKOS_INLINE_FUNCTION
     void operator()(const int i) const {
diff --git a/src/KOKKOS/fix_momentum_kokkos.cpp b/src/KOKKOS/fix_momentum_kokkos.cpp
index b41a3530cb..a363e2b1e7 100644
--- a/src/KOKKOS/fix_momentum_kokkos.cpp
+++ b/src/KOKKOS/fix_momentum_kokkos.cpp
@@ -36,7 +36,7 @@ FixMomentumKokkos<DeviceType>::FixMomentumKokkos(LAMMPS *lmp, int narg, char **a
 {
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
-  groupKK = (GroupKokkos<DeviceType> *)group;
+  groupKK = (GroupKokkos *)group;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = EMPTY_MASK;
   datamask_modify = EMPTY_MASK;
@@ -94,7 +94,7 @@ void FixMomentumKokkos<DeviceType>::end_of_step()
   double ekin_old,ekin_new;
   ekin_old = ekin_new = 0.0;
 
-  if (dynamic) masstotal = groupKK->mass(igroup);
+  if (dynamic) masstotal = groupKK->mass_kk<DeviceType>(igroup);
 
   // do nothing if group is empty, i.e. mass is zero;
 
@@ -109,7 +109,7 @@ void FixMomentumKokkos<DeviceType>::end_of_step()
   auto groupbit2 = groupbit;
   if (linear) {
     double vcm[3];
-    groupKK->vcm(igroup,masstotal,vcm);
+    groupKK->vcm_kk<DeviceType>(igroup,masstotal,vcm);
 
     // adjust velocities by vcm to zero linear momentum
     // only adjust a component if flag is set
@@ -131,9 +131,9 @@ void FixMomentumKokkos<DeviceType>::end_of_step()
 
   if (angular) {
     double xcm[3],angmom[3],omega[3],inertia[3][3];
-    groupKK->xcm(igroup,masstotal,xcm);
-    groupKK->angmom(igroup,xcm,angmom);
-    groupKK->inertia(igroup,xcm,inertia);
+    groupKK->xcm_kk<DeviceType>(igroup,masstotal,xcm);
+    groupKK->angmom_kk<DeviceType>(igroup,xcm,angmom);
+    groupKK->inertia_kk<DeviceType>(igroup,xcm,inertia);
     group->omega(angmom,inertia,omega);
 
     // adjust velocities to zero omega
diff --git a/src/KOKKOS/fix_momentum_kokkos.h b/src/KOKKOS/fix_momentum_kokkos.h
index 0ab91c423d..5ea474a069 100644
--- a/src/KOKKOS/fix_momentum_kokkos.h
+++ b/src/KOKKOS/fix_momentum_kokkos.h
@@ -38,7 +38,7 @@ class FixMomentumKokkos : public FixMomentum {
   FixMomentumKokkos(class LAMMPS *, int, char **);
   void end_of_step() override;
  private:
-    GroupKokkos<DeviceType> *groupKK;
+    GroupKokkos *groupKK;
 };
 
 }
diff --git a/src/KOKKOS/fix_nve_limit_kokkos.cpp b/src/KOKKOS/fix_nve_limit_kokkos.cpp
index de77427e49..9cc8fb22b5 100644
--- a/src/KOKKOS/fix_nve_limit_kokkos.cpp
+++ b/src/KOKKOS/fix_nve_limit_kokkos.cpp
@@ -66,7 +66,7 @@ void FixNVELimitKokkos<DeviceType>::initial_integrate(int /*vflag*/)
     auto d_type = atomKK->k_type.template view<DeviceType>();
     atomKK->sync(execution_space, X_MASK|V_MASK|F_MASK|MASK_MASK|RMASS_MASK );
 
-    Kokkos::parallel_reduce(nlocal, KOKKOS_LAMBDA(const int i, int &l_ncount) {
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,nlocal), KOKKOS_LAMBDA(const int i, int &l_ncount) {
       if (d_mask[i] & l_groupbit) {
         const double dtfm = l_dtf / d_rmass[i];
         d_v(i,0) += dtfm * d_f(i,0);
@@ -95,7 +95,7 @@ void FixNVELimitKokkos<DeviceType>::initial_integrate(int /*vflag*/)
     auto l_groupbit = groupbit;
     atomKK->sync(execution_space, X_MASK|V_MASK|F_MASK|MASK_MASK|TYPE_MASK );
 
-    Kokkos::parallel_reduce(nlocal, KOKKOS_LAMBDA(const int i, int &l_ncount) {
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,nlocal), KOKKOS_LAMBDA(const int i, int &l_ncount) {
       if (d_mask[i] & l_groupbit) {
         const double dtfm = l_dtf / d_mass[d_type[i]];
         d_v(i,0) += dtfm * d_f(i,0);
@@ -144,7 +144,7 @@ void FixNVELimitKokkos<DeviceType>::final_integrate()
     auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
     atomKK->sync(execution_space, V_MASK|F_MASK|MASK_MASK|RMASS_MASK );
 
-    Kokkos::parallel_reduce(nlocal, KOKKOS_LAMBDA(const int i, int &l_ncount) {
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,nlocal), KOKKOS_LAMBDA(const int i, int &l_ncount) {
       if (d_mask[i] & l_groupbit) {
         const double dtfm = l_dtf / d_rmass[i];
         d_v(i,0) += dtfm * d_f(i,0);
@@ -168,7 +168,7 @@ void FixNVELimitKokkos<DeviceType>::final_integrate()
     auto d_type = atomKK->k_type.template view<DeviceType>();
     atomKK->sync(execution_space, V_MASK|F_MASK|MASK_MASK|TYPE_MASK );
 
-    Kokkos::parallel_reduce(nlocal, KOKKOS_LAMBDA(const int i, int &l_ncount) {
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,nlocal), KOKKOS_LAMBDA(const int i, int &l_ncount) {
       if (d_mask[i] & l_groupbit) {
         const double dtfm = l_dtf / d_mass[d_type[i]];
         d_v(i,0) += dtfm * d_f(i,0);
diff --git a/src/KOKKOS/fix_recenter_kokkos.cpp b/src/KOKKOS/fix_recenter_kokkos.cpp
index 607f5ce8d9..3f67e0f65d 100644
--- a/src/KOKKOS/fix_recenter_kokkos.cpp
+++ b/src/KOKKOS/fix_recenter_kokkos.cpp
@@ -38,7 +38,7 @@ FixRecenterKokkos<DeviceType>::FixRecenterKokkos(LAMMPS *lmp, int narg, char **a
 {
   kokkosable = 1;
   atomKK = (AtomKokkos *)atom;
-  groupKK = (GroupKokkos<DeviceType> *)group;
+  groupKK = (GroupKokkos *)group;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
 
   datamask_read = X_MASK | MASK_MASK;
@@ -87,9 +87,10 @@ void FixRecenterKokkos<DeviceType>::initial_integrate(int /*vflag*/)
 
   // current COM
 
-  if (group->dynamic[igroup]) masstotal = groupKK->mass(igroup);
+
+  if (group->dynamic[igroup]) masstotal = groupKK->mass_kk<DeviceType>(igroup);
   double xcm[3];
-  groupKK->xcm(igroup,masstotal,xcm);
+  groupKK->xcm_kk<DeviceType>(igroup,masstotal,xcm);
 
   // shift coords by difference between actual COM and requested COM
 
diff --git a/src/KOKKOS/fix_recenter_kokkos.h b/src/KOKKOS/fix_recenter_kokkos.h
index 36e154e05c..46b4d3df7e 100644
--- a/src/KOKKOS/fix_recenter_kokkos.h
+++ b/src/KOKKOS/fix_recenter_kokkos.h
@@ -36,7 +36,7 @@ class FixRecenterKokkos : public FixRecenter {
     FixRecenterKokkos(class LAMMPS *, int, char **);
     void initial_integrate(int) override;
   private:
-    GroupKokkos<DeviceType> *groupKK;
+    GroupKokkos *groupKK;
 };
 
 } // namespace LAMMPS_NS
diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
index 0d1c250b3d..f785eb10e4 100644
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -1859,7 +1859,7 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
 
   // loop over neighbors of my atoms
 #if 0
-  Kokkos::parallel_for ( inum,
+  Kokkos::parallel_for ( Kokkos::RangePolicy<DeviceType>(0,inum),
         LAMMPS_LAMBDA(const int ii)
         {
           // Create an atomic view of sumWeights and dpdThetaLocal. Only needed
@@ -1939,7 +1939,7 @@ void FixRxKokkos<DeviceType>::computeLocalTemperature()
 
   // self-interaction for local temperature
 #if 0
-  Kokkos::parallel_for ( nlocal,
+  Kokkos::parallel_for ( Kokkos::RangePolicy<DeviceType>(0,nlocal),
         LAMMPS_LAMBDA(const int i)
         {
           double wij = 0.0;
diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
index a64adbcc38..04a3a45f68 100644
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -283,22 +283,22 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
       const X_FLOAT delz = ztmp - x(j, 2);
       const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
 #ifdef DEBUG_SSA_PAIR_CT
-      if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(0, 0)));
-      else Kokkos::atomic_increment(&(d_counters(0, 1)));
-      Kokkos::atomic_increment(&(d_counters(0, 2)));
+      if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_inc(&(d_counters(0, 0)));
+      else Kokkos::atomic_inc(&(d_counters(0, 1)));
+      Kokkos::atomic_inc(&(d_counters(0, 2)));
       int rsqi = rsq / 8;
       if (rsqi < 0) rsqi = 0;
       else if (rsqi > 31) rsqi = 31;
-      Kokkos::atomic_increment(&(d_hist(rsqi)));
+      Kokkos::atomic_inc(&(d_hist(rsqi)));
 #endif
 
       // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
       if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype)))
         && (rsq >= EPSILON_SQUARED)) {
 #ifdef DEBUG_SSA_PAIR_CT
-        if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(1, 0)));
-        else Kokkos::atomic_increment(&(d_counters(1, 1)));
-        Kokkos::atomic_increment(&(d_counters(1, 2)));
+        if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_inc(&(d_counters(1, 0)));
+        else Kokkos::atomic_inc(&(d_counters(1, 1)));
+        Kokkos::atomic_inc(&(d_counters(1, 2)));
 #endif
         double r = sqrt(rsq);
         double rinv = 1.0/r;
@@ -428,22 +428,22 @@ void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
       const X_FLOAT delz = ztmp - x(j, 2);
       const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
 #ifdef DEBUG_SSA_PAIR_CT
-      if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(0, 0)));
-      else Kokkos::atomic_increment(&(d_counters(0, 1)));
-      Kokkos::atomic_increment(&(d_counters(0, 2)));
+      if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_inc(&(d_counters(0, 0)));
+      else Kokkos::atomic_inc(&(d_counters(0, 1)));
+      Kokkos::atomic_inc(&(d_counters(0, 2)));
       int rsqi = rsq / 8;
       if (rsqi < 0) rsqi = 0;
       else if (rsqi > 31) rsqi = 31;
-      Kokkos::atomic_increment(&(d_hist(rsqi)));
+      Kokkos::atomic_inc(&(d_hist(rsqi)));
 #endif
 
       // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
       if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype)))
         && (rsq >= EPSILON_SQUARED)) {
 #ifdef DEBUG_SSA_PAIR_CT
-        if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(1, 0)));
-        else Kokkos::atomic_increment(&(d_counters(1, 1)));
-        Kokkos::atomic_increment(&(d_counters(1, 2)));
+        if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_inc(&(d_counters(1, 0)));
+        else Kokkos::atomic_inc(&(d_counters(1, 1)));
+        Kokkos::atomic_inc(&(d_counters(1, 2)));
 #endif
 
         double r = sqrt(rsq);
diff --git a/src/KOKKOS/fix_spring_self_kokkos.cpp b/src/KOKKOS/fix_spring_self_kokkos.cpp
index 1b6d45ead7..59b9a49ee8 100644
--- a/src/KOKKOS/fix_spring_self_kokkos.cpp
+++ b/src/KOKKOS/fix_spring_self_kokkos.cpp
@@ -123,7 +123,7 @@ void FixSpringSelfKokkos<DeviceType>::post_force(int /*vflag*/)
   auto l_yflag = yflag;
   auto l_zflag = zflag;
 
-  Kokkos::parallel_reduce(nlocal, LAMMPS_LAMBDA(const int& i, double& espring_kk) {
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,nlocal), LAMMPS_LAMBDA(const int& i, double& espring_kk) {
     if (l_mask[i] & l_groupbit) {
       Few<double,3> x_i;
       x_i[0] = l_x(i,0);
diff --git a/src/KOKKOS/group_kokkos.cpp b/src/KOKKOS/group_kokkos.cpp
deleted file mode 100644
index b2de2e6a64..0000000000
--- a/src/KOKKOS/group_kokkos.cpp
+++ /dev/null
@@ -1,363 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Mitch Murphy (alphataubio at gmail)
-------------------------------------------------------------------------- */
-
-#include "group_kokkos.h"
-
-#include "atom_kokkos.h"
-#include "atom_masks.h"
-#include "domain_kokkos.h"
-#include "kokkos_few.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-GroupKokkos<DeviceType>::GroupKokkos(LAMMPS *lmp) : Group(lmp)
-{
-  atomKK = (AtomKokkos *)atom;
-  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-}
-
-// ----------------------------------------------------------------------
-// computations on a group of atoms
-// ----------------------------------------------------------------------
-
-/* ----------------------------------------------------------------------
-   compute the total mass of group of atoms
-   use either per-type mass or per-atom rmass
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-double GroupKokkos<DeviceType>::mass(int igroup)
-{
-  int groupbit = bitmask[igroup];
-  auto d_mask = atomKK->k_mask.template view<DeviceType>();
-  double one = 0.0;
-
-  if (atomKK->rmass) {
-
-    auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
-    atomKK->sync(execution_space,MASK_MASK|RMASS_MASK);
-
-    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int i, double &l_one) {
-      if (d_mask(i) & groupbit) l_one += d_rmass(i);
-    }, one);
-
-  } else {
-
-    auto d_mass = atomKK->k_mass.template view<DeviceType>();
-    auto d_type = atomKK->k_type.template view<DeviceType>();
-    atomKK->sync(execution_space,MASK_MASK|TYPE_MASK);
-    atomKK->k_mass.template sync<DeviceType>();
-
-    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int i, double &l_one) {
-      if (d_mask(i) & groupbit) l_one += d_mass(d_type(i));
-    }, one);
-
-  }
-
-  double all;
-  MPI_Allreduce(&one, &all, 1, MPI_DOUBLE, MPI_SUM, world);
-  return all;
-}
-
-/* ----------------------------------------------------------------------
-   compute the center-of-mass coords of group of atoms
-   masstotal = total mass
-   return center-of-mass coords in cm[]
-   must unwrap atoms to compute center-of-mass correctly
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void GroupKokkos<DeviceType>::xcm(int igroup, double masstotal, double *xcm)
-{
-  int groupbit = bitmask[igroup];
-  auto d_x = atomKK->k_x.template view<DeviceType>();
-  auto d_mask = atomKK->k_mask.template view<DeviceType>();
-  auto d_image = atomKK->k_image.template view<DeviceType>();
-  auto l_prd = Few<double, 3>(domain->prd);
-  auto l_h = Few<double, 6>(domain->h);
-  auto l_triclinic = domain->triclinic;
-  double cmone[3] = {0.0, 0.0, 0.0};
-
-  if (atomKK->rmass) {
-
-    auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
-    atomKK->sync(execution_space,X_MASK|MASK_MASK|IMAGE_MASK|RMASS_MASK);
-
-    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int i, double &l_cmx, double &l_cmy, double &l_cmz) {
-      if (d_mask(i) & groupbit) {
-        double massone = d_rmass(i);
-        Few<double,3> x_i;
-        x_i[0] = d_x(i,0);
-        x_i[1] = d_x(i,1);
-        x_i[2] = d_x(i,2);
-        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
-        l_cmx += unwrapKK[0] * massone;
-        l_cmy += unwrapKK[1] * massone;
-        l_cmz += unwrapKK[2] * massone;
-      }
-    }, cmone[0], cmone[1], cmone[2]);
-
-  } else {
-
-    auto d_mass = atomKK->k_mass.template view<DeviceType>();
-    auto d_type = atomKK->k_type.template view<DeviceType>();
-    atomKK->sync(execution_space,X_MASK|MASK_MASK|IMAGE_MASK|TYPE_MASK);
-    atomKK->k_mass.template sync<DeviceType>();
-
-    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int i, double &l_cmx, double &l_cmy, double &l_cmz) {
-      if (d_mask(i) & groupbit) {
-        double massone = d_mass(d_type(i));
-        Few<double,3> x_i;
-        x_i[0] = d_x(i,0);
-        x_i[1] = d_x(i,1);
-        x_i[2] = d_x(i,2);
-        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
-        l_cmx += unwrapKK[0] * massone;
-        l_cmy += unwrapKK[1] * massone;
-        l_cmz += unwrapKK[2] * massone;
-      }
-    }, cmone[0], cmone[1], cmone[2]);
-
-  }
-
-  MPI_Allreduce(cmone, xcm, 3, MPI_DOUBLE, MPI_SUM, world);
-  if (masstotal > 0.0) {
-    xcm[0] /= masstotal;
-    xcm[1] /= masstotal;
-    xcm[2] /= masstotal;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   compute the center-of-mass velocity of group of atoms
-   masstotal = total mass
-   return center-of-mass velocity in vcm[]
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void GroupKokkos<DeviceType>::vcm(int igroup, double masstotal, double *vcm)
-{
-  int groupbit = bitmask[igroup];
-  auto d_v = atomKK->k_v.template view<DeviceType>();
-  auto d_mask = atomKK->k_mask.template view<DeviceType>();
-  auto d_image = atomKK->k_image.template view<DeviceType>();
-  double p[3] = {0.0, 0.0, 0.0};
-
-  if (atomKK->rmass) {
-
-    auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
-    atomKK->sync(execution_space,V_MASK|MASK_MASK|IMAGE_MASK|RMASS_MASK);
-
-    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int i, double &l_px, double &l_py, double &l_pz) {
-      if (d_mask(i) & groupbit) {
-        double massone = d_rmass(i);
-        l_px += d_v(i,0) * massone;
-        l_py += d_v(i,1) * massone;
-        l_pz += d_v(i,2) * massone;
-      }
-    }, p[0], p[1], p[2]);
-
-  } else {
-
-    auto d_mass = atomKK->k_mass.template view<DeviceType>();
-    auto d_type = atomKK->k_type.template view<DeviceType>();
-    atomKK->sync(execution_space,V_MASK|MASK_MASK|IMAGE_MASK|TYPE_MASK);
-    atomKK->k_mass.template sync<DeviceType>();
-
-    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int i, double &l_px, double &l_py, double &l_pz) {
-      if (d_mask(i) & groupbit) {
-        double massone = d_mass(d_type(i));
-        l_px += d_v(i,0) * massone;
-        l_py += d_v(i,1) * massone;
-        l_pz += d_v(i,2) * massone;
-      }
-    }, p[0], p[1], p[2]);
-
-  }
-
-  MPI_Allreduce(p, vcm, 3, MPI_DOUBLE, MPI_SUM, world);
-  if (masstotal > 0.0) {
-    vcm[0] /= masstotal;
-    vcm[1] /= masstotal;
-    vcm[2] /= masstotal;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   compute the angular momentum L (lmom) of group
-   around center-of-mass cm
-   must unwrap atoms to compute L correctly
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void GroupKokkos<DeviceType>::angmom(int igroup, double *xcm, double *lmom)
-{
-  int groupbit = bitmask[igroup];
-  auto d_x = atomKK->k_x.template view<DeviceType>();
-  auto d_v = atomKK->k_v.template view<DeviceType>();
-  auto d_mask = atomKK->k_mask.template view<DeviceType>();
-  auto d_image = atomKK->k_image.template view<DeviceType>();
-  auto l_prd = Few<double, 3>(domain->prd);
-  auto l_h = Few<double, 6>(domain->h);
-  auto l_triclinic = domain->triclinic;
-  auto l_xcm0 = xcm[0];
-  auto l_xcm1 = xcm[1];
-  auto l_xcm2 = xcm[2];
-  double p[3] = {0.0, 0.0, 0.0};
-
-  if (atomKK->rmass) {
-
-    auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
-    atomKK->sync(execution_space,X_MASK|V_MASK|MASK_MASK|IMAGE_MASK|RMASS_MASK);
-
-    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int i, double &l_px, double &l_py, double &l_pz) {
-      if (d_mask(i) & groupbit) {
-        double massone = d_rmass(i);
-        Few<double,3> x_i;
-        x_i[0] = d_x(i,0);
-        x_i[1] = d_x(i,1);
-        x_i[2] = d_x(i,2);
-        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
-        double dx = unwrapKK[0] - l_xcm0;
-        double dy = unwrapKK[1] - l_xcm1;
-        double dz = unwrapKK[2] - l_xcm2;
-        l_px += massone * (dy * d_v(i,2) - dz * d_v(i,1));
-        l_py += massone * (dz * d_v(i,0) - dx * d_v(i,2));
-        l_pz += massone * (dx * d_v(i,1) - dy * d_v(i,0));
-      }
-    }, p[0], p[1], p[2]);
-
-  } else {
-
-    auto d_mass = atomKK->k_mass.template view<DeviceType>();
-    auto d_type = atomKK->k_type.template view<DeviceType>();
-    atomKK->sync(execution_space,X_MASK|V_MASK|MASK_MASK|IMAGE_MASK|TYPE_MASK);
-    atomKK->k_mass.template sync<DeviceType>();
-
-    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int i, double &l_px, double &l_py, double &l_pz) {
-      if (d_mask(i) & groupbit) {
-        double massone = d_mass(d_type(i));
-        Few<double,3> x_i;
-        x_i[0] = d_x(i,0);
-        x_i[1] = d_x(i,1);
-        x_i[2] = d_x(i,2);
-        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
-        double dx = unwrapKK[0] - l_xcm0;
-        double dy = unwrapKK[1] - l_xcm1;
-        double dz = unwrapKK[2] - l_xcm2;
-        l_px += massone * (dy * d_v(i,2) - dz * d_v(i,1));
-        l_py += massone * (dz * d_v(i,0) - dx * d_v(i,2));
-        l_pz += massone * (dx * d_v(i,1) - dy * d_v(i,0));
-      }
-    }, p[0], p[1], p[2]);
-
-  }
-  MPI_Allreduce(p, lmom, 3, MPI_DOUBLE, MPI_SUM, world);
-}
-
-/* ----------------------------------------------------------------------
-   compute moment of inertia tensor around center-of-mass xcm of group
-   must unwrap atoms to compute itensor correctly
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-void GroupKokkos<DeviceType>::inertia(int igroup, double *xcm, double itensor[3][3])
-{
-  int groupbit = bitmask[igroup];
-  auto d_x = atomKK->k_x.template view<DeviceType>();
-  auto d_mask = atomKK->k_mask.template view<DeviceType>();
-  auto d_image = atomKK->k_image.template view<DeviceType>();
-  auto l_prd = Few<double, 3>(domain->prd);
-  auto l_h = Few<double, 6>(domain->h);
-  auto l_triclinic = domain->triclinic;
-  auto l_xcm0 = xcm[0];
-  auto l_xcm1 = xcm[1];
-  auto l_xcm2 = xcm[2];
-
-  double ione[3][3];
-  for (int i = 0; i < 3; i++)
-    for (int j = 0; j < 3; j++) ione[i][j] = 0.0;
-
-  if (atomKK->rmass) {
-
-    auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
-    atomKK->sync(execution_space,X_MASK|MASK_MASK|IMAGE_MASK|RMASS_MASK);
-
-    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int i, double &l_i00, double &l_i11, double &l_i22, double &l_i01, double &l_i12, double &l_i02) {
-      if (d_mask(i) & groupbit) {
-        double massone = d_rmass(i);
-        Few<double,3> x_i;
-        x_i[0] = d_x(i,0);
-        x_i[1] = d_x(i,1);
-        x_i[2] = d_x(i,2);
-        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
-        double dx = unwrapKK[0] - l_xcm0;
-        double dy = unwrapKK[1] - l_xcm1;
-        double dz = unwrapKK[2] - l_xcm2;
-        l_i00 += massone * (dy * dy + dz * dz);
-        l_i11 += massone * (dx * dx + dz * dz);
-        l_i22 += massone * (dx * dx + dy * dy);
-        l_i01 -= massone * dx * dy;
-        l_i12 -= massone * dy * dz;
-        l_i02 -= massone * dx * dz;
-      }
-    }, ione[0][0], ione[1][1], ione[2][2], ione[0][1], ione[1][2], ione[0][2]);
-
-  } else {
-
-    auto d_mass = atomKK->k_mass.template view<DeviceType>();
-    auto d_type = atomKK->k_type.template view<DeviceType>();
-    atomKK->sync(execution_space,X_MASK|MASK_MASK|IMAGE_MASK|TYPE_MASK);
-    atomKK->k_mass.template sync<DeviceType>();
-
-    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int i, double &l_i00, double &l_i11, double &l_i22, double &l_i01, double &l_i12, double &l_i02) {
-      if (d_mask(i) & groupbit) {
-        double massone = d_mass(d_type(i));
-        Few<double,3> x_i;
-        x_i[0] = d_x(i,0);
-        x_i[1] = d_x(i,1);
-        x_i[2] = d_x(i,2);
-        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
-        double dx = unwrapKK[0] - l_xcm0;
-        double dy = unwrapKK[1] - l_xcm1;
-        double dz = unwrapKK[2] - l_xcm2;
-        l_i00 += massone * (dy * dy + dz * dz);
-        l_i11 += massone * (dx * dx + dz * dz);
-        l_i22 += massone * (dx * dx + dy * dy);
-        l_i01 -= massone * dx * dy;
-        l_i12 -= massone * dy * dz;
-        l_i02 -= massone * dx * dz;
-      }
-    }, ione[0][0], ione[1][1], ione[2][2], ione[0][1], ione[1][2], ione[0][2]);
-
-  }
-
-  ione[1][0] = ione[0][1];
-  ione[2][1] = ione[1][2];
-  ione[2][0] = ione[0][2];
-  MPI_Allreduce(&ione[0][0], &itensor[0][0], 9, MPI_DOUBLE, MPI_SUM, world);
-}
-
-namespace LAMMPS_NS {
-template class GroupKokkos<LMPDeviceType>;
-#ifdef LMP_KOKKOS_GPU
-template class GroupKokkos<LMPHostType>;
-#endif
-}
diff --git a/src/KOKKOS/group_kokkos.h b/src/KOKKOS/group_kokkos.h
index f23023b17c..75c0601357 100644
--- a/src/KOKKOS/group_kokkos.h
+++ b/src/KOKKOS/group_kokkos.h
@@ -15,22 +15,352 @@
 #define LMP_GROUP_KOKKOS_H
 
 #include "group.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "domain_kokkos.h"
+#include "kokkos_few.h"
 #include "kokkos_type.h"
 
+
 namespace LAMMPS_NS {
 
-template<class DeviceType>
 class GroupKokkos : public Group {
  public:
-  GroupKokkos(class LAMMPS *);
-  double mass(int);                   // total mass of atoms in group
-  void xcm(int, double, double *);    // center-of-mass coords of group
-  void vcm(int, double, double *);    // center-of-mass velocity of group
-  void angmom(int, double *, double *);    // angular momentum of group
-  void inertia(int, double *, double[3][3]);    // inertia tensor
+  GroupKokkos(LAMMPS *lmp) : Group(lmp) { atomKK = (AtomKokkos *)atom; }
+
+// ----------------------------------------------------------------------
+// computations on a group of atoms
+// ----------------------------------------------------------------------
+
+/* ----------------------------------------------------------------------
+   compute the total mass of group of atoms
+   use either per-type mass or per-atom rmass
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double mass_kk(int igroup)
+{
+  auto execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+  int groupbit = bitmask[igroup];
+  auto d_mask = atomKK->k_mask.template view<DeviceType>();
+  double one = 0.0;
+
+  if (atomKK->rmass) {
+
+    auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
+    atomKK->sync(execution_space,MASK_MASK|RMASS_MASK);
+
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,atom->nlocal), KOKKOS_LAMBDA(const int i, double &l_one) {
+      if (d_mask(i) & groupbit) l_one += d_rmass(i);
+    }, one);
+
+  } else {
+
+    auto d_mass = atomKK->k_mass.template view<DeviceType>();
+    auto d_type = atomKK->k_type.template view<DeviceType>();
+    atomKK->sync(execution_space,MASK_MASK|TYPE_MASK);
+    atomKK->k_mass.template sync<DeviceType>();
+
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,atom->nlocal), KOKKOS_LAMBDA(const int i, double &l_one) {
+      if (d_mask(i) & groupbit) l_one += d_mass(d_type(i));
+    }, one);
+
+  }
+
+  double all;
+  MPI_Allreduce(&one, &all, 1, MPI_DOUBLE, MPI_SUM, world);
+  return all;
+}
+
+/* ----------------------------------------------------------------------
+   compute the center-of-mass coords of group of atoms
+   masstotal = total mass
+   return center-of-mass coords in cm[]
+   must unwrap atoms to compute center-of-mass correctly
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void xcm_kk(int igroup, double masstotal, double *xcm)
+{
+  auto execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+  int groupbit = bitmask[igroup];
+  auto d_x = atomKK->k_x.template view<DeviceType>();
+  auto d_mask = atomKK->k_mask.template view<DeviceType>();
+  auto d_image = atomKK->k_image.template view<DeviceType>();
+  auto l_prd = Few<double, 3>(domain->prd);
+  auto l_h = Few<double, 6>(domain->h);
+  auto l_triclinic = domain->triclinic;
+  double cmone[3] = {0.0, 0.0, 0.0};
+
+  if (atomKK->rmass) {
+
+    auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
+    atomKK->sync(execution_space,X_MASK|MASK_MASK|IMAGE_MASK|RMASS_MASK);
+
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,atom->nlocal), KOKKOS_LAMBDA(const int i, double &l_cmx, double &l_cmy, double &l_cmz) {
+      if (d_mask(i) & groupbit) {
+        double massone = d_rmass(i);
+        Few<double,3> x_i;
+        x_i[0] = d_x(i,0);
+        x_i[1] = d_x(i,1);
+        x_i[2] = d_x(i,2);
+        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
+        l_cmx += unwrapKK[0] * massone;
+        l_cmy += unwrapKK[1] * massone;
+        l_cmz += unwrapKK[2] * massone;
+      }
+    }, cmone[0], cmone[1], cmone[2]);
+
+  } else {
+
+    auto d_mass = atomKK->k_mass.template view<DeviceType>();
+    auto d_type = atomKK->k_type.template view<DeviceType>();
+    atomKK->sync(execution_space,X_MASK|MASK_MASK|IMAGE_MASK|TYPE_MASK);
+    atomKK->k_mass.template sync<DeviceType>();
+
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,atom->nlocal), KOKKOS_LAMBDA(const int i, double &l_cmx, double &l_cmy, double &l_cmz) {
+      if (d_mask(i) & groupbit) {
+        double massone = d_mass(d_type(i));
+        Few<double,3> x_i;
+        x_i[0] = d_x(i,0);
+        x_i[1] = d_x(i,1);
+        x_i[2] = d_x(i,2);
+        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
+        l_cmx += unwrapKK[0] * massone;
+        l_cmy += unwrapKK[1] * massone;
+        l_cmz += unwrapKK[2] * massone;
+      }
+    }, cmone[0], cmone[1], cmone[2]);
+
+  }
+
+  MPI_Allreduce(cmone, xcm, 3, MPI_DOUBLE, MPI_SUM, world);
+  if (masstotal > 0.0) {
+    xcm[0] /= masstotal;
+    xcm[1] /= masstotal;
+    xcm[2] /= masstotal;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute the center-of-mass velocity of group of atoms
+   masstotal = total mass
+   return center-of-mass velocity in vcm[]
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void vcm_kk(int igroup, double masstotal, double *vcm)
+{
+  auto execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+  int groupbit = bitmask[igroup];
+  auto d_v = atomKK->k_v.template view<DeviceType>();
+  auto d_mask = atomKK->k_mask.template view<DeviceType>();
+  auto d_image = atomKK->k_image.template view<DeviceType>();
+  double p[3] = {0.0, 0.0, 0.0};
+
+  if (atomKK->rmass) {
+
+    auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
+    atomKK->sync(execution_space,V_MASK|MASK_MASK|IMAGE_MASK|RMASS_MASK);
+
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,atom->nlocal), KOKKOS_LAMBDA(const int i, double &l_px, double &l_py, double &l_pz) {
+      if (d_mask(i) & groupbit) {
+        double massone = d_rmass(i);
+        l_px += d_v(i,0) * massone;
+        l_py += d_v(i,1) * massone;
+        l_pz += d_v(i,2) * massone;
+      }
+    }, p[0], p[1], p[2]);
+
+  } else {
+
+    auto d_mass = atomKK->k_mass.template view<DeviceType>();
+    auto d_type = atomKK->k_type.template view<DeviceType>();
+    atomKK->sync(execution_space,V_MASK|MASK_MASK|IMAGE_MASK|TYPE_MASK);
+    atomKK->k_mass.template sync<DeviceType>();
+
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,atom->nlocal), KOKKOS_LAMBDA(const int i, double &l_px, double &l_py, double &l_pz) {
+      if (d_mask(i) & groupbit) {
+        double massone = d_mass(d_type(i));
+        l_px += d_v(i,0) * massone;
+        l_py += d_v(i,1) * massone;
+        l_pz += d_v(i,2) * massone;
+      }
+    }, p[0], p[1], p[2]);
+
+  }
+
+  MPI_Allreduce(p, vcm, 3, MPI_DOUBLE, MPI_SUM, world);
+  if (masstotal > 0.0) {
+    vcm[0] /= masstotal;
+    vcm[1] /= masstotal;
+    vcm[2] /= masstotal;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute the angular momentum L (lmom) of group
+   around center-of-mass cm
+   must unwrap atoms to compute L correctly
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void angmom_kk(int igroup, double *xcm, double *lmom)
+{
+  auto execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+  int groupbit = bitmask[igroup];
+  auto d_x = atomKK->k_x.template view<DeviceType>();
+  auto d_v = atomKK->k_v.template view<DeviceType>();
+  auto d_mask = atomKK->k_mask.template view<DeviceType>();
+  auto d_image = atomKK->k_image.template view<DeviceType>();
+  auto l_prd = Few<double, 3>(domain->prd);
+  auto l_h = Few<double, 6>(domain->h);
+  auto l_triclinic = domain->triclinic;
+  auto l_xcm0 = xcm[0];
+  auto l_xcm1 = xcm[1];
+  auto l_xcm2 = xcm[2];
+  double p[3] = {0.0, 0.0, 0.0};
+
+  if (atomKK->rmass) {
+
+    auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
+    atomKK->sync(execution_space,X_MASK|V_MASK|MASK_MASK|IMAGE_MASK|RMASS_MASK);
+
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,atom->nlocal), KOKKOS_LAMBDA(const int i, double &l_px, double &l_py, double &l_pz) {
+      if (d_mask(i) & groupbit) {
+        double massone = d_rmass(i);
+        Few<double,3> x_i;
+        x_i[0] = d_x(i,0);
+        x_i[1] = d_x(i,1);
+        x_i[2] = d_x(i,2);
+        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
+        double dx = unwrapKK[0] - l_xcm0;
+        double dy = unwrapKK[1] - l_xcm1;
+        double dz = unwrapKK[2] - l_xcm2;
+        l_px += massone * (dy * d_v(i,2) - dz * d_v(i,1));
+        l_py += massone * (dz * d_v(i,0) - dx * d_v(i,2));
+        l_pz += massone * (dx * d_v(i,1) - dy * d_v(i,0));
+      }
+    }, p[0], p[1], p[2]);
+
+  } else {
+
+    auto d_mass = atomKK->k_mass.template view<DeviceType>();
+    auto d_type = atomKK->k_type.template view<DeviceType>();
+    atomKK->sync(execution_space,X_MASK|V_MASK|MASK_MASK|IMAGE_MASK|TYPE_MASK);
+    atomKK->k_mass.template sync<DeviceType>();
+
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,atom->nlocal), KOKKOS_LAMBDA(const int i, double &l_px, double &l_py, double &l_pz) {
+      if (d_mask(i) & groupbit) {
+        double massone = d_mass(d_type(i));
+        Few<double,3> x_i;
+        x_i[0] = d_x(i,0);
+        x_i[1] = d_x(i,1);
+        x_i[2] = d_x(i,2);
+        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
+        double dx = unwrapKK[0] - l_xcm0;
+        double dy = unwrapKK[1] - l_xcm1;
+        double dz = unwrapKK[2] - l_xcm2;
+        l_px += massone * (dy * d_v(i,2) - dz * d_v(i,1));
+        l_py += massone * (dz * d_v(i,0) - dx * d_v(i,2));
+        l_pz += massone * (dx * d_v(i,1) - dy * d_v(i,0));
+      }
+    }, p[0], p[1], p[2]);
+
+  }
+  MPI_Allreduce(p, lmom, 3, MPI_DOUBLE, MPI_SUM, world);
+}
+
+/* ----------------------------------------------------------------------
+   compute moment of inertia tensor around center-of-mass xcm of group
+   must unwrap atoms to compute itensor correctly
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void inertia_kk(int igroup, double *xcm, double itensor[3][3])
+{
+  auto execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+  int groupbit = bitmask[igroup];
+  auto d_x = atomKK->k_x.template view<DeviceType>();
+  auto d_mask = atomKK->k_mask.template view<DeviceType>();
+  auto d_image = atomKK->k_image.template view<DeviceType>();
+  auto l_prd = Few<double, 3>(domain->prd);
+  auto l_h = Few<double, 6>(domain->h);
+  auto l_triclinic = domain->triclinic;
+  auto l_xcm0 = xcm[0];
+  auto l_xcm1 = xcm[1];
+  auto l_xcm2 = xcm[2];
+
+  double ione[3][3];
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 3; j++) ione[i][j] = 0.0;
+
+  if (atomKK->rmass) {
+
+    auto d_rmass = atomKK->k_rmass.template view<DeviceType>();
+    atomKK->sync(execution_space,X_MASK|MASK_MASK|IMAGE_MASK|RMASS_MASK);
+
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,atom->nlocal), KOKKOS_LAMBDA(const int i, double &l_i00, double &l_i11, double &l_i22, double &l_i01, double &l_i12, double &l_i02) {
+      if (d_mask(i) & groupbit) {
+        double massone = d_rmass(i);
+        Few<double,3> x_i;
+        x_i[0] = d_x(i,0);
+        x_i[1] = d_x(i,1);
+        x_i[2] = d_x(i,2);
+        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
+        double dx = unwrapKK[0] - l_xcm0;
+        double dy = unwrapKK[1] - l_xcm1;
+        double dz = unwrapKK[2] - l_xcm2;
+        l_i00 += massone * (dy * dy + dz * dz);
+        l_i11 += massone * (dx * dx + dz * dz);
+        l_i22 += massone * (dx * dx + dy * dy);
+        l_i01 -= massone * dx * dy;
+        l_i12 -= massone * dy * dz;
+        l_i02 -= massone * dx * dz;
+      }
+    }, ione[0][0], ione[1][1], ione[2][2], ione[0][1], ione[1][2], ione[0][2]);
+
+  } else {
+
+    auto d_mass = atomKK->k_mass.template view<DeviceType>();
+    auto d_type = atomKK->k_type.template view<DeviceType>();
+    atomKK->sync(execution_space,X_MASK|MASK_MASK|IMAGE_MASK|TYPE_MASK);
+    atomKK->k_mass.template sync<DeviceType>();
+
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,atom->nlocal), KOKKOS_LAMBDA(const int i, double &l_i00, double &l_i11, double &l_i22, double &l_i01, double &l_i12, double &l_i02) {
+      if (d_mask(i) & groupbit) {
+        double massone = d_mass(d_type(i));
+        Few<double,3> x_i;
+        x_i[0] = d_x(i,0);
+        x_i[1] = d_x(i,1);
+        x_i[2] = d_x(i,2);
+        auto unwrapKK = DomainKokkos::unmap(l_prd,l_h,l_triclinic,x_i,d_image(i));
+        double dx = unwrapKK[0] - l_xcm0;
+        double dy = unwrapKK[1] - l_xcm1;
+        double dz = unwrapKK[2] - l_xcm2;
+        l_i00 += massone * (dy * dy + dz * dz);
+        l_i11 += massone * (dx * dx + dz * dz);
+        l_i22 += massone * (dx * dx + dy * dy);
+        l_i01 -= massone * dx * dy;
+        l_i12 -= massone * dy * dz;
+        l_i02 -= massone * dx * dz;
+      }
+    }, ione[0][0], ione[1][1], ione[2][2], ione[0][1], ione[1][2], ione[0][2]);
+
+  }
+
+  ione[1][0] = ione[0][1];
+  ione[2][1] = ione[1][2];
+  ione[2][0] = ione[0][2];
+  MPI_Allreduce(&ione[0][0], &itensor[0][0], 9, MPI_DOUBLE, MPI_SUM, world);
+}
 
- private:
-  ExecutionSpace execution_space;
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/KOKKOS/improper_harmonic_kokkos.cpp b/src/KOKKOS/improper_harmonic_kokkos.cpp
index eafa7a08ec..89ca31b9ca 100644
--- a/src/KOKKOS/improper_harmonic_kokkos.cpp
+++ b/src/KOKKOS/improper_harmonic_kokkos.cpp
@@ -74,14 +74,14 @@ void ImproperHarmonicKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   // reallocate per-atom arrays if necessary
 
   if (eflag_atom) {
-    if(k_eatom.extent(0) < maxeatom) {
+    if ((int)k_eatom.extent(0) < maxeatom) {
       memoryKK->destroy_kokkos(k_eatom,eatom);
       memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"improper:eatom");
       d_eatom = k_eatom.template view<KKDeviceType>();
     } else Kokkos::deep_copy(d_eatom,0.0);
   }
   if (vflag_atom) {
-    if(k_vatom.extent(0) < maxvatom) {
+    if ((int)k_vatom.extent(0) < maxvatom) {
       memoryKK->destroy_kokkos(k_vatom,vatom);
       memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"improper:vatom");
       d_vatom = k_vatom.template view<KKDeviceType>();
diff --git a/src/KOKKOS/improper_hybrid_kokkos.cpp b/src/KOKKOS/improper_hybrid_kokkos.cpp
index bfa55978cc..885405187b 100644
--- a/src/KOKKOS/improper_hybrid_kokkos.cpp
+++ b/src/KOKKOS/improper_hybrid_kokkos.cpp
@@ -77,7 +77,7 @@ void ImproperHybridKokkos::compute(int eflag, int vflag)
 
     Kokkos::parallel_for(nimproperlist_orig,LAMMPS_LAMBDA(int i) {
       const int m = d_map[d_improperlist_orig(i,4)];
-      if (m >= 0) Kokkos::atomic_increment(&d_nimproperlist[m]);
+      if (m >= 0) Kokkos::atomic_inc(&d_nimproperlist[m]);
     });
 
     k_nimproperlist.modify_device();
@@ -88,7 +88,7 @@ void ImproperHybridKokkos::compute(int eflag, int vflag)
       if (h_nimproperlist[m] > maximproper_all)
         maximproper_all = h_nimproperlist[m] + EXTRA;
 
-    if (k_improperlist.d_view.extent(1) < maximproper_all)
+    if ((int)k_improperlist.d_view.extent(1) < maximproper_all)
       MemKK::realloc_kokkos(k_improperlist, "improper_hybrid:improperlist", nstyles, maximproper_all, 5);
     auto d_improperlist = k_improperlist.d_view;
 
diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h
index bfd9bba8aa..ae86e17b50 100644
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@@ -792,6 +792,14 @@ typedef tdual_float_3d::t_dev_um t_float_3d_um;
 typedef tdual_float_3d::t_dev_const_um t_float_3d_const_um;
 typedef tdual_float_3d::t_dev_const_randomread t_float_3d_randomread;
 
+//4d float array n
+typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
+typedef tdual_float_4d::t_dev t_float_4d;
+typedef tdual_float_4d::t_dev_const t_float_4d_const;
+typedef tdual_float_4d::t_dev_um t_float_4d_um;
+typedef tdual_float_4d::t_dev_const_um t_float_4d_const_um;
+typedef tdual_float_4d::t_dev_const_randomread t_float_4d_randomread;
+
 #ifdef LMP_KOKKOS_NO_LEGACY
 typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
 #else
@@ -1126,6 +1134,14 @@ typedef tdual_float_3d::t_host_um t_float_3d_um;
 typedef tdual_float_3d::t_host_const_um t_float_3d_const_um;
 typedef tdual_float_3d::t_host_const_randomread t_float_3d_randomread;
 
+//4d float array n
+typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
+typedef tdual_float_4d::t_host t_float_4d;
+typedef tdual_float_4d::t_host_const t_float_4d_const;
+typedef tdual_float_4d::t_host_um t_float_4d_um;
+typedef tdual_float_4d::t_host_const_um t_float_4d_const_um;
+typedef tdual_float_4d::t_host_const_randomread t_float_4d_randomread;
+
 #ifdef LMP_KOKKOS_NO_LEGACY
 typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
 #else
diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h
index 026c8afcb4..a94d9eb1e6 100644
--- a/src/KOKKOS/memory_kokkos.h
+++ b/src/KOKKOS/memory_kokkos.h
@@ -101,6 +101,7 @@ template <typename TYPE, typename HTYPE>
 {
   data = TYPE(std::string(name),n1,n2);
   h_data = Kokkos::create_mirror_view(data);
+  //printf(">>> name: %s\n", name);
   return data;
 }
 
@@ -111,6 +112,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
   data = TYPE(std::string(name),n1,n2);
   bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
   array = (typename TYPE::value_type **) smalloc(nbytes,name);
+  //printf(">>> name %s nbytes %d\n", name, nbytes);
 
   for (int i = 0; i < n1; i++) {
     if (n2 == 0)
@@ -121,6 +123,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
   return data;
 }
 
+/* ----------------------------------------------------------------------
+   create a 4d array with indices 2,3,4 offset, but not first
+   2nd index from n2lo to n2hi inclusive
+   3rd index from n3lo to n3hi inclusive
+   4th index from n4lo to n4hi inclusive
+   cannot grow it
+------------------------------------------------------------------------- */
+
+template <typename TYPE>
+TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array,
+                             int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi,
+                             const char *name)
+{
+  //if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array =  nullptr;
+
+  printf("^^^^^ memoryKK->create_4d_offset_kokkos\n");
+
+  int n2 = n2hi - n2lo + 1;
+  int n3 = n3hi - n3lo + 1;
+  int n4 = n4hi - n4lo + 1;
+  data = TYPE(std::string(name),n1,n2,n3,n4);
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1;
+  array = (typename TYPE::value_type ****) smalloc(nbytes,name);
+
+  for (int i = 0; i < n1; i++) {
+    if (n2 == 0) {
+      array[i] = nullptr;
+    } else {
+      nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2;
+      array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name);
+      for (int j = 0; j < n2; j++){
+        if (n3 == 0){
+          array[i][j] = nullptr;
+        } else {
+          nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3;
+          array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name);
+          for (int k = 0; k < n3; k++){
+            if (n4 == 0)
+              array[i][j][k] = nullptr;
+            else
+              array[i][j][k] = &data.h_view(i,j,k,0);
+          }
+        }
+      }
+    }
+  }
+
+  return data;
+}
+
 template <typename TYPE, typename HTYPE>
   TYPE create_kokkos(TYPE &data, HTYPE &h_data,
                      typename TYPE::value_type **&array, int n1, int n2,
@@ -221,15 +273,19 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type ***&array,
                    int n1, int n2, int n3, const char *name)
 {
   data = TYPE(std::string(name),n1,n2,n3);
-  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n1;
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1 * n2;
+  typename TYPE::value_type **plane = (typename TYPE::value_type **) smalloc(nbytes,name);
+  nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n1;
   array = (typename TYPE::value_type ***) smalloc(nbytes,name);
 
+  bigint m;
   for (int i = 0; i < n1; i++) {
     if (n2 == 0) {
       array[i] = nullptr;
     } else {
-      nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n2;
-      array[i] = (typename TYPE::value_type **) smalloc(nbytes,name);
+      m = ((bigint) i) * n2;
+      array[i] = &plane[m];
+
       for (int j = 0; j < n2; j++) {
         if (n3 == 0)
            array[i][j] = nullptr;
@@ -248,15 +304,19 @@ template <typename TYPE, typename HTYPE>
 {
   data = TYPE(std::string(name),n1,n2);
   h_data = Kokkos::create_mirror_view(data);
-  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n1;
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1 * n2;
+  typename TYPE::value_type **plane = (typename TYPE::value_type **) smalloc(nbytes,name);
+  nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n1;
   array = (typename TYPE::value_type ***) smalloc(nbytes,name);
 
+  bigint m;
   for (int i = 0; i < n1; i++) {
     if (n2 == 0) {
       array[i] = nullptr;
     } else {
-      nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n2;
-      array[i] = (typename TYPE::value_type **) smalloc(nbytes,name);
+      m = ((bigint) i) * n2;
+      array[i] = &plane[m];
+
       for (int j = 0; j < n2; j++) {
         if (n3 == 0)
            array[i][j] = nullptr;
@@ -288,15 +348,19 @@ TYPE grow_kokkos(TYPE &data, typename TYPE::value_type ***&array,
 {
   if (array == nullptr) return create_kokkos(data,array,n1,n2,n3,name);
   data.resize(n1,n2,n3);
-  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n1;
-  array = (typename TYPE::value_type ***) smalloc(nbytes,name);
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1 * n2;
+  typename TYPE::value_type **plane = (typename TYPE::value_type **) srealloc(array[0],nbytes,name);
+  nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n1;
+  array = (typename TYPE::value_type ***) srealloc(array,nbytes,name);
 
+  bigint m;
   for (int i = 0; i < n1; i++) {
     if (n2 == 0) {
       array[i] = nullptr;
     } else {
-      nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n2;
-      array[i] = (typename TYPE::value_type **) smalloc(nbytes,name);
+      m = ((bigint) i) * n2;
+      array[i] = &plane[m];
+
       for (int j = 0; j < n2; j++) {
         if (n3 == 0)
            array[i][j] = nullptr;
@@ -316,10 +380,9 @@ template <typename TYPE>
 void destroy_kokkos(TYPE data, typename TYPE::value_type*** &array)
 {
   if (array == nullptr) return;
-  int n1 = data.extent(0);
-  for (int i = 0; i < n1; ++i)
-    sfree(array[i]);
   data = TYPE();
+
+  sfree(array[0]);
   sfree(array);
   array = nullptr;
 }
@@ -411,7 +474,7 @@ template <typename TYPE>
 TYPE create_kokkos(TYPE &data, int n1, int n2, int n3, int n4, int n5 , int n6 ,const char *name)
 {
   data = TYPE();
-  data = TYPE(std::string(name) ,n1,n2,n3,n4,n5,n6);
+  data = TYPE(std::string(name),n1,n2,n3,n4,n5,n6);
   return data;
 }
 
@@ -420,4 +483,3 @@ TYPE create_kokkos(TYPE &data, int n1, int n2, int n3, int n4, int n5 , int n6 ,
 }
 
 #endif
-
diff --git a/src/KOKKOS/min_kokkos.cpp b/src/KOKKOS/min_kokkos.cpp
index 3460fe9009..fca7a16e62 100644
--- a/src/KOKKOS/min_kokkos.cpp
+++ b/src/KOKKOS/min_kokkos.cpp
@@ -73,10 +73,10 @@ void MinKokkos::init()
 void MinKokkos::setup(int flag)
 {
   if (comm->me == 0 && screen) {
-    fmt::print(screen,"Setting up {} style minimization ...\n", update->minimize_style);
+    utils::print(screen,"Setting up {} style minimization ...\n", update->minimize_style);
     if (flag) {
-      fmt::print(screen,"  Unit style    : {}\n", update->unit_style);
-      fmt::print(screen,"  Current step  : {}\n", update->ntimestep);
+      utils::print(screen,"  Unit style    : {}\n", update->unit_style);
+      utils::print(screen,"  Current step  : {}\n", update->ntimestep);
       timer->print_timeout(screen);
     }
   }
diff --git a/src/KOKKOS/mliap_data_kokkos.cpp b/src/KOKKOS/mliap_data_kokkos.cpp
index fd5a852114..fd2859f802 100644
--- a/src/KOKKOS/mliap_data_kokkos.cpp
+++ b/src/KOKKOS/mliap_data_kokkos.cpp
@@ -145,13 +145,13 @@ void MLIAPDataKokkos<DeviceType>::generate_neighdata(class NeighList *list_in, i
   auto type = atomKK->k_type.view<DeviceType>();
   auto map=k_pairmliap->k_map.template view<DeviceType>();
 
-  Kokkos::parallel_scan(natomneigh, KOKKOS_LAMBDA (int ii, int &update, const bool final) {
+  Kokkos::parallel_scan(Kokkos::RangePolicy<DeviceType>(0,natomneigh), KOKKOS_LAMBDA (int ii, int &update, const bool final) {
     if (final)
       d_ij(ii) = update;
     update += d_numneighs(ii);
   });
 
-  Kokkos::parallel_for(natomneigh, KOKKOS_LAMBDA (int ii)  {
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,natomneigh), KOKKOS_LAMBDA (int ii)  {
     int ij = d_ij(ii);
     const int i = d_ilist[ii];
     const double xtmp = x(i, 0);
@@ -183,7 +183,7 @@ void MLIAPDataKokkos<DeviceType>::generate_neighdata(class NeighList *list_in, i
     d_ielems[ii] = ielem;
   });
 
-  Kokkos::parallel_for(nmax, KOKKOS_LAMBDA (int i)  {
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nmax), KOKKOS_LAMBDA (int i)  {
     const int itype = type(i);
     d_elems(i) = map(itype);
   });
@@ -225,7 +225,7 @@ void MLIAPDataKokkos<DeviceType>::grow_neigharrays() {
   auto d_cutsq=k_pairmliap->k_cutsq.template view<DeviceType>();
   auto h_cutsq=k_pairmliap->k_cutsq.template view<LMPHostType>();
   auto d_numneighs = k_numneighs.template view<DeviceType>();
-  Kokkos::parallel_reduce(natomneigh, KOKKOS_LAMBDA (int ii, int &contrib) {
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,natomneigh), KOKKOS_LAMBDA (int ii, int &contrib) {
     const int i = d_ilist[ii];
     int count=0;
     const double xtmp = x(i, 0);
diff --git a/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp b/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp
index 1cf368e952..9f18078ac6 100644
--- a/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp
+++ b/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp
@@ -75,7 +75,7 @@ void MLIAPDescriptorSO3Kokkos<DeviceType>::compute_forces(class MLIAPData *data_
   Kokkos::View<double[6], DeviceType> virial("virial");
   data->k_pairmliap->k_vatom.template modify<LMPHostType>();
   data->k_pairmliap->k_vatom.template sync<DeviceType>();
-  Kokkos::parallel_for(data->nlistatoms, KOKKOS_LAMBDA(int ii) {
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,data->nlistatoms), KOKKOS_LAMBDA(int ii) {
     double fij[3];
     const int i = d_iatoms(ii);
 
@@ -187,7 +187,7 @@ void MLIAPDescriptorSO3Kokkos<DeviceType>::compute_force_gradients(class MLIAPDa
 
   auto yoffset = data->yoffset, zoffset = data->zoffset, gamma_nnz = data->gamma_nnz;
 
-  Kokkos::parallel_for (data->nlistatoms, KOKKOS_LAMBDA (int ii) {
+  Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType>(0,data->nlistatoms), KOKKOS_LAMBDA (int ii) {
     const int i = d_iatoms(ii);
 
     // ensure rij, inside, wj, and rcutij are of size jnum
diff --git a/src/KOKKOS/pair_meam_kokkos.cpp b/src/KOKKOS/pair_meam_kokkos.cpp
index 9082c410e0..9852727855 100644
--- a/src/KOKKOS/pair_meam_kokkos.cpp
+++ b/src/KOKKOS/pair_meam_kokkos.cpp
@@ -147,7 +147,7 @@ void PairMEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     auto l_numneigh_half = d_numneigh_half;
     auto l_offset = d_offset;
 
-    Kokkos::parallel_scan(inum_half, LAMMPS_LAMBDA(int ii, int &m_fill, bool final) {
+    Kokkos::parallel_scan(Kokkos::RangePolicy<DeviceType>(0,inum_half), LAMMPS_LAMBDA(int ii, int &m_fill, bool final) {
       int i = l_ilist_half[ii];
       m_fill += l_numneigh_half[i];
       if (final)
diff --git a/src/KOKKOS/pair_mliap_kokkos.cpp b/src/KOKKOS/pair_mliap_kokkos.cpp
index 5739a8ea2e..599c49f523 100644
--- a/src/KOKKOS/pair_mliap_kokkos.cpp
+++ b/src/KOKKOS/pair_mliap_kokkos.cpp
@@ -240,6 +240,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
       if (strcmp(elemname,descriptor->elements[jelem]) == 0)
         break;
 
+    //printf(">>> nelements: %d\n", descriptor->nelements);
     if (jelem < descriptor->nelements)
       map[i] = jelem;
     else if (strcmp(elemname,"NULL") == 0) map[i] = -1;
@@ -302,7 +303,7 @@ void PairMLIAPKokkos<DeviceType>::e_tally(MLIAPData* data)
     auto d_iatoms = k_data->k_iatoms.template view<DeviceType>();
     auto d_eatoms = k_data->k_eatoms.template view<DeviceType>();
     auto d_eatom = k_eatom.template view<DeviceType>();
-    Kokkos::parallel_for(data->nlistatoms, KOKKOS_LAMBDA (int ii) {
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,data->nlistatoms), KOKKOS_LAMBDA (int ii) {
       d_eatom(d_iatoms(ii)) = d_eatoms(ii);
     });
     k_eatom.modify<DeviceType>();
diff --git a/src/KOKKOS/pair_pod_kokkos.cpp b/src/KOKKOS/pair_pod_kokkos.cpp
index b0cab5a1df..85b9802034 100644
--- a/src/KOKKOS/pair_pod_kokkos.cpp
+++ b/src/KOKKOS/pair_pod_kokkos.cpp
@@ -532,7 +532,7 @@ int PairPODKokkos<DeviceType>::NeighborCount(t_pod_1i l_numij, double l_rcutsq,
   auto l_neighbors = d_neighbors;
 
   // compute number of pairs for each atom i
-  Kokkos::parallel_for("NeighborCount", Kokkos::TeamPolicy<>(Ni, Kokkos::AUTO), KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) {
+  Kokkos::parallel_for("NeighborCount", typename Kokkos::TeamPolicy<DeviceType>(Ni, Kokkos::AUTO), KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team) {
     int i = team.league_rank();
     int gi = l_ilist(gi1 + i);
     double xi0 = l_x(gi, 0);
@@ -555,7 +555,7 @@ int PairPODKokkos<DeviceType>::NeighborCount(t_pod_1i l_numij, double l_rcutsq,
   });
 
   // accumalative sum
-  Kokkos::parallel_scan("InclusivePrefixSum", Ni + 1, KOKKOS_LAMBDA(int i, int& update, const bool final) {
+  Kokkos::parallel_scan("InclusivePrefixSum", Kokkos::RangePolicy<DeviceType>(0,Ni + 1), KOKKOS_LAMBDA(int i, int& update, const bool final) {
     if (i > 0) {
       update += l_numij(i);
       if (final) {
@@ -582,7 +582,7 @@ void PairPODKokkos<DeviceType>::NeighborList(t_pod_1d l_rij, t_pod_1i l_numij,
   auto l_map = d_map;
   auto l_type = type;
 
-  Kokkos::parallel_for("NeighborList", Kokkos::TeamPolicy<>(Ni, Kokkos::AUTO), KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) {
+  Kokkos::parallel_for("NeighborList", typename Kokkos::TeamPolicy<DeviceType>(Ni, Kokkos::AUTO), KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team) {
     int i = team.league_rank();
     int gi = l_ilist(gi1 + i);
     double xi0 = l_x(gi, 0);
@@ -622,7 +622,7 @@ void PairPODKokkos<DeviceType>::radialbasis(t_pod_1d rbft, t_pod_1d rbftx, t_pod
     t_pod_1d l_rij, t_pod_1d l_besselparams, double l_rin, double l_rmax, int l_besseldegree,
     int l_inversedegree, int l_nbesselpars, int Nij)
 {
-  Kokkos::parallel_for("ComputeRadialBasis", Nij, KOKKOS_LAMBDA(int n) {
+  Kokkos::parallel_for("ComputeRadialBasis", Kokkos::RangePolicy<DeviceType>(0,Nij), KOKKOS_LAMBDA(int n) {
     double xij1 = l_rij(0+3*n);
     double xij2 = l_rij(1+3*n);
     double xij3 = l_rij(2+3*n);
@@ -722,7 +722,7 @@ void PairPODKokkos<DeviceType>::radialbasis(t_pod_1d rbft, t_pod_1d rbftx, t_pod
 template<class DeviceType>
 void PairPODKokkos<DeviceType>::matrixMultiply(t_pod_1d a, t_pod_1d b, t_pod_1d c, int r1, int c1, int c2)
 {
-    Kokkos::parallel_for("MatrixMultiply", r1 * c2, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("MatrixMultiply", Kokkos::RangePolicy<DeviceType>(0,r1 * c2), KOKKOS_LAMBDA(int idx) {
         int j = idx / r1;  // Calculate column index
         int i = idx % r1;  // Calculate row index
         double sum = 0.0;
@@ -737,7 +737,7 @@ template<class DeviceType>
 void PairPODKokkos<DeviceType>::angularbasis(t_pod_1d l_abf, t_pod_1d l_abfx, t_pod_1d l_abfy, t_pod_1d l_abfz,
         t_pod_1d l_rij, t_pod_1i l_pq3, int l_K3, int N)
 {
-  Kokkos::parallel_for("AngularBasis", N, KOKKOS_LAMBDA(int j) {
+  Kokkos::parallel_for("AngularBasis", Kokkos::RangePolicy<DeviceType>(0,N), KOKKOS_LAMBDA(int j) {
     double x = l_rij(j*3 + 0);
     double y = l_rij(j*3 + 1);
     double z = l_rij(j*3 + 2);
@@ -817,7 +817,7 @@ void PairPODKokkos<DeviceType>::radialangularsum(t_pod_1d l_sumU, t_pod_1d l_rbf
 {
   int totalIterations = l_nrbf3 * l_K3 * Ni;
   if (l_nelements==1) {
-    Kokkos::parallel_for("RadialAngularSum", totalIterations, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("RadialAngularSum", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
       int k = idx % l_K3;
       int temp = idx / l_K3;
       int m = temp % l_nrbf3;
@@ -835,7 +835,7 @@ void PairPODKokkos<DeviceType>::radialangularsum(t_pod_1d l_sumU, t_pod_1d l_rbf
     });
   }
   else {
-    Kokkos::parallel_for("RadialAngularSum", totalIterations, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("RadialAngularSum", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
       int k = idx % l_K3;
       int temp = idx / l_K3;
       int m = temp % l_nrbf3;
@@ -863,7 +863,7 @@ void PairPODKokkos<DeviceType>::twobodydesc(t_pod_1d d2,  t_pod_1d l_rbf, t_pod_
         int l_nrbf2, const int Ni, const int Nij)
 {
   int totalIterations = l_nrbf2 * Nij;
-  Kokkos::parallel_for("twobodydesc", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("twobodydesc", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int n = idx / l_nrbf2; // pair index
     int m = idx % l_nrbf2; // rbd index
     int i2 = n + Nij * m; // Index of the radial basis function for atom n and RBF m
@@ -876,7 +876,7 @@ void PairPODKokkos<DeviceType>::twobody_forces(t_pod_1d fij, t_pod_1d cb2, t_pod
         t_pod_1d l_rbfz, t_pod_1i l_idxi, t_pod_1i l_tj, int l_nrbf2, const int Ni, const int Nij)
 {
   int totalIterations = l_nrbf2 * Nij;
-  Kokkos::parallel_for("twobody_forces", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("twobody_forces", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int n = idx / l_nrbf2; // pair index
     int m = idx % l_nrbf2; // rbd index
     int i2 = n + Nij * m; // Index of the radial basis function for atom n and RBF m
@@ -893,7 +893,7 @@ void PairPODKokkos<DeviceType>::threebodydesc(t_pod_1d d3, t_pod_1d l_sumU, t_po
         int l_nelements, int l_nrbf3, int l_nabf3, int l_K3, const int Ni)
 {
   int totalIterations = l_nrbf3 * Ni;
-  Kokkos::parallel_for("ThreeBodyDesc", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("ThreeBodyDesc", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int m = idx % l_nrbf3;
     int i = idx / l_nrbf3;
     int nmi = l_nelements * l_K3 * m + l_nelements * l_K3 * l_nrbf3*i;
@@ -925,7 +925,7 @@ void PairPODKokkos<DeviceType>::threebody_forces(t_pod_1d fij, t_pod_1d cb3, t_p
 {
   int totalIterations = l_nrbf3 * Nij;
   if (l_nelements==1) {
-    Kokkos::parallel_for("threebody_forces1", totalIterations, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("threebody_forces1", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
       int j = idx / l_nrbf3;       // Calculate j using integer division
       int m = idx % l_nrbf3;       // Calculate m using modulo operation
       int idxR = j + Nij * m;  // Pre-compute the index for rbf
@@ -961,7 +961,7 @@ void PairPODKokkos<DeviceType>::threebody_forces(t_pod_1d fij, t_pod_1d cb3, t_p
   }
   else {
     int N3 = Ni *  l_nabf3 * l_nrbf3;
-    Kokkos::parallel_for("threebody_forces2", totalIterations, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("threebody_forces2", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
       int j = idx / l_nrbf3;  // Derive the original j value
       int m = idx % l_nrbf3;  // Derive the original m value
       int i2 = l_tj(j) - 1;
@@ -1007,7 +1007,7 @@ void PairPODKokkos<DeviceType>::threebody_forcecoeff(t_pod_1d fb3, t_pod_1d cb3,
 {
   int totalIterations = l_nrbf3 * Ni;
   if (l_nelements==1) {
-    Kokkos::parallel_for("threebody_forcecoeff1", totalIterations, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("threebody_forcecoeff1", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
       int i = idx / l_nrbf3;       // Calculate j using integer division
       int m = idx % l_nrbf3;       // Calculate m using modulo operation
       for (int p = 0; p < l_nabf3; p++) {
@@ -1024,7 +1024,7 @@ void PairPODKokkos<DeviceType>::threebody_forcecoeff(t_pod_1d fb3, t_pod_1d cb3,
   }
   else {
     int N3 = Ni *  l_nabf3 * l_nrbf3;
-    Kokkos::parallel_for("threebody_forcecoeff2", totalIterations, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("threebody_forcecoeff2", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
       int i = idx / l_nrbf3;  // Derive the original j value
       int m = idx % l_nrbf3;  // Derive the original m value
       for (int p = 0; p < l_nabf3; p++) {
@@ -1054,7 +1054,7 @@ void PairPODKokkos<DeviceType>::fourbodydesc(t_pod_1d d4,  t_pod_1d l_sumU, t_po
     t_pod_1i l_pc4, int l_nelements, int l_nrbf3, int l_nrbf4, int l_nabf4, int l_K3, int l_Q4, int Ni)
 {
   int totalIterations = l_nrbf4 * Ni;
-  Kokkos::parallel_for("fourbodydesc", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("fourbodydesc", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int m = idx % l_nrbf4;
     int i = idx / l_nrbf4;
     int idxU = l_nelements * l_K3 * m + l_nelements * l_K3 * l_nrbf3 * i;
@@ -1092,7 +1092,7 @@ void PairPODKokkos<DeviceType>::fourbody_forces(t_pod_1d fij, t_pod_1d cb4, t_po
 {
   int totalIterations = l_nrbf4 * Nij;
   if (l_nelements==1) {
-    Kokkos::parallel_for("fourbody_forces1", totalIterations, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("fourbody_forces1", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
       int j = idx / l_nrbf4;  // Derive the original j value
       int m = idx % l_nrbf4;  // Derive the original m value
       int idxU = l_K3 * m + l_K3*l_nrbf3*l_idxi(j);
@@ -1151,7 +1151,7 @@ void PairPODKokkos<DeviceType>::fourbody_forces(t_pod_1d fij, t_pod_1d cb4, t_po
   }
   else {
     int N3 = Ni * l_nabf4 * l_nrbf4;
-    Kokkos::parallel_for("fourbody_forces2", totalIterations, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("fourbody_forces2", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
       int j = idx / l_nrbf4;  // Derive the original j value
       int m = idx % l_nrbf4;  // Derive the original m value
       int idxM = j + Nij * m;
@@ -1241,7 +1241,7 @@ void PairPODKokkos<DeviceType>::fourbody_forcecoeff(t_pod_1d fb4, t_pod_1d cb4,
 {
   int totalIterations = l_nrbf4 * Ni;
   if (l_nelements==1) {
-    Kokkos::parallel_for("fourbody_forcecoeff1", totalIterations, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("fourbody_forcecoeff1", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
       int i = idx / l_nrbf4;  // Derive the original j value
       int m = idx % l_nrbf4;  // Derive the original m value
       int idxU = l_K3 * m + l_K3*l_nrbf3*i;
@@ -1268,7 +1268,7 @@ void PairPODKokkos<DeviceType>::fourbody_forcecoeff(t_pod_1d fb4, t_pod_1d cb4,
   }
   else {
     int N3 = Ni * l_nabf4 * l_nrbf4;
-    Kokkos::parallel_for("fourbody_forcecoeff2", totalIterations, KOKKOS_LAMBDA(int idx) {
+    Kokkos::parallel_for("fourbody_forcecoeff2", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
       int i = idx / l_nrbf4;  // Derive the original j value
       int m = idx % l_nrbf4;  // Derive the original m value
       for (int p = 0; p < l_nabf4; p++)  {
@@ -1311,7 +1311,7 @@ void PairPODKokkos<DeviceType>::allbody_forces(t_pod_1d fij, t_pod_1d l_forcecoe
     t_pod_1i l_idxi, t_pod_1i l_tj, int l_nelements, int l_nrbf3, int l_K3, int Nij)
 {
   int totalIterations = l_nrbf3 * Nij;
-  Kokkos::parallel_for("allbody_forces", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("allbody_forces", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int j = idx / l_nrbf3;       // Calculate j using integer division
     int m = idx % l_nrbf3;       // Calculate m using modulo operation
     int i2 = l_tj(j) - 1;
@@ -1346,7 +1346,7 @@ template<class DeviceType>
 void PairPODKokkos<DeviceType>::crossdesc(t_pod_1d d12, t_pod_1d d1, t_pod_1d d2, t_pod_1i ind1, t_pod_1i ind2, int n12, int Ni)
 {
   int totalIterations = n12 * Ni;
-  Kokkos::parallel_for("crossdesc", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("crossdesc", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int n = idx % Ni;
     int i = idx / Ni;
 
@@ -1359,7 +1359,7 @@ void PairPODKokkos<DeviceType>::crossdesc_reduction(t_pod_1d cb1, t_pod_1d cb2,
         t_pod_1d d2, t_pod_1i ind1, t_pod_1i ind2, int n12, int Ni)
 {
   int totalIterations = n12 * Ni;
-  Kokkos::parallel_for("crossdesc_reduction", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("crossdesc_reduction", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int n = idx % Ni; // Ni
     int m = idx / Ni; // n12
     int k1 = ind1(m); // dd1
@@ -1375,7 +1375,7 @@ void PairPODKokkos<DeviceType>::crossdesc_reduction(t_pod_1d cb1, t_pod_1d cb2,
 template<class DeviceType>
 void PairPODKokkos<DeviceType>::set_array_to_zero(t_pod_1d a, int N)
 {
-  Kokkos::parallel_for("initialize_array", N, KOKKOS_LAMBDA(int i) {
+  Kokkos::parallel_for("initialize_array", Kokkos::RangePolicy<DeviceType>(0,N), KOKKOS_LAMBDA(int i) {
     a(i) = 0.0;
   });
 }
@@ -1480,7 +1480,7 @@ void PairPODKokkos<DeviceType>::blockatom_base_coefficients(t_pod_1d ei, t_pod_1
   int nDes = Mdesc;
   int nCoeff = nCoeffPerElement;
 
-  Kokkos::parallel_for("atomic_energies", Ni, KOKKOS_LAMBDA(int n) {
+  Kokkos::parallel_for("atomic_energies", Kokkos::RangePolicy<DeviceType>(0,Ni), KOKKOS_LAMBDA(int n) {
     int nc = nCoeff*(tyai[n]-1);
     ei[n] = cefs[0 + nc];
     for (int m=0; m<nDes; m++)
@@ -1488,7 +1488,7 @@ void PairPODKokkos<DeviceType>::blockatom_base_coefficients(t_pod_1d ei, t_pod_1
   });
 
   int totalIterations = Ni*nDes;
-  Kokkos::parallel_for("base_coefficients", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("base_coefficients", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int n = idx % Ni;
     int m = idx / Ni;
     int nc = nCoeff*(tyai[n]-1);
@@ -1516,7 +1516,7 @@ void PairPODKokkos<DeviceType>::blockatom_environment_descriptors(t_pod_1d ei, t
   int nCoeff = nCoeffPerElement;
 
   int totalIterations = Ni*nCom;
-  Kokkos::parallel_for("pca", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("pca", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int i = idx % Ni;
     int k = idx / Ni;
     double sum = 0.0;
@@ -1528,7 +1528,7 @@ void PairPODKokkos<DeviceType>::blockatom_environment_descriptors(t_pod_1d ei, t
   });
 
   totalIterations = Ni*nCls;
-  Kokkos::parallel_for("inverse_square_distances", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("inverse_square_distances", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int i = idx % Ni;
     int j = idx / Ni;
     int typei = tyai[i]-1;
@@ -1541,14 +1541,14 @@ void PairPODKokkos<DeviceType>::blockatom_environment_descriptors(t_pod_1d ei, t
     D[i + Ni*j] = 1.0 / sum;
   });
 
-  Kokkos::parallel_for("Probabilities", Ni, KOKKOS_LAMBDA(int i) {
+  Kokkos::parallel_for("Probabilities", Kokkos::RangePolicy<DeviceType>(0,Ni), KOKKOS_LAMBDA(int i) {
     double sum = 0;
     for (int j = 0; j < nCls; j++) sum += D[i + Ni*j];
     sumD[i] = sum;
     for (int j = 0; j < nCls; j++) P[i + Ni*j] = D[i + Ni*j]/sum;
   });
 
-  Kokkos::parallel_for("atomic_energies", Ni, KOKKOS_LAMBDA(int n) {
+  Kokkos::parallel_for("atomic_energies", Kokkos::RangePolicy<DeviceType>(0,Ni), KOKKOS_LAMBDA(int n) {
     int nc = nCoeff*(tyai[n]-1);
     ei[n] = cefs[0 + nc];
     for (int k = 0; k<nCls; k++)
@@ -1556,7 +1556,7 @@ void PairPODKokkos<DeviceType>::blockatom_environment_descriptors(t_pod_1d ei, t
         ei[n] += cefs[1 + m + nDes*k + nc]*B[n + Ni*m]*P[n + Ni*k];
   });
 
-  Kokkos::parallel_for("env_coefficients", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("env_coefficients", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int n = idx % Ni;
     int k = idx / Ni;
     int nc = nCoeff*(tyai[n]-1);
@@ -1567,7 +1567,7 @@ void PairPODKokkos<DeviceType>::blockatom_environment_descriptors(t_pod_1d ei, t
   });
 
   totalIterations = Ni*nDes;
-  Kokkos::parallel_for("base_coefficients", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("base_coefficients", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int n = idx % Ni;
     int m = idx / Ni;
     int nc = nCoeff*(tyai[n]-1);
@@ -1577,7 +1577,7 @@ void PairPODKokkos<DeviceType>::blockatom_environment_descriptors(t_pod_1d ei, t
     cb[n + Ni*m] = sum;
   });
 
-  Kokkos::parallel_for("base_env_coefficients", totalIterations, KOKKOS_LAMBDA(int idx) {
+  Kokkos::parallel_for("base_env_coefficients", Kokkos::RangePolicy<DeviceType>(0,totalIterations), KOKKOS_LAMBDA(int idx) {
     int i = idx % Ni;
     int m = idx / Ni;
     int typei = tyai[i]-1;
@@ -1670,7 +1670,7 @@ template<class DeviceType>
 void PairPODKokkos<DeviceType>::tallyforce(t_pod_1d l_fij, t_pod_1i l_ai, t_pod_1i l_aj, int Nij)
 {
   auto l_f = f;
-  Kokkos::parallel_for("TallyForce", Nij, KOKKOS_LAMBDA(int n) {
+  Kokkos::parallel_for("TallyForce", Kokkos::RangePolicy<DeviceType>(0,Nij), KOKKOS_LAMBDA(int n) {
     int im = l_ai(n);
     int jm = l_aj(n);
     int n3 = 3*n;
@@ -1694,7 +1694,7 @@ void PairPODKokkos<DeviceType>::tallyenergy(t_pod_1d l_ei, int istart, int Ni)
   // For global energy tally
   if (eflag_global) {
     double local_eng_vdwl = 0.0;
-    Kokkos::parallel_reduce("GlobalEnergyTally", Ni, KOKKOS_LAMBDA(int k, E_FLOAT& update) {
+    Kokkos::parallel_reduce("GlobalEnergyTally", Kokkos::RangePolicy<DeviceType>(0,Ni), KOKKOS_LAMBDA(int k, E_FLOAT& update) {
       update += l_ei(k);
     }, local_eng_vdwl);
 
@@ -1704,7 +1704,7 @@ void PairPODKokkos<DeviceType>::tallyenergy(t_pod_1d l_ei, int istart, int Ni)
 
   // For per-atom energy tally
   if (eflag_atom) {
-    Kokkos::parallel_for("PerAtomEnergyTally", Ni, KOKKOS_LAMBDA(int k) {
+    Kokkos::parallel_for("PerAtomEnergyTally", Kokkos::RangePolicy<DeviceType>(0,Ni), KOKKOS_LAMBDA(int k) {
       l_eatom(istart + k) += l_ei(k);
     });
   }
@@ -1718,7 +1718,7 @@ void PairPODKokkos<DeviceType>::tallystress(t_pod_1d l_fij, t_pod_1d l_rij, t_po
   if (vflag_global) {
     for (int j=0; j<3; j++) {
       F_FLOAT sum = 0.0;
-      Kokkos::parallel_reduce("GlobalStressTally", Nij, KOKKOS_LAMBDA(int k, F_FLOAT& update) {
+      Kokkos::parallel_reduce("GlobalStressTally", Kokkos::RangePolicy<DeviceType>(0,Nij), KOKKOS_LAMBDA(int k, F_FLOAT& update) {
         int k3 = 3*k;
         update += l_rij(j + k3) * l_fij(j + k3);
       }, sum);
@@ -1726,21 +1726,21 @@ void PairPODKokkos<DeviceType>::tallystress(t_pod_1d l_fij, t_pod_1d l_rij, t_po
     }
 
     F_FLOAT sum = 0.0;
-    Kokkos::parallel_reduce("GlobalStressTally", Nij, KOKKOS_LAMBDA(int k, F_FLOAT& update) {
+    Kokkos::parallel_reduce("GlobalStressTally", Kokkos::RangePolicy<DeviceType>(0,Nij), KOKKOS_LAMBDA(int k, F_FLOAT& update) {
       int k3 = 3*k;
       update += l_rij(k3) * l_fij(1 + k3);
     }, sum);
     virial[3] -= sum;
 
     sum = 0.0;
-    Kokkos::parallel_reduce("GlobalStressTally", Nij, KOKKOS_LAMBDA(int k, F_FLOAT& update) {
+    Kokkos::parallel_reduce("GlobalStressTally", Kokkos::RangePolicy<DeviceType>(0,Nij), KOKKOS_LAMBDA(int k, F_FLOAT& update) {
       int k3 = 3*k;
       update += l_rij(k3) * l_fij(2 + k3);
     }, sum);
     virial[4] -= sum;
 
     sum = 0.0;
-    Kokkos::parallel_reduce("GlobalStressTally", Nij, KOKKOS_LAMBDA(int k, F_FLOAT& update) {
+    Kokkos::parallel_reduce("GlobalStressTally", Kokkos::RangePolicy<DeviceType>(0,Nij), KOKKOS_LAMBDA(int k, F_FLOAT& update) {
       int k3 = 3*k;
       update += l_rij(1+k3) * l_fij(2+k3);
     }, sum);
@@ -1748,7 +1748,7 @@ void PairPODKokkos<DeviceType>::tallystress(t_pod_1d l_fij, t_pod_1d l_rij, t_po
   }
 
   if (vflag_atom) {
-    Kokkos::parallel_for("PerAtomStressTally", Nij, KOKKOS_LAMBDA(int k) {
+    Kokkos::parallel_for("PerAtomStressTally", Kokkos::RangePolicy<DeviceType>(0,Nij), KOKKOS_LAMBDA(int k) {
       int i = l_ai(k);
       int j = l_aj(k);
       int k3 = 3*k;
diff --git a/src/KOKKOS/pair_reaxff_kokkos.cpp b/src/KOKKOS/pair_reaxff_kokkos.cpp
index b0a53a27fd..85bd139bfb 100644
--- a/src/KOKKOS/pair_reaxff_kokkos.cpp
+++ b/src/KOKKOS/pair_reaxff_kokkos.cpp
@@ -385,13 +385,13 @@ void PairReaxFFKokkos<DeviceType>::init_md()
   swb = api->control->nonb_cut;
   enobondsflag = api->control->enobondsflag;
 
-  if (fabs(swa) > 0.01)
-    error->warning(FLERR,"Warning: non-zero lower Taper-radius cutoff");
+  if ((fabs(swa) > 0.01) && (comm->me == 0))
+    error->warning(FLERR, "Non-zero lower Taper-radius cutoff");
 
-  if (swb < 0)
-    error->one(FLERR,"Negative upper Taper-radius cutoff");
-  else if (swb < 5)
-    error->one(FLERR,"Warning: very low Taper-radius cutoff: {}\n", swb);
+  if (swb < 0.0) {
+    error->all(FLERR,"Negative upper Taper-radius cutoff");
+  } else if ((swb < 5.0) && (comm->me ==0))
+    error->warning(FLERR,"Very low Taper-radius cutoff: {}\n", swb);
 
   d1 = swb - swa;
   d7 = powint(d1,7);
diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h
index 660503eed8..4dc4029d12 100644
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@@ -375,7 +375,6 @@ class PairSNAPKokkos : public PairSNAP {
 
   // Make SNAKokkos a friend
   friend class SNAKokkos<DeviceType, real_type, vector_length>;
-
 };
 
 
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index 2b9b862645..17ce8e1c9d 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -3,12 +3,10 @@
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
    LAMMPS development team: developers@lammps.org
-
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
@@ -39,17 +37,6 @@
 
 namespace LAMMPS_NS {
 
-// Outstanding issues with quadratic term
-// 1. there seems to a problem with compute_optimized energy calc
-// it does not match compute_regular, even when quadratic coeffs = 0
-
-//static double t1 = 0.0;
-//static double t2 = 0.0;
-//static double t3 = 0.0;
-//static double t4 = 0.0;
-//static double t5 = 0.0;
-//static double t6 = 0.0;
-//static double t7 = 0.0;
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType, typename real_type, int vector_length>
@@ -219,7 +206,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
       // team_size_compute_neigh is defined in `pair_snap_kokkos.h`
       int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs);
 
-      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagPairSNAPComputeNeigh>
+        policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
       policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
       Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
     }
@@ -259,7 +247,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
         const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
         const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
 
-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
         policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
         Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this);
       } else {
@@ -269,7 +258,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
         const int n_teams = chunk_size_div * max_neighs;
         const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
 
-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
         policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
         Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this);
       }
@@ -536,8 +526,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::coeff(int narg, char
   Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
   Kokkos::deep_copy(d_map,h_map);
 
-  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this); //rfac0,twojmax,
-    //rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
   snaKK.grow_rij(0,0);
   snaKK.init();
 }
diff --git a/src/KOKKOS/rand_pool_wrap_kokkos.cpp b/src/KOKKOS/rand_pool_wrap_kokkos.cpp
index efdd932987..eb2caf879f 100644
--- a/src/KOKKOS/rand_pool_wrap_kokkos.cpp
+++ b/src/KOKKOS/rand_pool_wrap_kokkos.cpp
@@ -39,7 +39,7 @@ RandPoolWrap::~RandPoolWrap()
 void RandPoolWrap::destroy()
 {
   if (random_thr) {
-    for (int i=1; i < nthreads; ++i)
+    for (int i = 1; i < nthreads; ++i)
       delete random_thr[i];
 
     delete[] random_thr;
@@ -51,7 +51,7 @@ void RandPoolWrap::init(RanMars* random, int seed)
 {
   // deallocate pool of RNGs
   if (random_thr) {
-    for (int i=1; i < this->nthreads; ++i)
+    for (int i = 1; i < nthreads; ++i)
       delete random_thr[i];
 
     delete[] random_thr;
diff --git a/src/KOKKOS/rand_pool_wrap_kokkos.h b/src/KOKKOS/rand_pool_wrap_kokkos.h
index f79a6a1caa..60e9776039 100644
--- a/src/KOKKOS/rand_pool_wrap_kokkos.h
+++ b/src/KOKKOS/rand_pool_wrap_kokkos.h
@@ -59,10 +59,12 @@ class RandPoolWrap : protected Pointers {
     typedef Kokkos::Experimental::UniqueToken<
       LMPHostType, Kokkos::Experimental::UniqueTokenScope::Global> unique_token_type;
 
+#ifndef LMP_KOKKOS_GPU
     unique_token_type unique_token;
     int tid = (int) unique_token.acquire();
     rand_wrap.rng = random_thr[tid];
     unique_token.release(tid);
+#endif
 
     return rand_wrap;
   }
diff --git a/src/KOKKOS/region_sphere_kokkos.h b/src/KOKKOS/region_sphere_kokkos.h
index 08951138c3..575ca9c2d3 100644
--- a/src/KOKKOS/region_sphere_kokkos.h
+++ b/src/KOKKOS/region_sphere_kokkos.h
@@ -63,10 +63,9 @@ class RegSphereKokkos : public RegSphere, public KokkosBase  {
     double xs, ys, zs;
     double xnear[3], xorig[3];
 
-    if (dynamic) {
-      xorig[0] = x; xorig[1] = y; xorig[2] = z;
+    xorig[0] = x; xorig[1] = y; xorig[2] = z;
+    if (dynamic)
       inverse_transform(x, y, z);
-    }
 
     xnear[0] = x; xnear[1] = y; xnear[2] = z;
 
diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h
index a438ccd25e..61aebaf97d 100644
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@@ -29,7 +29,9 @@
 #endif
 
 namespace LAMMPS_NS {
-
+// copied from pair_snap_kokkos.h
+// pre-declare so sna_kokkos.h can refer to it
+template<class DeviceType, typename real_type_, int vector_length_> class PairSNAPKokkos;
 template<typename real_type_, int vector_length_>
 struct WignerWrapper {
   using real_type = real_type_;
@@ -170,9 +172,9 @@ class SNAKokkos {
   KOKKOS_INLINE_FUNCTION
   SNAKokkos(const SNAKokkos<DeviceType,real_type,vector_length>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);
 
+  template<class CopyClass>
   inline
-  //SNAKokkos(real_type, int, real_type, int, int, int, int, int, int, int);
-  SNAKokkos(const PairSNAPKokkos<DeviceType, real_type, vector_length>&);
+  SNAKokkos(const CopyClass&);
 
   KOKKOS_INLINE_FUNCTION
   ~SNAKokkos();
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 9a97f229b5..622ef0b8ae 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -29,17 +29,18 @@ static const double MY_PI  = 3.14159265358979323846; // pi
 static const double MY_PI2  = 1.57079632679489661923; // pi/2
 
 template<class DeviceType, typename real_type, int vector_length>
+template<class CopyClass>
 inline
-SNAKokkos<DeviceType, real_type, vector_length>::SNAKokkos(const PairSNAPKokkos<DeviceType, real_type, vector_length>& psk)
-  : rfac0(psk.rfac0), rmin0(psk.rmin0), switch_flag(psk.switchflag),
-    bzero_flag(psk.bzeroflag), chem_flag(psk.chemflag), bnorm_flag(psk.bnormflag),
-    wselfall_flag(psk.wselfallflag), switch_inner_flag(psk.switchinnerflag),
-    quadratic_flag(psk.quadraticflag), twojmax(psk.twojmax), d_coeffelem(psk.d_coeffelem)
+SNAKokkos<DeviceType, real_type, vector_length>::SNAKokkos(const CopyClass& copy)
+  : twojmax(copy.twojmax), d_coeffelem(copy.d_coeffelem), rmin0(copy.rmin0),
+    rfac0(copy.rfac0), switch_flag(copy.switchflag), switch_inner_flag(copy.switchinnerflag),
+    chem_flag(copy.chemflag), bnorm_flag(copy.bnormflag), wselfall_flag(copy.wselfallflag),
+    quadratic_flag(copy.quadraticflag), bzero_flag(copy.bzeroflag)
 {
   wself = static_cast<real_type>(1.0);
 
   if (chem_flag)
-    nelements = psk.nelements;
+    nelements = copy.nelements;
   else
     nelements = 1;
 
@@ -611,7 +612,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
     }
 
     ulist_wrapper.set(ma, ulist_accum);
-
     mb++;
   }
 
@@ -830,7 +830,6 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
   int jju1 = idxu_block[j1] + (j1+1)*mb1min;
   int jju2 = idxu_block[j2] + (j2+1)*mb2max;
   int icgb = mb1min*(j2+1) + mb2max;
-
   #ifdef LMP_KK_DEVICE_COMPILE
   #pragma unroll
   #endif
diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp
index d839362aa5..a492b5a9c9 100644
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@@ -77,7 +77,7 @@ void VerletKokkos::setup(int flag)
   if (comm->me == 0 && screen) {
     fputs("Setting up Verlet run ...\n",screen);
     if (flag) {
-      fmt::print(screen,"  Unit style    : {}\n"
+      utils::print(screen,"  Unit style    : {}\n"
                         "  Current step  : {}\n"
                         "  Time step     : {}\n",
                  update->unit_style,update->ntimestep,update->dt);
diff --git a/src/KSPACE/pair_lj_long_coul_long.cpp b/src/KSPACE/pair_lj_long_coul_long.cpp
index 5f3c0327db..2955b24fe8 100644
--- a/src/KSPACE/pair_lj_long_coul_long.cpp
+++ b/src/KSPACE/pair_lj_long_coul_long.cpp
@@ -410,7 +410,7 @@ void PairLJLongCoulLong::read_restart_settings(FILE *fp)
 void PairLJLongCoulLong::write_data(FILE *fp)
 {
   for (int i = 1; i <= atom->ntypes; i++)
-    fmt::print(fp,"{} {} {}\n",i,epsilon_read[i][i],sigma_read[i][i]);
+    utils::print(fp,"{} {} {}\n",i,epsilon_read[i][i],sigma_read[i][i]);
 }
 
 /* ----------------------------------------------------------------------
@@ -423,10 +423,10 @@ void PairLJLongCoulLong::write_data_all(FILE *fp)
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (ewald_order & (1<<6)) {
-        fmt::print(fp,"{} {} {} {}\n",i,j,
+        utils::print(fp,"{} {} {} {}\n",i,j,
                    epsilon[i][j],sigma[i][j]);
       } else {
-        fmt::print(fp,"{} {} {} {} {}\n",i,j,
+        utils::print(fp,"{} {} {} {} {}\n",i,j,
                    epsilon[i][j],sigma[i][j],cut_lj[i][j]);
       }
     }
diff --git a/src/LATBOLTZ/fix_lb_fluid.cpp b/src/LATBOLTZ/fix_lb_fluid.cpp
index 773f60cf0e..286b56cab5 100644
--- a/src/LATBOLTZ/fix_lb_fluid.cpp
+++ b/src/LATBOLTZ/fix_lb_fluid.cpp
@@ -2364,7 +2364,7 @@ void FixLbFluid::dump(const bigint step)
       bigint offset = frameindex * block * (1 + 3);
       double time = dump_time_index ? update->ntimestep * dt_lb : frameindex;
 
-      fmt::print(dump_file_handle_xdmf,
+      utils::print(dump_file_handle_xdmf,
                  "      <Grid Name=\"{}\">\n"
                  "        <Time Value=\"{:f}\"/>\n\n"
                  "        <Topology TopologyType=\"3DCoRectMesh\" Dimensions=\"{} {} {}\"/>\n"
@@ -2378,7 +2378,7 @@ void FixLbFluid::dump(const bigint step)
                  "        </Geometry>\n\n",
                  step, time, fluid_global_n0[2], fluid_global_n0[1], fluid_global_n0[0],
                  domain->boxlo[2], domain->boxlo[1], domain->boxlo[0], dx_lb, dx_lb, dx_lb);
-      fmt::print(dump_file_handle_xdmf,
+      utils::print(dump_file_handle_xdmf,
                  "        <Attribute Name=\"density\">\n"
                  "          <DataItem ItemType=\"Function\" Function=\"$0 * {:f}\" "
                  "Dimensions=\"{} {} {}\">\n"
@@ -2391,7 +2391,7 @@ void FixLbFluid::dump(const bigint step)
                  dm_lb / (dx_lb * dx_lb * dx_lb), fluid_global_n0[2], fluid_global_n0[1],
                  fluid_global_n0[0], sizeof(double), offset, fluid_global_n0[2], fluid_global_n0[1],
                  fluid_global_n0[0], dump_file_name_raw.c_str());
-      fmt::print(dump_file_handle_xdmf,
+      utils::print(dump_file_handle_xdmf,
                  "        <Attribute Name=\"velocity\" AttributeType=\"Vector\">\n"
                  "          <DataItem ItemType=\"Function\" Function=\"$0 * {:f}\" "
                  "Dimensions=\"{} {} {} 3\">\n"
@@ -2404,7 +2404,7 @@ void FixLbFluid::dump(const bigint step)
                  dx_lb / dt_lb, fluid_global_n0[2], fluid_global_n0[1], fluid_global_n0[0],
                  sizeof(double), offset + block * 1, fluid_global_n0[2], fluid_global_n0[1],
                  fluid_global_n0[0], dump_file_name_raw.c_str());
-      fmt::print(dump_file_handle_xdmf, "      </Grid>\n\n");
+      utils::print(dump_file_handle_xdmf, "      </Grid>\n\n");
 
       frameindex++;
     }
@@ -3726,18 +3726,18 @@ void FixLbFluid::initializeGeometry()
   if (!outfile)
     error->one(FLERR, " file {} could not be opened: {}", datfile, utils::getsyserror());
 
-  fmt::print(outfile, "\n me: {} px: {} py: {} pz: {}\n", me, comm->myloc[0], comm->myloc[1],
+  utils::print(outfile, "\n me: {} px: {} py: {} pz: {}\n", me, comm->myloc[0], comm->myloc[1],
              comm->myloc[2]);
 
   for (i = 0; i < subNbx; i++) {
-    fmt::print(outfile, "i={}\n", i);
+    utils::print(outfile, "i={}\n", i);
     for (k = subNbz - 1; k > -1; k--) {
       if (k == subNbz - 2 || k == 0) {
         for (j = 0; j < subNby + 2; j++) fputs("---", outfile);
         fputs("\n", outfile);
       }
       for (j = 0; j < subNby; j++) {
-        fmt::print(outfile, " {} ", sublattice[i][j][k].type);
+        utils::print(outfile, " {} ", sublattice[i][j][k].type);
         if (j == 0 || j == subNby - 2) fputs(" | ", outfile);
         if (j == subNby - 1) fputs("\n", outfile);
       }
@@ -3754,16 +3754,16 @@ void FixLbFluid::initializeGeometry()
   if (!outfile)
     error->one(FLERR, " file {} could not be opened: {}", datfile, utils::getsyserror());
 
-  fmt::print("\nme: {}\n", me);
+  utils::print("\nme: {}\n", me);
   for (i = 0; i < subNbx; i++) {
-    fmt::print("i={}\n", i);
+    utils::print("i={}\n", i);
     for (k = subNbz - 1; k > -1; k--) {
       if (k == subNbz - 2 || k == 0) {
         for (j = 0; j < subNby + 2; j++) fputs("---", outfile);
         fputs("\bn", outfile);
       }
       for (j = 0; j < subNby; j++) {
-        fmt::print(outfile, " {} ", sublattice[i][j][k].orientation);
+        utils::print(outfile, " {} ", sublattice[i][j][k].orientation);
         if (j == 0 || j == subNby - 2) fputs(" | ", outfile);
         if (j == subNby - 1) fputs("\n", outfile);
       }
diff --git a/src/LEPTON/fix_efield_lepton.cpp b/src/LEPTON/fix_efield_lepton.cpp
index a055c2b15a..a45305aee3 100644
--- a/src/LEPTON/fix_efield_lepton.cpp
+++ b/src/LEPTON/fix_efield_lepton.cpp
@@ -45,7 +45,8 @@ FixEfieldLepton::FixEfieldLepton(LAMMPS *lmp, int narg, char **arg) :
     Fix(lmp, narg, arg), idregion(nullptr), region(nullptr)
 {
   if (domain->xperiodic || domain->yperiodic || domain->zperiodic) {
-    error->warning(FLERR, "Fix {} uses unwrapped coordinates", style);
+    if (comm->me == 0)
+      error->warning(FLERR, "Fix {} uses unwrapped coordinates", style);
   }
   if (narg < 4) utils::missing_cmd_args(FLERR, std::string("fix ") + style, error);
 
@@ -57,6 +58,9 @@ FixEfieldLepton::FixEfieldLepton(LAMMPS *lmp, int narg, char **arg) :
   respa_level_support = 1;
   ilevel_respa = 0;
 
+  qe2f = force->qe2f;
+  mue2e = qe2f;
+
   // optional args
   int iarg = 4;
   while (iarg < narg) {
@@ -65,12 +69,13 @@ FixEfieldLepton::FixEfieldLepton(LAMMPS *lmp, int narg, char **arg) :
         utils::missing_cmd_args(FLERR, std::string("fix ") + style + " region", error);
       region = domain->get_region_by_id(arg[iarg + 1]);
       if (!region) error->all(FLERR, "Region {} for fix {} does not exist", arg[iarg + 1], style);
+      delete[] idregion;
       idregion = utils::strdup(arg[iarg + 1]);
       iarg += 2;
     } else if (strcmp(arg[iarg], "step") == 0) {
       if (iarg + 2 > narg)
         utils::missing_cmd_args(FLERR, std::string("fix ") + style + "step", error);
-      h = utils::numeric(FLERR, arg[iarg+1], false, lmp);
+      h = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
       iarg += 2;
     } else {
       error->all(FLERR, "Unknown keyword for fix {} command: {}", style, arg[iarg]);
@@ -126,15 +131,15 @@ void FixEfieldLepton::init()
   }
 
   if (utils::strmatch(update->integrate_style, "^respa")) {
-    ilevel_respa = (dynamic_cast<Respa *>(update->integrate))->nlevels - 1;
+    auto respa = dynamic_cast<Respa *>(update->integrate);
+    if (respa) ilevel_respa = respa->nlevels - 1;
     if (respa_level >= 0) ilevel_respa = MIN(respa_level, ilevel_respa);
   }
 
-  // unit conversion factors and restrictions (see issue #1377)
+  // unit conversion restrictions (see issue #1377)
   char *unit_style = update->unit_style;
-  qe2f = force->qe2f;
-  mue2e = qe2f;
-  if (strcmp(unit_style, "electron") == 0 || strcmp(unit_style, "micro") == 0 || strcmp(unit_style, "nano") == 0) {
+  if (strcmp(unit_style, "electron") == 0 || strcmp(unit_style, "micro") == 0 ||
+      strcmp(unit_style, "nano") == 0) {
     error->all(FLERR, "Fix {} does not support {} units", style, unit_style);
   }
 }
@@ -145,9 +150,11 @@ void FixEfieldLepton::setup(int vflag)
 {
   if (utils::strmatch(update->integrate_style, "^respa")) {
     auto respa = dynamic_cast<Respa *>(update->integrate);
-    respa->copy_flevel_f(ilevel_respa);
-    post_force_respa(vflag, ilevel_respa, 0);
-    respa->copy_f_flevel(ilevel_respa);
+    if (respa) {
+      respa->copy_flevel_f(ilevel_respa);
+      post_force_respa(vflag, ilevel_respa, 0);
+      respa->copy_f_flevel(ilevel_respa);
+    }
   } else {
     post_force(vflag);
   }
@@ -179,14 +186,14 @@ void FixEfieldLepton::post_force(int vflag)
   auto dphi_x = parsed.differentiate("x").createCompiledExpression();
   auto dphi_y = parsed.differentiate("y").createCompiledExpression();
   auto dphi_z = parsed.differentiate("z").createCompiledExpression();
-  std::array<Lepton::CompiledExpression*, 3> dphis = {&dphi_x, &dphi_y, &dphi_z};
+  std::array<Lepton::CompiledExpression *, 3> dphis = {&dphi_x, &dphi_y, &dphi_z};
 
   // array of vectors of ptrs to Lepton variable references
   std::array<std::vector<double *>, 3> var_ref_ptrs{};
 
   // fill ptr-vectors with Lepton refs as needed
-  const char* DIM_NAMES[] = {"x", "y", "z"};
-  if (atom->q_flag){
+  const char *DIM_NAMES[] = {"x", "y", "z"};
+  if (atom->q_flag) {
     phi = parsed.createCompiledExpression();
     for (size_t d = 0; d < 3; d++) {
       try {
@@ -205,13 +212,13 @@ void FixEfieldLepton::post_force(int vflag)
         double *ptr = &((*dphis[j]).getVariableReference(DIM_NAMES[d]));
         var_ref_ptrs[d].push_back(ptr);
         e_uniform = false;
-      }
-      catch (Lepton::Exception &) {
+      } catch (Lepton::Exception &) {
         // do nothing
       }
     }
   if (!e_uniform && atom->mu_flag && h < 0) {
-    error->all(FLERR, "Fix {} requires keyword `step' for dipoles in a non-uniform electric field", style);
+    error->all(FLERR, "Fix {} requires keyword `step' for dipoles in a non-uniform electric field",
+               style);
   }
 
   // virial setup
@@ -228,7 +235,6 @@ void FixEfieldLepton::post_force(int vflag)
   double ex, ey, ez;
   double fx, fy, fz;
   double v[6], unwrap[3], dstep[3];
-  double xf, yf, zf, xb, yb, zb;
   double exf, eyf, ezf, exb, eyb, ezb;
   double mu_norm, h_mu;
 
@@ -244,9 +250,7 @@ void FixEfieldLepton::post_force(int vflag)
 
       // put unwrapped coords into Lepton variable refs
       for (size_t d = 0; d < 3; d++) {
-        for (auto & var_ref_ptr : var_ref_ptrs[d]) {
-          *var_ref_ptr = unwrap[d];
-        }
+        for (auto &var_ref_ptr : var_ref_ptrs[d]) { *var_ref_ptr = unwrap[d]; }
       }
 
       // evaluate e-field, used by q and mu
@@ -265,8 +269,8 @@ void FixEfieldLepton::post_force(int vflag)
       }
 
       if (atom->mu_flag) {
-      // dipoles
-        mu_norm = sqrt(mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2]);
+        // dipoles
+        mu_norm = sqrt(mu[i][0] * mu[i][0] + mu[i][1] * mu[i][1] + mu[i][2] * mu[i][2]);
         if (mu_norm > EPSILON) {
           // torque = mu cross E
           t[i][0] += mue2e * (ez * mu[i][1] - ey * mu[i][2]);
@@ -285,9 +289,7 @@ void FixEfieldLepton::post_force(int vflag)
 
             // one step forwards, two steps back ;)
             for (size_t d = 0; d < 3; d++) {
-              for (auto & var_ref_ptr : var_ref_ptrs[d]) {
-                *var_ref_ptr += dstep[d];
-              }
+              for (auto &var_ref_ptr : var_ref_ptrs[d]) { *var_ref_ptr += dstep[d]; }
             }
 
             exf = -dphi_x.evaluate();
@@ -295,9 +297,7 @@ void FixEfieldLepton::post_force(int vflag)
             ezf = -dphi_z.evaluate();
 
             for (size_t d = 0; d < 3; d++) {
-              for (auto & var_ref_ptr : var_ref_ptrs[d]) {
-                *var_ref_ptr -= 2*dstep[d];
-              }
+              for (auto &var_ref_ptr : var_ref_ptrs[d]) { *var_ref_ptr -= 2 * dstep[d]; }
             }
 
             exb = -dphi_x.evaluate();
diff --git a/src/MANYBODY/fix_qeq_comb.cpp b/src/MANYBODY/fix_qeq_comb.cpp
index 6edd2fbbb0..88c6fb4be4 100644
--- a/src/MANYBODY/fix_qeq_comb.cpp
+++ b/src/MANYBODY/fix_qeq_comb.cpp
@@ -203,7 +203,7 @@ void FixQEQComb::post_force(int /*vflag*/)
   // charge-equilibration loop
 
   if (me == 0 && fp)
-    fmt::print(fp,"Charge equilibration on step {}\n", update->ntimestep);
+    utils::print(fp,"Charge equilibration on step {}\n", update->ntimestep);
 
   heatpq = 0.05;
   qmass  = 0.016;
@@ -268,7 +268,7 @@ void FixQEQComb::post_force(int /*vflag*/)
     if (enegchk <= precision && enegmax <= 100.0*precision) break;
 
     if (me == 0 && fp)
-      fmt::print(fp,"  iteration: {}, enegtot {:.6g}, "
+      utils::print(fp,"  iteration: {}, enegtot {:.6g}, "
                  "enegmax {:.6g}, fq deviation: {:.6g}\n",
                  iloop,enegtot,enegmax,enegchk);
 
@@ -281,9 +281,9 @@ void FixQEQComb::post_force(int /*vflag*/)
 
   if (me == 0 && fp) {
     if (iloop == loopmax)
-      fmt::print(fp,"Charges did not converge in {} iterations\n",iloop);
+      utils::print(fp,"Charges did not converge in {} iterations\n",iloop);
     else
-      fmt::print(fp,"Charges converged in {} iterations to {:.10f} tolerance\n",
+      utils::print(fp,"Charges converged in {} iterations to {:.10f} tolerance\n",
                  iloop,enegchk);
   }
 }
diff --git a/src/MANYBODY/pair_comb3.cpp b/src/MANYBODY/pair_comb3.cpp
index b4228dbb4f..3a02ed73b1 100644
--- a/src/MANYBODY/pair_comb3.cpp
+++ b/src/MANYBODY/pair_comb3.cpp
@@ -164,7 +164,7 @@ void PairComb3::settings(int narg, char **arg)
   else error->all(FLERR,"Illegal pair_style command");
 
   if (comm->me == 0 && screen)
-    fmt::print(screen,"   PairComb3: polarization is {} \n",
+    utils::print(screen,"   PairComb3: polarization is {} \n",
                pol_flag ? "on" : "off");
 }
 
diff --git a/src/MEAM/meam_funcs.cpp b/src/MEAM/meam_funcs.cpp
index b08d8380b3..e769802b38 100644
--- a/src/MEAM/meam_funcs.cpp
+++ b/src/MEAM/meam_funcs.cpp
@@ -44,6 +44,7 @@ double MEAM::G_gam(const double gamma, const int ibar, int &errorflag) const
         //         e.g. gsmooth_factor is 99, {:
         //         gsmooth_switchpoint = -0.99
         //         G = 0.01*(-0.99/gamma)**99
+        if (gamma == 0.0) return 0.0; // avoid division by zero. For gamma = 0.0 => G = 1 / inf
         double G = 1 / (gsmooth_factor + 1) * pow((gsmooth_switchpoint / gamma), gsmooth_factor);
         return sqrt(G);
       } else {
diff --git a/src/MISC/fix_imd.cpp b/src/MISC/fix_imd.cpp
index 0dbafe49cb..04adcabd34 100644
--- a/src/MISC/fix_imd.cpp
+++ b/src/MISC/fix_imd.cpp
@@ -581,9 +581,8 @@ FixIMD::FixIMD(LAMMPS *lmp, int narg, char **arg) :
       msglen += 3*4*num_coords+IMDHEADERSIZE;
     }
     msgdata = new char[msglen];
-  }
-  else {
-    msglen = 3*sizeof(float)*num_coords+IMDHEADERSIZE;
+  } else {
+    msglen = 3*(int)sizeof(float)*num_coords+IMDHEADERSIZE;
     msgdata = new char[msglen];
   }
 
diff --git a/src/ML-POD/eapod.cpp b/src/ML-POD/eapod.cpp
index 11dace1f28..0525cfdb55 100644
--- a/src/ML-POD/eapod.cpp
+++ b/src/ML-POD/eapod.cpp
@@ -982,10 +982,14 @@ double EAPOD::peratom_environment_descriptors(double *cb, double *bd, double *tm
     D[j] = 1.0 / sum;
   }
 
-  double sum = 0;
+  double sum = 0.0;
   for (int j = 0; j < nClusters; j++) sum += D[j];
   double sumD = sum;
-  for (int j = 0; j < nClusters; j++) P[j] = D[j]/sum;
+  if (sum != 0.0) {
+    for (int j = 0; j < nClusters; j++) P[j] = D[j]/sum;
+  } else {
+    for (int j = 0; j < nClusters; j++) P[j] = 0.0;
+  }
 
   int nc = nCoeffPerElement*(ti[0]-1);
   double ei = coeff[0 + nc];
@@ -1008,13 +1012,13 @@ double EAPOD::peratom_environment_descriptors(double *cb, double *bd, double *tm
   }
 
   for (int m = 0; m<Mdesc; m++) {
-    double S1 = 1/sumD;
-    double S2 = sumD*sumD;
+    double S1 = 1.0/sumD;
+    double S2 = S1*S1;
     double sum = 0.0;
     for (int j=0; j<nClusters; j++) {
       double dP_dB = 0.0;
       for (int k = 0; k < nClusters; k++) {
-        double dP_dD = -D[j] / S2;
+        double dP_dD = -D[j] * S2;
         if (k==j) dP_dD += S1;
         double dD_dB = 0.0;
         double D2 = 2 * D[k] * D[k];
@@ -2844,11 +2848,11 @@ void EAPOD::peratomenvironment_descriptors(double *P, double *dP_dR, double *B,
   DGEMM(&chn, &chn, &nClusters, &Mdesc, &nComponents, &alpha, dD_dpca, &nClusters, ProjMat, &nComponents, &beta, dD_dB, &nClusters);
 
   // calculate dP_dD
-  double S1 = 1 / sumD;
-  double S2 = sumD * sumD;
+  double S1 = 1.0 / sumD;
+  double S2 = S1 * S1;
   for (int k = 0; k < nClusters; k++) {
     for (int j = 0; j < nClusters; j++) {
-      dP_dD[j + k * nClusters] = -D[j] / S2;
+      dP_dD[j + k * nClusters] = -D[j] * S2;
     }
   }
   for (int j = 0; j < nClusters; j++) {
diff --git a/src/ML-POD/fitpod_command.cpp b/src/ML-POD/fitpod_command.cpp
index fbe9ecd396..ea8e5e22d3 100644
--- a/src/ML-POD/fitpod_command.cpp
+++ b/src/ML-POD/fitpod_command.cpp
@@ -163,15 +163,15 @@ void FitPOD::command(int narg, char **arg)
           n2 = fastpodptr->nComponents * fastpodptr->nClusters * fastpodptr->nelements;
         }
 
-        fmt::print(fp, "model_coefficients: {} {} {}\n", nCoeffAll, n1, n2);
+        utils::print(fp, "model_coefficients: {} {} {}\n", nCoeffAll, n1, n2);
         for (int count = 0; count < nCoeffAll; count++) {
-          fmt::print(fp, "{:<10.{}f}\n", desc.c[count], traindata.precision);
+          utils::print(fp, "{:<10.{}f}\n", desc.c[count], traindata.precision);
         }
         for (int count = 0; count < n1; count++) {
-          fmt::print(fp, "{:<10.{}f}\n", fastpodptr->Proj[count], 14);
+          utils::print(fp, "{:<10.{}f}\n", fastpodptr->Proj[count], 14);
         }
         for (int count = 0; count < n2; count++) {
-          fmt::print(fp, "{:<10.{}f}\n", fastpodptr->Centroids[count], 14);
+          utils::print(fp, "{:<10.{}f}\n", fastpodptr->Centroids[count], 14);
         }
         fclose(fp);
       }
@@ -1751,7 +1751,7 @@ void FitPOD::print_analysis(const datastruct &data, double *outarray, double *er
                   data.training ? "Training" : "Test");
 
   utils::logmesg(lmp, mystr);
-  fmt::print(fp_errors, mystr);
+  utils::print(fp_errors, mystr);
 
   std::string sa(lm + 80, '-');
   sa += '\n';
@@ -1759,12 +1759,12 @@ void FitPOD::print_analysis(const datastruct &data, double *outarray, double *er
       " {:^{}} | # configs |  # atoms  | MAE energy  | RMSE energy | MAE force  | RMSE force\n",
       "File", lm);
   utils::logmesg(lmp, sa + sb + sa);
-  fmt::print(fp_errors, sa + sb + sa);
+  utils::print(fp_errors, sa + sb + sa);
 
   int ci = 0, m = 8, nc = 0, nf = 0;
   for (int file = 0; file < nfiles; file++) {
-    fmt::print(fp_analysis, "# {}\n", data.filenames[file]);
-    fmt::print(fp_analysis,
+    utils::print(fp_analysis, "# {}\n", data.filenames[file]);
+    utils::print(fp_analysis,
                "  config   # atoms       volume        energy        DFT energy     energy error   "
                "  force          DFT force       force error\n");
 
@@ -1772,14 +1772,14 @@ void FitPOD::print_analysis(const datastruct &data, double *outarray, double *er
     int nconfigs = data.num_config[file];
     nc += nconfigs;
     for (int ii = 0; ii < nconfigs; ii++) {    // loop over each configuration in a file
-      fmt::print(fp_analysis, "{:6}   {:8}    ", outarray[m * ci], outarray[1 + m * ci]);
+      utils::print(fp_analysis, "{:6}   {:8}    ", outarray[m * ci], outarray[1 + m * ci]);
 
       double vol = latticevolume(&data.lattice[9 * ci]);
-      fmt::print(fp_analysis, "{:<15.10} ", vol);
+      utils::print(fp_analysis, "{:<15.10} ", vol);
 
       for (int count = 2; count < m; count++)
-        fmt::print(fp_analysis, "{:<15.10} ", outarray[count + m * ci]);
-      fmt::print(fp_analysis, "\n");
+        utils::print(fp_analysis, "{:<15.10} ", outarray[count + m * ci]);
+      utils::print(fp_analysis, "\n");
 
       nforceall += 3 * data.num_atom[ci];
       ci += 1;
@@ -1792,23 +1792,23 @@ void FitPOD::print_analysis(const datastruct &data, double *outarray, double *er
                     data.filenames[file], lm, nconfigs, nforceall / 3, errors[0 + 4 * q],
                     errors[1 + 4 * q], errors[2 + 4 * q], errors[3 + 4 * q]);
     utils::logmesg(lmp, s);
-    fmt::print(fp_errors, s);
+    utils::print(fp_errors, s);
   }
   utils::logmesg(lmp, sa);
-  fmt::print(fp_errors, sa);
+  utils::print(fp_errors, sa);
 
   auto s =
       fmt::format("{:<{}} {:>10} {:>11}     {:<10.6f}    {:<10.6f}    {:<10.6f}    {:<10.6f}\n",
                   "All files", lm, nc, nf / 3, errors[0], errors[1], errors[2], errors[3]);
   utils::logmesg(lmp, s + sa);
-  fmt::print(fp_errors, "{}", s + sa);
+  utils::print(fp_errors, "{}", s + sa);
 
   mystr =
       fmt::format("**************** End of Error Analysis for the {} Data Set ****************\n",
                   data.training ? "Training" : "Test");
 
   utils::logmesg(lmp, mystr);
-  fmt::print(fp_errors, mystr);
+  utils::print(fp_errors, mystr);
 
   fclose(fp_errors);
   fclose(fp_analysis);
@@ -2254,14 +2254,14 @@ void FitPOD::savedata2textfile(std::string filename, std::string text, double *A
     int precision = 15;
     FILE *fp = fopen(filename.c_str(), "w");
     if (dim == 1) {
-      fmt::print(fp, text, n);
-      for (int i = 0; i < n; i++) fmt::print(fp, "{:<10.{}f} \n", A[i], precision);
+      utils::print(fp, text, n);
+      for (int i = 0; i < n; i++) utils::print(fp, "{:<10.{}f} \n", A[i], precision);
     } else if (dim == 2) {
-      fmt::print(fp, text, n);
-      fmt::print(fp, "{} \n", m);
+      utils::print(fp, text, n);
+      utils::print(fp, "{} \n", m);
       for (int j = 0; j < n; j++) {
-        for (int i = 0; i < m; i++) fmt::print(fp, "{:<10.{}f}     ", A[j + i * n], precision);
-        fmt::print(fp, "   \n");
+        for (int i = 0; i < m; i++) utils::print(fp, "{:<10.{}f}     ", A[j + i * n], precision);
+        utils::print(fp, "   \n");
       }
     }
     fclose(fp);
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp
new file mode 100644
index 0000000000..8a747a7908
--- /dev/null
+++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp
@@ -0,0 +1,166 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_gaussian_grid_local.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "modify.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using MathConst::MY_2PI;
+using MathSpecial::powint;
+
+ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char **arg) :
+    ComputeGridLocal(lmp, narg, arg), cutsq(nullptr), radelem(nullptr),
+    sigmaelem(nullptr), prefacelem(nullptr), argfacelem(nullptr)
+{
+  // skip over arguments used by base class
+  // so that argument positions are identical to
+  // regular per-atom compute
+
+  arg += nargbase;
+  narg -= nargbase;
+
+  //double rfac0, rmin0;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+
+  int ntypes = atom->ntypes;
+  int nargmin = 4 + 2 * ntypes;
+
+  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);
+
+  // process required arguments
+
+  memory->create(radelem, ntypes + 1, "gaussian/atom:radelem");    // offset by 1 to match up with types
+  memory->create(sigmaelem, ntypes + 1, "gaussian/atom:sigmaelem");
+  memory->create(prefacelem, ntypes + 1, "gaussian/atom:prefacelem");
+  memory->create(argfacelem, ntypes + 1, "gaussian/atom:argfacelem");
+
+  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
+
+  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[4 + i], false, lmp);
+  for (int i = 0; i < ntypes; i++)
+    sigmaelem[i + 1] = utils::numeric(FLERR, arg[ntypes + 4 + i], false, lmp);
+
+  // construct cutsq
+  double cut;
+  cutmax = 0.0;
+  memory->create(cutsq, ntypes + 1, ntypes + 1, "gaussian/atom:cutsq");
+  for (int i = 1; i <= ntypes; i++) {
+    cut = 2.0 * radelem[i] * rcutfac;
+    if (cut > cutmax) cutmax = cut;
+    cutsq[i][i] = cut * cut;
+    for (int j = i + 1; j <= ntypes; j++) {
+      cut = (radelem[i] + radelem[j]) * rcutfac;
+      cutsq[i][j] = cutsq[j][i] = cut * cut;
+    }
+  }
+
+  size_local_cols = size_local_cols_base + ntypes;
+
+  // pre-compute coefficients
+  for (int i = 0; i < ntypes; i++) {
+    prefacelem[i + 1] = 1.0/powint(sigmaelem[i + 1] * sqrt(MY_2PI), 3);
+    argfacelem[i + 1] = 1.0/(2.0 * sigmaelem[i + 1] * sigmaelem[i + 1]);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeGaussianGridLocal::~ComputeGaussianGridLocal()
+{
+  if (copymode) return;
+  memory->destroy(radelem);
+  memory->destroy(sigmaelem);
+  memory->destroy(prefacelem);
+  memory->destroy(argfacelem);
+  memory->destroy(cutsq);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeGaussianGridLocal::init()
+{
+  if ((modify->get_compute_by_style("^gaussian/grid/local$").size() > 1) && (comm->me == 0))
+    error->warning(FLERR, "More than one instance of compute gaussian/grid/local");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeGaussianGridLocal::compute_local()
+{
+  invoked_local = update->ntimestep;
+
+  // compute gaussian for each gridpoint
+
+  double **const x = atom->x;
+  const int *const mask = atom->mask;
+  int *const type = atom->type;
+  const int ntotal = atom->nlocal + atom->nghost;
+
+  int igrid = 0;
+  for (int iz = nzlo; iz <= nzhi; iz++)
+    for (int iy = nylo; iy <= nyhi; iy++)
+      for (int ix = nxlo; ix <= nxhi; ix++) {
+        double xgrid[3];
+        grid2x(ix, iy, iz, xgrid);
+        const double xtmp = xgrid[0];
+        const double ytmp = xgrid[1];
+        const double ztmp = xgrid[2];
+
+        // Zeroing out the components, which are filled as a sum.
+        for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
+          alocal[igrid][icol] = 0.0;
+        }
+
+        for (int j = 0; j < ntotal; j++) {
+
+          // check that j is in compute group
+
+          if (!(mask[j] & groupbit)) continue;
+
+          const double delx = xtmp - x[j][0];
+          const double dely = ytmp - x[j][1];
+          const double delz = ztmp - x[j][2];
+          const double rsq = delx * delx + dely * dely + delz * delz;
+          int jtype = type[j];
+          if (rsq < cutsq[jtype][jtype]) {
+            int icol = size_local_cols_base + jtype - 1;
+            alocal[igrid][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]);
+          }
+        }
+        igrid++;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   memory usage
+------------------------------------------------------------------------- */
+
+double ComputeGaussianGridLocal::memory_usage()
+{
+  int n = atom->ntypes + 1;
+  int nbytes = (double) n * sizeof(int);    // map
+
+  return nbytes;
+}
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.h b/src/ML-SNAP/compute_gaussian_grid_local.h
new file mode 100644
index 0000000000..77f88a7a8e
--- /dev/null
+++ b/src/ML-SNAP/compute_gaussian_grid_local.h
@@ -0,0 +1,51 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(gaussian/grid/local,ComputeGaussianGridLocal);
+// clang-format on
+#else
+
+#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H
+#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H
+
+#include "compute_grid_local.h"
+
+namespace LAMMPS_NS {
+
+class ComputeGaussianGridLocal : public ComputeGridLocal {
+ public:
+  ComputeGaussianGridLocal(class LAMMPS *, int, char **);
+  ~ComputeGaussianGridLocal() override;
+  void init() override;
+  void compute_local() override;
+  double memory_usage() override;
+
+ protected:
+  int ncoeff;
+  double **cutsq;
+  double rcutfac;     // global cut-off scale
+  double *radelem;    // cut-off radius of each atom type
+  double *sigmaelem;  // Gaussian width of each atom type
+  double *prefacelem; // Gaussian prefactor of each atom type
+  double *argfacelem; // Gaussian argument factor of each atom type
+  int *map;    // map types to [0,nelements)
+  int nelements;
+  double cutmax;
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp
index 2179bb8ebd..12135c705d 100644
--- a/src/ML-SNAP/compute_grid.cpp
+++ b/src/ML-SNAP/compute_grid.cpp
@@ -57,6 +57,7 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeGrid::~ComputeGrid()
 {
+  if (copymode) return;
   deallocate();
 }
 
@@ -111,7 +112,6 @@ void ComputeGrid::assign_coords_all()
 void ComputeGrid::allocate()
 {
   // allocate arrays
-
   memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
   memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
   if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp
index 0f275a9aae..80feb75be5 100644
--- a/src/ML-SNAP/compute_grid_local.cpp
+++ b/src/ML-SNAP/compute_grid_local.cpp
@@ -119,6 +119,8 @@ void ComputeGridLocal::allocate()
 
 void ComputeGridLocal::deallocate()
 {
+  if (copymode) return;
+
   if (gridlocal_allocated) {
     gridlocal_allocated = 0;
     memory->destroy(alocal);
diff --git a/src/ML-SNAP/compute_sna_grid.cpp b/src/ML-SNAP/compute_sna_grid.cpp
index 4243202545..95c3fa70a8 100644
--- a/src/ML-SNAP/compute_sna_grid.cpp
+++ b/src/ML-SNAP/compute_sna_grid.cpp
@@ -31,14 +31,13 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
   // skip over arguments used by base class
   // so that argument positions are identical to
   // regular per-atom compute
-
   arg += nargbase;
   narg -= nargbase;
 
   // begin code common to all SNAP computes
 
-  double rfac0, rmin0;
-  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  //double rfac0, rmin0;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
 
   int ntypes = atom->ntypes;
   int nargmin = 6 + 2 * ntypes;
@@ -56,6 +55,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
   wselfallflag = 0;
   switchinnerflag = 0;
   nelements = 1;
+  chunksize = 32768;
+  parallel_thresh = 8192;
 
   // process required arguments
 
@@ -67,8 +68,9 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
   twojmax = utils::inumeric(FLERR, arg[5], false, lmp);
 
   for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
-  for (int i = 0; i < ntypes; i++)
+  for (int i = 0; i < ntypes; i++) {
     wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
+  }
 
   // construct cutsq
 
@@ -181,11 +183,12 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeSNAGrid::~ComputeSNAGrid()
 {
+  if (copymode) return;
+
   memory->destroy(radelem);
   memory->destroy(wjelem);
   memory->destroy(cutsq);
   delete snaptr;
-
   if (chemflag) memory->destroy(map);
 }
 
@@ -202,6 +205,7 @@ void ComputeSNAGrid::init()
 
 void ComputeSNAGrid::compute_array()
 {
+
   invoked_array = update->ntimestep;
 
   // compute sna for each gridpoint
diff --git a/src/ML-SNAP/compute_sna_grid.h b/src/ML-SNAP/compute_sna_grid.h
index 3a5a373826..a158c2342f 100644
--- a/src/ML-SNAP/compute_sna_grid.h
+++ b/src/ML-SNAP/compute_sna_grid.h
@@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid {
   void init() override;
   void compute_array() override;
   double memory_usage() override;
+  int ncoeff,nelements; // public for kokkos, but could go in the protected block now
 
- private:
-  int ncoeff;
+ protected:
+  //int ncoeff;
   double **cutsq;
   double rcutfac;
   double *radelem;
   double *wjelem;
   int *map;    // map types to [0,nelements)
-  int nelements, chemflag;
+  int chemflag;
   int switchinnerflag;
   double *sinnerelem;
   double *dinnerelem;
+  int parallel_thresh;
   class SNA *snaptr;
   double cutmax;
   int quadraticflag;
+  double rfac0, rmin0;
+  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  int chunksize;
+
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/ML-SNAP/compute_sna_grid_local.cpp b/src/ML-SNAP/compute_sna_grid_local.cpp
index 1d42a42c05..db49063920 100644
--- a/src/ML-SNAP/compute_sna_grid_local.cpp
+++ b/src/ML-SNAP/compute_sna_grid_local.cpp
@@ -37,8 +37,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
 
   // begin code common to all SNAP computes
 
-  double rfac0, rmin0;
-  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  //double rfac0, rmin0;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
 
   int ntypes = atom->ntypes;
   int nargmin = 6 + 2 * ntypes;
@@ -56,6 +56,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
   wselfallflag = 0;
   switchinnerflag = 0;
   nelements = 1;
+  chunksize = 32768;
+  parallel_thresh = 8192;
 
   // process required arguments
 
@@ -180,6 +182,7 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeSNAGridLocal::~ComputeSNAGridLocal()
 {
+  if (copymode) return;
   memory->destroy(radelem);
   memory->destroy(wjelem);
   memory->destroy(cutsq);
diff --git a/src/ML-SNAP/compute_sna_grid_local.h b/src/ML-SNAP/compute_sna_grid_local.h
index 0475212e13..85662ad509 100644
--- a/src/ML-SNAP/compute_sna_grid_local.h
+++ b/src/ML-SNAP/compute_sna_grid_local.h
@@ -32,7 +32,7 @@ class ComputeSNAGridLocal : public ComputeGridLocal {
   void compute_local() override;
   double memory_usage() override;
 
- private:
+ protected:
   int ncoeff;
   double **cutsq;
   double rcutfac;
@@ -46,6 +46,10 @@ class ComputeSNAGridLocal : public ComputeGridLocal {
   class SNA *snaptr;
   double cutmax;
   int quadraticflag;
+  double rfac0, rmin0;
+  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  int chunksize;
+  int parallel_thresh;
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/MOLECULE/fix_cmap.cpp b/src/MOLECULE/fix_cmap.cpp
index 02116965b5..29fa8e4072 100644
--- a/src/MOLECULE/fix_cmap.cpp
+++ b/src/MOLECULE/fix_cmap.cpp
@@ -1051,7 +1051,7 @@ bigint FixCMAP::read_data_skip_lines(char * /*keyword*/)
 
 void FixCMAP::write_data_header(FILE *fp, int /*mth*/)
 {
-  fmt::print(fp,"{} crossterms\n",ncmap);
+  utils::print(fp,"{} crossterms\n",ncmap);
 }
 
 /* ----------------------------------------------------------------------
@@ -1129,7 +1129,7 @@ void FixCMAP::write_data_section(int /*mth*/, FILE *fp,
                                   int n, double **buf, int index)
 {
   for (int i = 0; i < n; i++)
-    fmt::print(fp,"{} {} {} {} {} {} {}\n",
+    utils::print(fp,"{} {} {} {} {} {} {}\n",
                index+i,ubuf(buf[i][0]).i, ubuf(buf[i][1]).i, ubuf(buf[i][2]).i,
                ubuf(buf[i][3]).i,ubuf(buf[i][4]).i,ubuf(buf[i][5]).i);
 }
diff --git a/src/Makefile b/src/Makefile
index 4d8b02458a..3de8eb85d5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -480,7 +480,7 @@ tar:
 	@cd STUBS; $(MAKE)
 	@echo "Created $(ROOT)_src.tar.gz"
 
-check: check-whitespace check-permissions check-homepage check-errordocs check-docs check-version
+check: check-whitespace check-permissions check-homepage check-errordocs check-fmtlib check-docs check-version
 
 check-whitespace:
 	$(PYTHON) ../tools/coding_standard/whitespace.py ..
@@ -506,6 +506,12 @@ check-errordocs:
 fix-errordocs:
 	$(PYTHON) ../tools/coding_standard/errordocs.py .. -f
 
+check-fmtlib:
+	$(PYTHON) ../tools/coding_standard/fmtlib.py ..
+
+fix-fmtlib:
+	$(PYTHON) ../tools/coding_standard/fmtlib.py .. -f
+
 check-docs:
 	$(MAKE) $(MFLAGS) -C ../doc anchor_check style_check package_check role_check
 
diff --git a/src/OPENMP/fix_qeq_comb_omp.cpp b/src/OPENMP/fix_qeq_comb_omp.cpp
index a2fcf85367..8d5eb284cc 100644
--- a/src/OPENMP/fix_qeq_comb_omp.cpp
+++ b/src/OPENMP/fix_qeq_comb_omp.cpp
@@ -99,7 +99,7 @@ void FixQEQCombOMP::post_force(int /* vflag */)
 
   // charge-equilibration loop
 
-  if (me == 0 && fp) fmt::print(fp, "Charge equilibration on step {}\n", update->ntimestep);
+  if (me == 0 && fp) utils::print(fp, "Charge equilibration on step {}\n", update->ntimestep);
 
   heatpq = 0.05;
   qmass = 0.016;
diff --git a/src/PHONON/dynamical_matrix.cpp b/src/PHONON/dynamical_matrix.cpp
index c2ba34c4c2..a0199747c4 100644
--- a/src/PHONON/dynamical_matrix.cpp
+++ b/src/PHONON/dynamical_matrix.cpp
@@ -282,7 +282,7 @@ void DynamicalMatrix::calculateMatrix()
 
   if (me == 0 && screen) {
     fputs("Calculating Dynamical Matrix ...\n", screen);
-    fmt::print(screen,"  Total # of atoms = {}\n"
+    utils::print(screen,"  Total # of atoms = {}\n"
                       "  Atoms in group = {}\n"
                       "  Total dynamical matrix elements = {}\n",
                natoms, gcount, dynlen*dynlen);
diff --git a/src/PHONON/fix_phonon.cpp b/src/PHONON/fix_phonon.cpp
index 74670ebde5..37f84b6f12 100644
--- a/src/PHONON/fix_phonon.cpp
+++ b/src/PHONON/fix_phonon.cpp
@@ -186,21 +186,21 @@ FixPhonon::FixPhonon(LAMMPS *lmp,  int narg, char **arg) : Fix(lmp, narg, arg)
     flog = fopen(logfile, "w");
     if (flog == nullptr)
       error->one(FLERR,"Can not open output file {}: {}", logfile,utils::getsyserror());
-    fmt::print(flog,"############################################################\n");
-    fmt::print(flog,"# group name of the atoms under study      : {}\n", group->names[igroup]);
-    fmt::print(flog,"# total number of atoms in the group       : {}\n", ngroup);
-    fmt::print(flog,"# dimension of the system                  : {} D\n", sysdim);
-    fmt::print(flog,"# number of atoms per unit cell            : {}\n", nucell);
-    fmt::print(flog,"# dimension of the FFT mesh                : {} x {} x {}\n", nx, ny, nz);
-    fmt::print(flog,"# number of wait steps before measurement  : {}\n", waitsteps);
-    fmt::print(flog,"# frequency of the measurement             : {}\n", nevery);
-    fmt::print(flog,"# output result after this many measurement: {}\n", nfreq);
-    fmt::print(flog,"# number of processors used by this run    : {}\n", nprocs);
-    fmt::print(flog,"############################################################\n");
-    fmt::print(flog,"# mapping information between lattice indices and atom id\n");
-    fmt::print(flog,"# nx ny nz nucell\n");
-    fmt::print(flog,"{} {} {} {}\n", nx, ny, nz, nucell);
-    fmt::print(flog,"# l1 l2 l3 k atom_id\n");
+    utils::print(flog,"############################################################\n");
+    utils::print(flog,"# group name of the atoms under study      : {}\n", group->names[igroup]);
+    utils::print(flog,"# total number of atoms in the group       : {}\n", ngroup);
+    utils::print(flog,"# dimension of the system                  : {} D\n", sysdim);
+    utils::print(flog,"# number of atoms per unit cell            : {}\n", nucell);
+    utils::print(flog,"# dimension of the FFT mesh                : {} x {} x {}\n", nx, ny, nz);
+    utils::print(flog,"# number of wait steps before measurement  : {}\n", waitsteps);
+    utils::print(flog,"# frequency of the measurement             : {}\n", nevery);
+    utils::print(flog,"# output result after this many measurement: {}\n", nfreq);
+    utils::print(flog,"# number of processors used by this run    : {}\n", nprocs);
+    utils::print(flog,"############################################################\n");
+    utils::print(flog,"# mapping information between lattice indices and atom id\n");
+    utils::print(flog,"# nx ny nz nucell\n");
+    utils::print(flog,"{} {} {} {}\n", nx, ny, nz, nucell);
+    utils::print(flog,"# l1 l2 l3 k atom_id\n");
     int ix, iy, iz, iu;
     for (idx = 0; idx < ngroup; ++idx) {
       itag = surf2tag[idx];
@@ -208,9 +208,9 @@ FixPhonon::FixPhonon(LAMMPS *lmp,  int narg, char **arg) : Fix(lmp, narg, arg)
       iz   = (idx/nucell)%nz;
       iy   = (idx/(nucell*nz))%ny;
       ix   = (idx/(nucell*nz*ny))%nx;
-      fmt::print(flog,"{} {} {} {} {}\n", ix, iy, iz, iu, itag);
+      utils::print(flog,"{} {} {} {} {}\n", ix, iy, iz, iu, itag);
     }
-    fmt::print(flog,"############################################################\n");
+    utils::print(flog,"############################################################\n");
     fflush(flog);
   }
   surf2tag.clear();
@@ -737,16 +737,16 @@ void FixPhonon::postprocess( )
     fclose(fp_bin);
 
     // write log file, here however, it is the dynamical matrix that is written
-    fmt::print(flog,"############################################################\n");
-    fmt::print(flog,"# Current time step                      : {}\n", update->ntimestep);
-    fmt::print(flog,"# Total number of measurements           : {}\n", neval);
-    fmt::print(flog,"# Average temperature of the measurement : {}\n", TempAve);
-    fmt::print(flog,"# Boltzmann constant under current units : {}\n", boltz);
-    fmt::print(flog,"# basis vector A1 = [{} {} {}]\n", basevec[0], basevec[1], basevec[2]);
-    fmt::print(flog,"# basis vector A2 = [{} {} {}]\n", basevec[3], basevec[4], basevec[5]);
-    fmt::print(flog,"# basis vector A3 = [{} {} {}]\n", basevec[6], basevec[7], basevec[8]);
-    fmt::print(flog,"############################################################\n");
-    fmt::print(flog,"# qx\t qy \t qz \t\t Phi(q)\n");
+    utils::print(flog,"############################################################\n");
+    utils::print(flog,"# Current time step                      : {}\n", update->ntimestep);
+    utils::print(flog,"# Total number of measurements           : {}\n", neval);
+    utils::print(flog,"# Average temperature of the measurement : {}\n", TempAve);
+    utils::print(flog,"# Boltzmann constant under current units : {}\n", boltz);
+    utils::print(flog,"# basis vector A1 = [{} {} {}]\n", basevec[0], basevec[1], basevec[2]);
+    utils::print(flog,"# basis vector A2 = [{} {} {}]\n", basevec[3], basevec[4], basevec[5]);
+    utils::print(flog,"# basis vector A3 = [{} {} {}]\n", basevec[6], basevec[7], basevec[8]);
+    utils::print(flog,"############################################################\n");
+    utils::print(flog,"# qx\t qy \t qz \t\t Phi(q)\n");
 
     EnforceASR();
 
@@ -765,10 +765,10 @@ void FixPhonon::postprocess( )
         double qy = double(iy)/double(ny);
         for (int iz = 0; iz < nz; ++iz) {
           double qz = double(iz)/double(nz);
-          fmt::print(flog,"{} {} {}", qx, qy, qz);
+          utils::print(flog,"{} {} {}", qx, qy, qz);
           for (idim = 0; idim < fft_dim2; ++idim)
-            fmt::print(flog, " {} {}", std::real(Phi_all[idq][idim]), std::imag(Phi_all[idq][idim]));
-          fmt::print(flog, "\n");
+            utils::print(flog, " {} {}", std::real(Phi_all[idq][idim]), std::imag(Phi_all[idq][idim]));
+          utils::print(flog, "\n");
           ++idq;
         }
       }
diff --git a/src/PHONON/third_order.cpp b/src/PHONON/third_order.cpp
index c31aae0086..378905bd29 100644
--- a/src/PHONON/third_order.cpp
+++ b/src/PHONON/third_order.cpp
@@ -295,7 +295,7 @@ void ThirdOrder::calculateMatrix()
 
   if (comm->me == 0 && screen) {
     fputs("Calculating Third Order ...\n", screen);
-    fmt::print(screen,"  Total # of atoms = {}\n"
+    utils::print(screen,"  Total # of atoms = {}\n"
                       "  Atoms in group = {}\n"
                       "  Total third order elements = {}\n",
                       natoms, gcount, dynlen*dynlen*dynlen);
@@ -432,7 +432,7 @@ void ThirdOrder::writeMatrix(double *dynmat, bigint i, int a, bigint j, int b)
       for (int k = 0; k < atom->natoms; k++){
         norm = square(dynmat[k*3])+square(dynmat[k*3+1])+square(dynmat[k*3+2]);
         if (norm > 1.0e-16)
-          fmt::print(fp, "{} {} {} {} {} {:17.8f} {:17.8f} {:17.8f}\n",
+          utils::print(fp, "{} {} {} {} {} {:17.8f} {:17.8f} {:17.8f}\n",
                      i+1, a+1, j+1, b+1, k+1, dynmat[k*3] * conversion,
                      dynmat[k*3+1] * conversion, dynmat[k*3+2] * conversion);
       }
@@ -440,7 +440,7 @@ void ThirdOrder::writeMatrix(double *dynmat, bigint i, int a, bigint j, int b)
       for (int k = 0; k < gcount; k++){
         norm = square(dynmat[k*3])+square(dynmat[k*3+1])+square(dynmat[k*3+2]);
         if (norm > 1.0e-16)
-          fmt::print(fp, "{} {} {} {} {} {:17.8f} {:17.8f} {:17.8f}\n",
+          utils::print(fp, "{} {} {} {} {} {:17.8f} {:17.8f} {:17.8f}\n",
                      i+1, a+1, j+1, b+1, groupmap[k]+1, dynmat[k*3] * conversion,
                      dynmat[k*3+1] * conversion, dynmat[k*3+2] * conversion);
       }
diff --git a/src/Purge.list b/src/Purge.list
index 7098d39e3a..2b949d694d 100644
--- a/src/Purge.list
+++ b/src/Purge.list
@@ -53,6 +53,8 @@ lmpinstalledpkgs.h
 lmpgitversion.h
 mliap_model_python_couple.cpp
 mliap_model_python_couple.h
+# removed in Dec 2024
+group_kokkos.cpp
 # renamed in September 2024
 group_ndx.cpp
 group_ndx.h
diff --git a/src/QTB/fix_qbmsst.cpp b/src/QTB/fix_qbmsst.cpp
index 7dd7547efd..883e65b09e 100644
--- a/src/QTB/fix_qbmsst.cpp
+++ b/src/QTB/fix_qbmsst.cpp
@@ -360,7 +360,7 @@ void FixQBMSST::init()
     h_timestep=alpha*dtv;
   }
   if (comm->me == 0 && screen)
-    fmt::print(screen,"The effective maximum frequency is now {} inverse time unit "
+    utils::print(screen,"The effective maximum frequency is now {} inverse time unit "
                "with alpha value as {}!\n", 0.5/h_timestep, alpha);
 
   //gfactor is the random force \sqrt{\frac{2\gamma{}m_{i}}{\alpha*\delta{}t}}, \sqrt{12} makes the random array variance equal to unit.
diff --git a/src/QTB/fix_qtb.cpp b/src/QTB/fix_qtb.cpp
index 8f73a04927..4324238b97 100644
--- a/src/QTB/fix_qtb.cpp
+++ b/src/QTB/fix_qtb.cpp
@@ -170,7 +170,7 @@ void FixQTB::init()
     h_timestep=alpha*dtv;
   }
   if (comm->me == 0 && screen)
-    fmt::print(screen,"The effective maximum frequency is now {} inverse time unit "
+    utils::print(screen,"The effective maximum frequency is now {} inverse time unit "
                "with alpha value as {}!\n", 0.5/h_timestep, alpha);
 
   // set force prefactors
diff --git a/src/REAXFF/fix_reaxff_bonds.cpp b/src/REAXFF/fix_reaxff_bonds.cpp
index a5ce478c1d..54ad593486 100644
--- a/src/REAXFF/fix_reaxff_bonds.cpp
+++ b/src/REAXFF/fix_reaxff_bonds.cpp
@@ -253,11 +253,11 @@ void FixReaxFFBonds::RecvBuffer(double *buf, int nbuf, int nbuf_local,
   MPI_Request irequest, irequest2;
 
   if (me == 0) {
-    fmt::print(fp,"# Timestep {}\n#\n",ntimestep);
-    fmt::print(fp,"# Number of particles {}\n#\n",natoms);
-    fmt::print(fp,"# Max number of bonds per atom {} with coarse bond order cutoff {:5.3f}\n",
+    utils::print(fp,"# Timestep {}\n#\n",ntimestep);
+    utils::print(fp,"# Number of particles {}\n#\n",natoms);
+    utils::print(fp,"# Max number of bonds per atom {} with coarse bond order cutoff {:5.3f}\n",
                maxnum,cutof3);
-    fmt::print(fp,"# Particle connection table and bond orders\n"
+    utils::print(fp,"# Particle connection table and bond orders\n"
                "# id type nb id_1...id_nb mol bo_1...bo_nb abo nlp q\n");
   }
 
@@ -292,7 +292,7 @@ void FixReaxFFBonds::RecvBuffer(double *buf, int nbuf, int nbuf_local,
         j += (1+numbonds);
 
         mesg += fmt::format("{:14.3f}{:14.3f}{:14.3f}\n",sbotmp,nlptmp,avqtmp);
-        fmt::print(fp, mesg);
+        utils::print(fp, mesg);
       }
     }
   } else {
diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp
index c6da55fd25..1916597a69 100644
--- a/src/REAXFF/fix_reaxff_species.cpp
+++ b/src/REAXFF/fix_reaxff_species.cpp
@@ -204,6 +204,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
       delete[] filedel;
       filedel = utils::strdup(arg[iarg + 1]);
       if (comm->me == 0) {
+        if (fdel) fclose(fdel);
         fdel = fopen(filedel, "w");
         if (!fdel)
           error->one(FLERR, "Cannot open fix reaxff/species delete file {}: {}", filedel,
@@ -793,12 +794,12 @@ void FixReaxFFSpecies::WriteFormulas(int Nmole, int Nspec)
         if (itemp != 1) molname += std::to_string(itemp);
       }
     }
-    fmt::print(fp, " {:>11}", molname);
+    utils::print(fp, " {:>11}", molname);
   }
   fputs("\n", fp);
 
-  fmt::print(fp, "{:>11} {:>11} {:>11}", ntimestep, Nmole, Nspec);
-  for (i = 0; i < Nmoltype; i++) fmt::print(fp, " {:>11}", NMol[i]);
+  utils::print(fp, "{:>11} {:>11} {:>11}", ntimestep, Nmole, Nspec);
+  for (i = 0; i < Nmoltype; i++) utils::print(fp, " {:>11}", NMol[i]);
   fputs("\n", fp);
 }
 
@@ -837,7 +838,7 @@ void FixReaxFFSpecies::WritePos(int Nmole, int Nspec)
   for (int j = 0; j < 3; j++) halfbox[j] = box[j] / 2;
 
   if (comm->me == 0) {
-    fmt::print(pos,
+    utils::print(pos,
                "Timestep {} NMole {}  NSpec {}  xlo {:f}  "
                "xhi {:f}  ylo {:f}  yhi {:f}  zlo {:f}  zhi {:f}\n",
                update->ntimestep, Nmole, Nspec, domain->boxlo[0], domain->boxhi[0],
@@ -1061,7 +1062,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
       for (int m = 0; m < Nspec; m++) {
         if (deletecount[m] > 0) {
           if (printflag == 0) {
-            fmt::print(fdel, "Timestep {}", update->ntimestep);
+            utils::print(fdel, "Timestep {}", update->ntimestep);
             printflag = 1;
           }
           fprintf(fdel, " %g ", deletecount[m]);
@@ -1084,7 +1085,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
         if (deletecount[i]) writeflag = 1;
 
       if (writeflag) {
-        fmt::print(fdel, "{}", update->ntimestep);
+        utils::print(fdel, "{}", update->ntimestep);
         for (i = 0; i < ndelspec; i++) { fprintf(fdel, "\t%g", deletecount[i]); }
         fprintf(fdel, "\n");
         fflush(fdel);
diff --git a/src/REAXFF/reaxff_control.cpp b/src/REAXFF/reaxff_control.cpp
index 99e498b428..2d700e1eec 100644
--- a/src/REAXFF/reaxff_control.cpp
+++ b/src/REAXFF/reaxff_control.cpp
@@ -57,8 +57,8 @@ namespace ReaxFF {
   class control_parser_error : public std::exception {
     std::string message;
   public:
-    explicit control_parser_error(const std::string &format, const std::string &keyword) {
-      message = fmt::format(format, keyword);
+    explicit control_parser_error(const std::string &msg) {
+      message = msg;
     }
     const char *what() const noexcept override { return message.c_str(); }
   };
@@ -92,11 +92,11 @@ namespace ReaxFF {
         auto keyword = values.next_string();
 
         if (!values.has_next())
-          throw control_parser_error("No value(s) for control parameter: {}\n", keyword);
+          throw control_parser_error(
+            fmt::format("No value(s) for control parameter: {}\n", keyword));
 
         if (inactive_keywords.find(keyword) != inactive_keywords.end()) {
-          error->warning(FLERR,fmt::format("Ignoring inactive control "
-                                           "parameter: {}",keyword));
+          error->warning(FLERR,fmt::format("Ignoring inactive control parameter: {}", keyword));
         } else if (keyword == "nbrhood_cutoff") {
           control->bond_cut = values.next_double();
         } else if (keyword == "bond_graph_cutoff") {
@@ -114,7 +114,7 @@ namespace ReaxFF {
             error->warning(FLERR,"Support for writing native trajectories has "
                            "been removed after LAMMPS version 8 April 2021");
         } else {
-          throw control_parser_error("Unknown parameter {} in control file", keyword);
+          throw control_parser_error(fmt::format("Unknown parameter {} in control file", keyword));
         }
       }
     } catch (LAMMPS_NS::EOFException &) {
diff --git a/src/REAXFF/reaxff_init_md.cpp b/src/REAXFF/reaxff_init_md.cpp
index 2d0459691f..6ede21e4ca 100644
--- a/src/REAXFF/reaxff_init_md.cpp
+++ b/src/REAXFF/reaxff_init_md.cpp
@@ -80,15 +80,14 @@ namespace ReaxFF {
     swa = control->nonb_low;
     swb = control->nonb_cut;
 
-    if (fabs(swa) > 0.01 && control->me == 0)
+    if ((fabs(swa) > 0.01) && (control->me == 0))
       error->warning(FLERR, "Non-zero lower Taper-radius cutoff");
 
-    if (swb < 0) {
+    if (swb < 0.0) {
       error->all(FLERR,"Negative upper Taper-radius cutoff");
-    }
-    else if (swb < 5 && control->me == 0)
-      error->warning(FLERR,fmt::format("Warning: very low Taper-radius cutoff: "
-                                       "{}\n", swb));
+    } else if ((swb < 5.0) && (control->me == 0))
+      error->warning(FLERR,fmt::format("Very low Taper-radius cutoff: {}\n", swb));
+
     d1 = swb - swa;
     d7 = pow(d1, 7.0);
     swa2 = SQR(swa);
diff --git a/src/REPLICA/fix_alchemy.cpp b/src/REPLICA/fix_alchemy.cpp
index f3b79d0956..69a2c3f36a 100644
--- a/src/REPLICA/fix_alchemy.cpp
+++ b/src/REPLICA/fix_alchemy.cpp
@@ -242,8 +242,8 @@ void FixAlchemy::setup(int vflag)
   if (universe->me == 0) {
     progress = 0;
     auto msg = fmt::format("Starting alchemical run\n");
-    if (universe->uscreen) fmt::print(universe->uscreen, msg);
-    if (universe->ulogfile) fmt::print(universe->ulogfile, msg);
+    if (universe->uscreen) utils::print(universe->uscreen, msg);
+    if (universe->ulogfile) utils::print(universe->ulogfile, msg);
   }
 
   // recheck domain decomposition, atom ordering, and synchronize positions
@@ -325,8 +325,8 @@ void FixAlchemy::post_force(int /*vflag*/)
     if ((status / 10) > (progress / 10)) {
       progress = status;
       auto msg = fmt::format("  Alchemical run progress: {:>3d}%\n", progress);
-      if (universe->uscreen) fmt::print(universe->uscreen, msg);
-      if (universe->ulogfile) fmt::print(universe->ulogfile, msg);
+      if (universe->uscreen) utils::print(universe->uscreen, msg);
+      if (universe->ulogfile) utils::print(universe->ulogfile, msg);
     }
   }
 }
diff --git a/src/REPLICA/neb.cpp b/src/REPLICA/neb.cpp
index b14748565b..4b242965c8 100644
--- a/src/REPLICA/neb.cpp
+++ b/src/REPLICA/neb.cpp
@@ -241,19 +241,19 @@ void NEB::run()
 
   if (me_universe == 0) {
     if (uscreen) {
-      fmt::print(uscreen, "    Step     {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} ",
+      utils::print(uscreen, "    Step     {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} ",
                  "MaxReplicaForce", "MaxAtomForce", "GradV0", "GradV1", "GradVc", "EBF", "EBR",
                  "RDT");
 
       if (print_mode != TERSE) {
         for (int i = 1; i <= nreplica; ++i)
-          fmt::print(uscreen, "{:^14} {:^14} ", "RD" + std::to_string(i), "PE" + std::to_string(i));
+          utils::print(uscreen, "{:^14} {:^14} ", "RD" + std::to_string(i), "PE" + std::to_string(i));
       }
 
       if (print_mode == VERBOSE) {
         for (int i = 1; i <= nreplica; ++i) {
           auto idx = std::to_string(i);
-          fmt::print(uscreen, "{:^12}{:^12}{:^12} {:^12} {:^12}{:^12} ", "pathangle" + idx,
+          utils::print(uscreen, "{:^12}{:^12}{:^12} {:^12} {:^12}{:^12} ", "pathangle" + idx,
                      "angletangrad" + idx, "anglegrad" + idx, "gradV" + idx, "RepForce" + idx,
                      "MaxAtomForce" + idx);
         }
@@ -262,20 +262,20 @@ void NEB::run()
     }
 
     if (ulogfile) {
-      fmt::print(ulogfile, "    Step     {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} ",
+      utils::print(ulogfile, "    Step     {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} ",
                  "MaxReplicaForce", "MaxAtomForce", "GradV0", "GradV1", "GradVc", "EBF", "EBR",
                  "RDT");
 
       if (print_mode != TERSE) {
         for (int i = 1; i <= nreplica; ++i)
-          fmt::print(ulogfile, "{:^14} {:^14} ", "RD" + std::to_string(i),
+          utils::print(ulogfile, "{:^14} {:^14} ", "RD" + std::to_string(i),
                      "PE" + std::to_string(i));
       }
 
       if (print_mode == VERBOSE) {
         for (int i = 1; i <= nreplica; ++i) {
           auto idx = std::to_string(i);
-          fmt::print(ulogfile, "{:^12}{:^12}{:^12} {:^12} {:^12}{:^12} ", "pathangle" + idx,
+          utils::print(ulogfile, "{:^12}{:^12}{:^12} {:^12} {:^12}{:^12} ", "pathangle" + idx,
                      "angletangrad" + idx, "anglegrad" + idx, "gradV" + idx, "RepForce" + idx,
                      "MaxAtomForce" + idx);
         }
@@ -340,19 +340,19 @@ void NEB::run()
 
   if (me_universe == 0) {
     if (uscreen) {
-      fmt::print(uscreen, "    Step     {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} ",
+      utils::print(uscreen, "    Step     {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} ",
                  "MaxReplicaForce", "MaxAtomForce", "GradV0", "GradV1", "GradVc", "EBF", "EBR",
                  "RDT");
 
       if (print_mode != TERSE) {
         for (int i = 1; i <= nreplica; ++i)
-          fmt::print(uscreen, "{:^14} {:^14} ", "RD" + std::to_string(i), "PE" + std::to_string(i));
+          utils::print(uscreen, "{:^14} {:^14} ", "RD" + std::to_string(i), "PE" + std::to_string(i));
       }
 
       if (print_mode == VERBOSE) {
         for (int i = 1; i <= nreplica; ++i) {
           auto idx = std::to_string(i);
-          fmt::print(uscreen, "{:^12}{:^12}{:^12} {:^12} {:^12}{:^12} ", "pathangle" + idx,
+          utils::print(uscreen, "{:^12}{:^12}{:^12} {:^12} {:^12}{:^12} ", "pathangle" + idx,
                      "angletangrad" + idx, "anglegrad" + idx, "gradV" + idx, "RepForce" + idx,
                      "MaxAtomForce" + idx);
         }
@@ -361,20 +361,20 @@ void NEB::run()
     }
 
     if (ulogfile) {
-      fmt::print(ulogfile, "    Step     {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} ",
+      utils::print(ulogfile, "    Step     {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} {:^14} ",
                  "MaxReplicaForce", "MaxAtomForce", "GradV0", "GradV1", "GradVc", "EBF", "EBR",
                  "RDT");
 
       if (print_mode != TERSE) {
         for (int i = 1; i <= nreplica; ++i)
-          fmt::print(ulogfile, "{:^14} {:^14} ", "RD" + std::to_string(i),
+          utils::print(ulogfile, "{:^14} {:^14} ", "RD" + std::to_string(i),
                      "PE" + std::to_string(i));
       }
 
       if (print_mode == VERBOSE) {
         for (int i = 1; i <= nreplica; ++i) {
           auto idx = std::to_string(i);
-          fmt::print(ulogfile, "{:^12}{:^12}{:^12} {:^12} {:^12}{:^12} ", "pathangle" + idx,
+          utils::print(ulogfile, "{:^12}{:^12}{:^12} {:^12} {:^12}{:^12} ", "pathangle" + idx,
                      "angletangrad" + idx, "anglegrad" + idx, "gradV" + idx, "RepForce" + idx,
                      "MaxAtomForce" + idx);
         }
diff --git a/src/REPLICA/prd.cpp b/src/REPLICA/prd.cpp
index 252f74a4e3..91150d8a38 100644
--- a/src/REPLICA/prd.cpp
+++ b/src/REPLICA/prd.cpp
@@ -417,8 +417,8 @@ void PRD::command(int narg, char **arg)
   if (me_universe == 0) {
     auto mesg = fmt::format("Loop time of {} on {} procs for {} steps with {} atoms\n",
                             timer->get_wall(Timer::TOTAL), nprocs_universe, nsteps,atom->natoms);
-    if (universe->uscreen) fmt::print(universe->uscreen, mesg);
-    if (universe->ulogfile) fmt::print(universe->ulogfile, mesg);
+    if (universe->uscreen) utils::print(universe->uscreen, mesg);
+    if (universe->ulogfile) utils::print(universe->ulogfile, mesg);
   }
 
   if (me == 0) utils::logmesg(lmp,"\nPRD done\n");
@@ -725,8 +725,8 @@ void PRD::log_event()
                             fix_event->event_number, fix_event->correlated_event,
                             fix_event->ncoincident, fix_event->replica_number);
 
-    if (universe->uscreen) fmt::print(universe->uscreen, mesg);
-    if (universe->ulogfile) fmt::print(universe->ulogfile, mesg);
+    if (universe->uscreen) utils::print(universe->uscreen, mesg);
+    if (universe->ulogfile) utils::print(universe->ulogfile, mesg);
   }
 }
 
diff --git a/src/REPLICA/tad.cpp b/src/REPLICA/tad.cpp
index b9b04a63d5..38ddfceb84 100644
--- a/src/REPLICA/tad.cpp
+++ b/src/REPLICA/tad.cpp
@@ -366,8 +366,8 @@ void TAD::command(int narg, char **arg)
   if (me_universe == 0) {
     auto mesg = fmt::format("Loop time of {} on {} procs for {} steps with {} atoms\n",
                             timer->get_wall(Timer::TOTAL), nprocs_universe, nsteps,atom->natoms);
-    if (universe->uscreen) fmt::print(universe->uscreen, mesg);
-    if (universe->ulogfile) fmt::print(universe->ulogfile, mesg);
+    if (universe->uscreen) utils::print(universe->uscreen, mesg);
+    if (universe->ulogfile) utils::print(universe->ulogfile, mesg);
   }
 
   if ((me_universe == 0) && ulogfile_neb) fclose(ulogfile_neb);
@@ -504,8 +504,8 @@ void TAD::log_event(int ievent)
                             fix_event->event_number, ievent, "E ", fix_event->ebarrier,
                             tfrac, fix_event->tlo, deltfirst);
 
-    if (universe->uscreen) fmt::print(universe->uscreen, mesg);
-    if (universe->ulogfile) fmt::print(universe->ulogfile, mesg);
+    if (universe->uscreen) utils::print(universe->uscreen, mesg);
+    if (universe->ulogfile) utils::print(universe->ulogfile, mesg);
   }
 
   // dump snapshot of quenched coords
@@ -895,8 +895,8 @@ void TAD::compute_tlo(int ievent)
                             fix_event->event_number, ievent, statstr, ebarrier, tfrac,
                             fix_event->tlo, deltlo);
 
-    if (universe->uscreen) fmt::print(universe->uscreen, mesg);
-    if (universe->ulogfile) fmt::print(universe->ulogfile, mesg);
+    if (universe->uscreen) utils::print(universe->uscreen, mesg);
+    if (universe->ulogfile) utils::print(universe->ulogfile, mesg);
   }
 }
 
diff --git a/src/RIGID/compute_rigid_local.cpp b/src/RIGID/compute_rigid_local.cpp
index ea45389e7b..38a5788b99 100644
--- a/src/RIGID/compute_rigid_local.cpp
+++ b/src/RIGID/compute_rigid_local.cpp
@@ -13,14 +13,16 @@
 ------------------------------------------------------------------------- */
 
 #include "compute_rigid_local.h"
-#include <cstring>
+
 #include "atom.h"
-#include "update.h"
 #include "domain.h"
+#include "error.h"
 #include "modify.h"
 #include "fix_rigid_small.h"
 #include "memory.h"
-#include "error.h"
+#include "update.h"
+
+#include <cstring>
 
 using namespace LAMMPS_NS;
 
@@ -98,8 +100,8 @@ ComputeRigidLocal::~ComputeRigidLocal()
 {
   memory->destroy(vlocal);
   memory->destroy(alocal);
-  delete [] idrigid;
-  delete [] rstyle;
+  delete[] idrigid;
+  delete[] rstyle;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -108,16 +110,11 @@ void ComputeRigidLocal::init()
 {
   // set fixrigid
 
-  int ifix = modify->find_fix(idrigid);
-  if (ifix < 0)
-    error->all(FLERR,"FixRigidSmall ID for compute rigid/local does not exist");
-  fixrigid = dynamic_cast<FixRigidSmall *>(modify->fix[ifix]);
-
-  int flag = 0;
-  if (strstr(fixrigid->style,"rigid/") == nullptr) flag = 1;
-  if (strstr(fixrigid->style,"/small") == nullptr) flag = 1;
-  if (flag)
-    error->all(FLERR,"Compute rigid/local does not use fix rigid/small fix");
+  auto ifix = modify->get_fix_by_id(idrigid);
+  if (!ifix) error->all(FLERR,"FixRigidSmall ID {} for compute rigid/local does not exist", idrigid);
+  fixrigid = dynamic_cast<FixRigidSmall *>(ifix);
+  if (!fixrigid)
+    error->all(FLERR,"Fix ID {} for compute rigid/local does not point to fix rigid/small", idrigid);
 
   // do initial memory allocation so that memory_usage() is correct
 
diff --git a/src/RIGID/fix_rigid.cpp b/src/RIGID/fix_rigid.cpp
index 421a6cce2b..b0c6d46ee4 100644
--- a/src/RIGID/fix_rigid.cpp
+++ b/src/RIGID/fix_rigid.cpp
@@ -1,3 +1,4 @@
+
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -119,7 +120,7 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) :
     int nlocal = atom->nlocal;
     int custom_flag = strcmp(arg[3], "custom") == 0;
     if (custom_flag) {
-      if (narg < 5) utils::missing_cmd_args(FLERR, fmt::format("fix {} custom"), error);
+      if (narg < 5) utils::missing_cmd_args(FLERR, fmt::format("fix {} custom", style), error);
 
       // determine whether atom-style variable or atom property is used
 
@@ -219,11 +220,12 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) :
     // error if atom belongs to more than 1 rigid body
 
   } else if (strcmp(arg[3], "group") == 0) {
-    if (narg < 5) utils::missing_cmd_args(FLERR, fmt::format("fix {} group"), error);
+    if (narg < 5) utils::missing_cmd_args(FLERR, fmt::format("fix {} group", style), error);
     rstyle = GROUP;
     nbody = utils::inumeric(FLERR, arg[4], false, lmp);
     if (nbody <= 0) error->all(FLERR, "Illegal fix {} number of groups {}", style, nbody);
-    if (narg < 5 + nbody) utils::missing_cmd_args(FLERR, fmt::format("fix {} group"), error);
+    if (narg < 5 + nbody)
+      utils::missing_cmd_args(FLERR, fmt::format("fix {} group", style), error);
     iarg = 5 + nbody;
 
     int *igroups = new int[nbody];
@@ -2435,8 +2437,8 @@ void FixRigid::write_restart_file(const char *file)
   if (fp == nullptr)
     error->one(FLERR,"Cannot open fix rigid restart file {}: {}",outfile,utils::getsyserror());
 
-  fmt::print(fp,"# fix rigid mass, COM, inertia tensor info for {} bodies on timestep {}\n\n",nbody,update->ntimestep);
-  fmt::print(fp,"{}\n",nbody);
+  utils::print(fp,"# fix rigid mass, COM, inertia tensor info for {} bodies on timestep {}\n\n",nbody,update->ntimestep);
+  utils::print(fp,"{}\n",nbody);
 
   // compute I tensor against xyz axes from diagonalized I and current quat
   // Ispace = P Idiag P_transpose
diff --git a/src/RIGID/fix_rigid_small.cpp b/src/RIGID/fix_rigid_small.cpp
index 0bfd8032c5..6ba2e5eb1c 100644
--- a/src/RIGID/fix_rigid_small.cpp
+++ b/src/RIGID/fix_rigid_small.cpp
@@ -2625,9 +2625,9 @@ void FixRigidSmall::write_restart_file(const char *file)
       error->one(FLERR, "Cannot open fix {} restart file {}: {}",
                  style, outfile, utils::getsyserror());
 
-    fmt::print(fp,"# fix rigid mass, COM, inertia tensor info for "
+    utils::print(fp,"# fix rigid mass, COM, inertia tensor info for "
                "{} bodies on timestep {}\n\n",nbody,update->ntimestep);
-    fmt::print(fp,"{}\n",nbody);
+    utils::print(fp,"{}\n",nbody);
   }
 
   // communication buffer for all my rigid body info
diff --git a/src/SHOCK/fix_append_atoms.cpp b/src/SHOCK/fix_append_atoms.cpp
index 677b3b55fd..3f15d13df2 100644
--- a/src/SHOCK/fix_append_atoms.cpp
+++ b/src/SHOCK/fix_append_atoms.cpp
@@ -49,7 +49,6 @@ FixAppendAtoms::FixAppendAtoms(LAMMPS *lmp, int narg, char **arg) :
 
   scaleflag = 1;
   spatflag=0;
-  spatialid = nullptr;
   size = 0.0;
   xloflag = xhiflag = yloflag = yhiflag = zloflag = zhiflag = 0;
 
@@ -60,9 +59,6 @@ FixAppendAtoms::FixAppendAtoms(LAMMPS *lmp, int narg, char **arg) :
   rany = 0.0;
   ranz = 0.0;
 
-  randomx = nullptr;
-  randomt = nullptr;
-
   if (domain->lattice->nbasis == 0)
     error->all(FLERR,"Fix append/atoms requires a lattice be defined");
 
@@ -123,6 +119,7 @@ FixAppendAtoms::FixAppendAtoms(LAMMPS *lmp, int narg, char **arg) :
       if (strcmp(arg[iarg+1],"f_") == 0)
         error->all(FLERR, "Bad fix ID in fix append/atoms command");
       spatflag = 1;
+      delete[] spatialid;
       spatialid = utils::strdup(arg[iarg+1]+2);
       spatlead = utils::numeric(FLERR,arg[iarg+2],false,lmp);
       iarg += 3;
@@ -152,6 +149,7 @@ FixAppendAtoms::FixAppendAtoms(LAMMPS *lmp, int narg, char **arg) :
       ranz = utils::numeric(FLERR,arg[iarg+3],false,lmp);
       xseed = utils::inumeric(FLERR,arg[iarg+4],false,lmp);
       if (xseed <= 0) error->all(FLERR,"Illegal fix append/atoms command");
+      delete randomx;
       randomx = new RanMars(lmp,xseed + comm->me);
       iarg += 5;
     } else if (strcmp(arg[iarg],"temp") == 0) {
@@ -165,7 +163,10 @@ FixAppendAtoms::FixAppendAtoms(LAMMPS *lmp, int narg, char **arg) :
       if (t_period <= 0) error->all(FLERR,"Illegal fix append/atoms command");
       if (t_extent <= 0) error->all(FLERR,"Illegal fix append/atoms command");
       if (tseed <= 0) error->all(FLERR,"Illegal fix append/atoms command");
+      delete randomt;
       randomt = new RanMars(lmp,tseed + comm->me);
+      delete[] gfactor1;
+      delete[] gfactor2;
       gfactor1 = new double[atom->ntypes+1];
       gfactor2 = new double[atom->ntypes+1];
       iarg += 5;
diff --git a/src/SHOCK/fix_wall_piston.cpp b/src/SHOCK/fix_wall_piston.cpp
index cb20e2d683..76cdf1f3b8 100644
--- a/src/SHOCK/fix_wall_piston.cpp
+++ b/src/SHOCK/fix_wall_piston.cpp
@@ -39,10 +39,8 @@ FixWallPiston::FixWallPiston(LAMMPS *lmp, int narg, char **arg) :
   force_reneighbor = 1;
   next_reneighbor = -1;
 
-  if (narg < 4) error->all(FLERR,"Illegal fix wall/piston command");
+  if (narg < 4) utils::missing_cmd_args(FLERR,"fix wall/piston", error);
 
-  randomt = nullptr;
-  gfactor1 = gfactor2 = nullptr;
   tempflag = 0;
   scaleflag = 1;
   roughflag = 0;
@@ -92,6 +90,9 @@ FixWallPiston::FixWallPiston(LAMMPS *lmp, int narg, char **arg) :
       if (t_period <= 0) error->all(FLERR,"Illegal fix wall/piston command");
       if (t_extent <= 0) error->all(FLERR,"Illegal fix wall/piston command");
       if (tseed <= 0) error->all(FLERR,"Illegal fix wall/piston command");
+      delete randomt;
+      delete[] gfactor1;
+      delete[] gfactor2;
       randomt = new RanMars(lmp,tseed + comm->me);
       gfactor1 = new double[atom->ntypes+1];
       gfactor2 = new double[atom->ntypes+1];
diff --git a/src/SPIN/neb_spin.cpp b/src/SPIN/neb_spin.cpp
index b1b9dc077e..00b92b906a 100644
--- a/src/SPIN/neb_spin.cpp
+++ b/src/SPIN/neb_spin.cpp
@@ -805,7 +805,7 @@ void NEBSpin::print_status()
     FILE *uscreen = universe->uscreen;
     FILE *ulogfile = universe->ulogfile;
     if (uscreen) {
-      fmt::print(uscreen,"{} {:12.8g} {:12.8g} ",update->ntimestep,fmaxreplica,fmaxatom);
+      utils::print(uscreen,"{} {:12.8g} {:12.8g} ",update->ntimestep,fmaxreplica,fmaxatom);
       fprintf(uscreen,"%12.8g %12.8g %12.8g ",gradvnorm0,gradvnorm1,gradvnormc);
       fprintf(uscreen,"%12.8g %12.8g %12.8g ",ebf,ebr,endpt);
       for (int i = 0; i < nreplica; i++)
@@ -819,7 +819,7 @@ void NEBSpin::print_status()
     }
 
     if (ulogfile) {
-      fmt::print(ulogfile,"{} {:12.8} {:12.8g} ",update->ntimestep,fmaxreplica,fmaxatom);
+      utils::print(ulogfile,"{} {:12.8} {:12.8g} ",update->ntimestep,fmaxreplica,fmaxatom);
       fprintf(ulogfile,"%12.8g %12.8g %12.8g ",gradvnorm0,gradvnorm1,gradvnormc);
       fprintf(ulogfile,"%12.8g %12.8g %12.8g ",ebf,ebr,endpt);
       for (int i = 0; i < nreplica; i++)
diff --git a/src/SRD/fix_srd.cpp b/src/SRD/fix_srd.cpp
index 9b153a1c28..6b8ce1e9d6 100644
--- a/src/SRD/fix_srd.cpp
+++ b/src/SRD/fix_srd.cpp
@@ -3956,7 +3956,7 @@ void FixSRD::print_collision(int i, int j, int ibounce, double t_remain, double
   double **v = atom->v;
 
   if (type != WALL) {
-    fmt::print("COLLISION between SRD {} and BIG {}\n", atom->tag[i], atom->tag[j]);
+    utils::print("COLLISION between SRD {} and BIG {}\n", atom->tag[i], atom->tag[j]);
     printf("  bounce # = %d\n", ibounce + 1);
     printf("  local indices: %d %d\n", i, j);
     printf("  timestep = %g\n", dt);
@@ -3997,7 +3997,7 @@ void FixSRD::print_collision(int i, int j, int ibounce, double t_remain, double
   } else {
     int dim = wallwhich[j] / 2;
 
-    fmt::print("COLLISION between SRD {} and WALL {}\n", atom->tag[i], j);
+    utils::print("COLLISION between SRD {} and WALL {}\n", atom->tag[i], j);
     printf("  bounce # = %d\n", ibounce + 1);
     printf("  local indices: %d %d\n", i, j);
     printf("  timestep = %g\n", dt);
diff --git a/src/UEF/dump_cfg_uef.cpp b/src/UEF/dump_cfg_uef.cpp
index 776c4675f3..f2a9ae69aa 100644
--- a/src/UEF/dump_cfg_uef.cpp
+++ b/src/UEF/dump_cfg_uef.cpp
@@ -84,7 +84,7 @@ void DumpCFGUef::write_header(bigint n)
   if (atom->peri_flag) scale = atom->pdscale;
   else if (unwrapflag == 1) scale = UNWRAPEXPAND;
 
-  fmt::print(fp,"Number of particles = {}\n",n);
+  utils::print(fp,"Number of particles = {}\n",n);
   fprintf(fp,"A = %g Angstrom (basic length-scale)\n",scale);
   // in box[][] columns are cell edges
   // in H0, rows are cell edges
diff --git a/src/VORONOI/compute_voronoi_atom.cpp b/src/VORONOI/compute_voronoi_atom.cpp
index 4aa6ebf559..12ea173a23 100644
--- a/src/VORONOI/compute_voronoi_atom.cpp
+++ b/src/VORONOI/compute_voronoi_atom.cpp
@@ -55,16 +55,10 @@ ComputeVoronoi::ComputeVoronoi(LAMMPS *lmp, int narg, char **arg) :
   surface = VOROSURF_NONE;
   maxedge = 0;
   fthresh = ethresh = 0.0;
-  radstr = nullptr;
   onlyGroup = false;
   occupation = false;
 
-  con_mono = nullptr;
-  con_poly = nullptr;
-  tags = nullptr;
   oldmaxtag = 0;
-  occvec = sendocc = lroot = lnext = nullptr;
-  faces = nullptr;
 
   int iarg = 3;
   while (iarg<narg) {
@@ -79,6 +73,7 @@ ComputeVoronoi::ComputeVoronoi(LAMMPS *lmp, int narg, char **arg) :
     else if (strcmp(arg[iarg], "radius") == 0) {
       if (iarg + 2 > narg || strstr(arg[iarg+1],"v_") != arg[iarg+1] )
         error->all(FLERR,"Illegal compute voronoi/atom command");
+      delete[] radstr;
       radstr = utils::strdup(&arg[iarg+1][2]);
       iarg += 2;
     }
diff --git a/src/accelerator_kokkos.h b/src/accelerator_kokkos.h
index dec52b2363..2ab0ea01d5 100644
--- a/src/accelerator_kokkos.h
+++ b/src/accelerator_kokkos.h
@@ -23,6 +23,7 @@
 #include "comm_kokkos.h"          // IWYU pragma: export
 #include "comm_tiled_kokkos.h"    // IWYU pragma: export
 #include "domain_kokkos.h"        // IWYU pragma: export
+#include "group_kokkos.h"         // IWYU pragma: export
 #include "kokkos.h"               // IWYU pragma: export
 #include "memory_kokkos.h"        // IWYU pragma: export
 #include "modify_kokkos.h"        // IWYU pragma: export
@@ -39,6 +40,7 @@
 #include "comm_brick.h"
 #include "comm_tiled.h"
 #include "domain.h"
+#include "group.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
@@ -86,6 +88,11 @@ class DomainKokkos : public Domain {
   DomainKokkos(class LAMMPS *lmp) : Domain(lmp) {}
 };
 
+class GroupKokkos : public Group {
+ public:
+  GroupKokkos(class LAMMPS *lmp) : Group(lmp) {}
+};
+
 class NeighborKokkos : public Neighbor {
  public:
   NeighborKokkos(class LAMMPS *lmp) : Neighbor(lmp) {}
diff --git a/src/angle_write.cpp b/src/angle_write.cpp
index 1be5f1acac..863183995b 100644
--- a/src/angle_write.cpp
+++ b/src/angle_write.cpp
@@ -106,7 +106,7 @@ void AngleWrite::command(int narg, char **arg)
                      utils::current_date());
       fp = fopen(table_file.c_str(), "w");
       if (fp)
-        fmt::print(fp, "# DATE: {} UNITS: {} Created by angle_write\n", utils::current_date(),
+        utils::print(fp, "# DATE: {} UNITS: {} Created by angle_write\n", utils::current_date(),
                    update->unit_style);
     }
     if (fp == nullptr)
@@ -173,9 +173,9 @@ void AngleWrite::command(int narg, char **arg)
 
     // evaluate energy and force at each of N distances
 
-    fmt::print(fp, "# Angle potential {} for angle type {}: i,theta,energy,force\n",
+    utils::print(fp, "# Angle potential {} for angle type {}: i,theta,energy,force\n",
                force->angle_style, atype);
-    fmt::print(fp, "\n{}\nN {} EQ {:.15g}\n\n", keyword, n, theta0);
+    utils::print(fp, "\n{}\nN {} EQ {:.15g}\n\n", keyword, n, theta0);
 
 #define GET_ENERGY(myphi, mytheta) \
   theta = mytheta;                 \
diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp
index 0529d5cb54..bdcbc90997 100644
--- a/src/atom_vec.cpp
+++ b/src/atom_vec.cpp
@@ -1797,7 +1797,7 @@ void AtomVec::write_data(FILE *fp, int n, double **buf)
   int i, j, m, nn, datatype, cols;
 
   for (i = 0; i < n; i++) {
-    fmt::print(fp, "{}", ubuf(buf[i][0]).i);
+    utils::print(fp, "{}", ubuf(buf[i][0]).i);
 
     j = 1;
     for (nn = 1; nn < ndata_atom; nn++) {
@@ -1805,30 +1805,30 @@ void AtomVec::write_data(FILE *fp, int n, double **buf)
       cols = mdata_atom.cols[nn];
       if (datatype == Atom::DOUBLE) {
         if (cols == 0) {
-          fmt::print(fp, " {:.16}", buf[i][j++]);
+          utils::print(fp, " {:.16}", buf[i][j++]);
         } else {
-          for (m = 0; m < cols; m++) fmt::print(fp, " {}", buf[i][j++]);
+          for (m = 0; m < cols; m++) utils::print(fp, " {}", buf[i][j++]);
         }
       } else if (datatype == Atom::INT) {
         if (cols == 0) {
           if (atom->types_style == Atom::LABELS &&
               atom->peratom[mdata_atom.index[nn]].name == "type") {
-            fmt::print(fp, " {}", atom->lmap->typelabel[ubuf(buf[i][j++]).i - 1]);
+            utils::print(fp, " {}", atom->lmap->typelabel[ubuf(buf[i][j++]).i - 1]);
           } else
-            fmt::print(fp, " {}", ubuf(buf[i][j++]).i);
+            utils::print(fp, " {}", ubuf(buf[i][j++]).i);
         } else {
-          for (m = 0; m < cols; m++) fmt::print(fp, " {}", ubuf(buf[i][j++]).i);
+          for (m = 0; m < cols; m++) utils::print(fp, " {}", ubuf(buf[i][j++]).i);
         }
       } else if (datatype == Atom::BIGINT) {
         if (cols == 0) {
-          fmt::print(fp, " {}", ubuf(buf[i][j++]).i);
+          utils::print(fp, " {}", ubuf(buf[i][j++]).i);
         } else {
-          for (m = 0; m < cols; m++) fmt::print(fp, " {}", ubuf(buf[i][j++]).i);
+          for (m = 0; m < cols; m++) utils::print(fp, " {}", ubuf(buf[i][j++]).i);
         }
       }
     }
 
-    fmt::print(fp, " {} {} {}\n", ubuf(buf[i][j]).i, ubuf(buf[i][j + 1]).i, ubuf(buf[i][j + 2]).i);
+    utils::print(fp, " {} {} {}\n", ubuf(buf[i][j]).i, ubuf(buf[i][j + 1]).i, ubuf(buf[i][j + 2]).i);
   }
 }
 
@@ -1940,7 +1940,7 @@ void AtomVec::write_vel(FILE *fp, int n, double **buf)
   int i, j, m, nn, datatype, cols;
 
   for (i = 0; i < n; i++) {
-    fmt::print(fp, "{}", ubuf(buf[i][0]).i);
+    utils::print(fp, "{}", ubuf(buf[i][0]).i);
 
     j = 1;
     for (nn = 1; nn < ndata_vel; nn++) {
@@ -1948,21 +1948,21 @@ void AtomVec::write_vel(FILE *fp, int n, double **buf)
       cols = mdata_vel.cols[nn];
       if (datatype == Atom::DOUBLE) {
         if (cols == 0) {
-          fmt::print(fp, " {}", buf[i][j++]);
+          utils::print(fp, " {}", buf[i][j++]);
         } else {
-          for (m = 0; m < cols; m++) fmt::print(fp, " {}", buf[i][j++]);
+          for (m = 0; m < cols; m++) utils::print(fp, " {}", buf[i][j++]);
         }
       } else if (datatype == Atom::INT) {
         if (cols == 0) {
-          fmt::print(fp, " {}", ubuf(buf[i][j++]).i);
+          utils::print(fp, " {}", ubuf(buf[i][j++]).i);
         } else {
-          for (m = 0; m < cols; m++) fmt::print(fp, " {}", ubuf(buf[i][j++]).i);
+          for (m = 0; m < cols; m++) utils::print(fp, " {}", ubuf(buf[i][j++]).i);
         }
       } else if (datatype == Atom::BIGINT) {
         if (cols == 0) {
-          fmt::print(fp, " {}", ubuf(buf[i][j++]).i);
+          utils::print(fp, " {}", ubuf(buf[i][j++]).i);
         } else {
-          for (m = 0; m < cols; m++) fmt::print(fp, " {}", ubuf(buf[i][j++]).i);
+          for (m = 0; m < cols; m++) utils::print(fp, " {}", ubuf(buf[i][j++]).i);
         }
       }
     }
@@ -2026,7 +2026,7 @@ void AtomVec::write_bond(FILE *fp, int n, tagint **buf, int index)
   for (int i = 0; i < n; i++) {
     typestr = std::to_string(buf[i][0]);
     if (atom->types_style == Atom::LABELS) typestr = atom->lmap->btypelabel[buf[i][0] - 1];
-    fmt::print(fp, "{} {} {} {}\n", index, typestr, buf[i][1], buf[i][2]);
+    utils::print(fp, "{} {} {} {}\n", index, typestr, buf[i][1], buf[i][2]);
     index++;
   }
 }
@@ -2091,7 +2091,7 @@ void AtomVec::write_angle(FILE *fp, int n, tagint **buf, int index)
   for (int i = 0; i < n; i++) {
     typestr = std::to_string(buf[i][0]);
     if (atom->types_style == Atom::LABELS) typestr = atom->lmap->atypelabel[buf[i][0] - 1];
-    fmt::print(fp, "{} {} {} {} {}\n", index, typestr, buf[i][1], buf[i][2], buf[i][3]);
+    utils::print(fp, "{} {} {} {} {}\n", index, typestr, buf[i][1], buf[i][2], buf[i][3]);
     index++;
   }
 }
@@ -2154,7 +2154,7 @@ void AtomVec::write_dihedral(FILE *fp, int n, tagint **buf, int index)
   for (int i = 0; i < n; i++) {
     typestr = std::to_string(buf[i][0]);
     if (atom->types_style == Atom::LABELS) typestr = atom->lmap->dtypelabel[buf[i][0] - 1];
-    fmt::print(fp, "{} {} {} {} {} {}\n", index, typestr, buf[i][1], buf[i][2], buf[i][3],
+    utils::print(fp, "{} {} {} {} {} {}\n", index, typestr, buf[i][1], buf[i][2], buf[i][3],
                buf[i][4]);
     index++;
   }
@@ -2218,7 +2218,7 @@ void AtomVec::write_improper(FILE *fp, int n, tagint **buf, int index)
   for (int i = 0; i < n; i++) {
     typestr = std::to_string(buf[i][0]);
     if (atom->types_style == Atom::LABELS) typestr = atom->lmap->itypelabel[buf[i][0] - 1];
-    fmt::print(fp, "{} {} {} {} {} {}\n", index, typestr, buf[i][1], buf[i][2], buf[i][3],
+    utils::print(fp, "{} {} {} {} {} {}\n", index, typestr, buf[i][1], buf[i][2], buf[i][3],
                buf[i][4]);
     index++;
   }
diff --git a/src/atom_vec_ellipsoid.cpp b/src/atom_vec_ellipsoid.cpp
index 417c3cf5fa..cd54729798 100644
--- a/src/atom_vec_ellipsoid.cpp
+++ b/src/atom_vec_ellipsoid.cpp
@@ -530,7 +530,7 @@ void AtomVecEllipsoid::write_data_bonus(FILE *fp, int n, double *buf, int /*flag
 {
   int i = 0;
   while (i < n) {
-    fmt::print(fp, "{} {} {} {} {} {} {} {}\n", ubuf(buf[i]).i, buf[i + 1], buf[i + 2], buf[i + 3],
+    utils::print(fp, "{} {} {} {} {} {} {} {}\n", ubuf(buf[i]).i, buf[i + 1], buf[i + 2], buf[i + 3],
                buf[i + 4], buf[i + 5], buf[i + 6], buf[i + 7]);
     i += size_data_bonus;
   }
diff --git a/src/atom_vec_line.cpp b/src/atom_vec_line.cpp
index 6ec4836770..ef5056c93c 100644
--- a/src/atom_vec_line.cpp
+++ b/src/atom_vec_line.cpp
@@ -564,7 +564,7 @@ void AtomVecLine::write_data_bonus(FILE *fp, int n, double *buf, int /*flag*/)
 {
   int i = 0;
   while (i < n) {
-    fmt::print(fp, "{} {} {} {} {}\n", ubuf(buf[i]).i, buf[i + 1], buf[i + 2], buf[i + 3],
+    utils::print(fp, "{} {} {} {} {}\n", ubuf(buf[i]).i, buf[i + 1], buf[i + 2], buf[i + 3],
                buf[i + 4]);
     i += size_data_bonus;
   }
diff --git a/src/atom_vec_tri.cpp b/src/atom_vec_tri.cpp
index 888ed954fc..21e7682b32 100644
--- a/src/atom_vec_tri.cpp
+++ b/src/atom_vec_tri.cpp
@@ -780,7 +780,7 @@ void AtomVecTri::write_data_bonus(FILE *fp, int n, double *buf, int /*flag*/)
 {
   int i = 0;
   while (i < n) {
-    fmt::print(fp, "{} {} {} {} {} {} {} {} {} {}\n", ubuf(buf[i]).i, buf[i + 1], buf[i + 2],
+    utils::print(fp, "{} {} {} {} {} {} {} {} {} {}\n", ubuf(buf[i]).i, buf[i + 1], buf[i + 2],
                buf[i + 3], buf[i + 4], buf[i + 5], buf[i + 6], buf[i + 7], buf[i + 8], buf[i + 9]);
     i += size_data_bonus;
   }
diff --git a/src/balance.cpp b/src/balance.cpp
index 42463752f9..fa4e68a8b8 100644
--- a/src/balance.cpp
+++ b/src/balance.cpp
@@ -1219,7 +1219,7 @@ void Balance::dumpout(bigint tstep)
   double *boxlo = domain->boxlo;
   double *boxhi = domain->boxhi;
 
-  fmt::print(fp,"ITEM: TIMESTEP\n{}\n",tstep);
+  utils::print(fp,"ITEM: TIMESTEP\n{}\n",tstep);
   fprintf(fp,"ITEM: NUMBER OF NODES\n");
   if (dimension == 2) fprintf(fp,"%d\n",4*nprocs);
   else fprintf(fp,"%d\n",8*nprocs);
@@ -1294,7 +1294,7 @@ void Balance::dumpout(bigint tstep)
 
   // write out one square/cube per processor for 2d/3d
 
-  fmt::print(fp,"ITEM: TIMESTEP\n{}\n",tstep);
+  utils::print(fp,"ITEM: TIMESTEP\n{}\n",tstep);
   if (dimension == 2) fprintf(fp,"ITEM: NUMBER OF SQUARES\n");
   else fprintf(fp,"ITEM: NUMBER OF CUBES\n");
   fprintf(fp,"%d\n",nprocs);
@@ -1339,13 +1339,13 @@ void Balance::debug_shift_output(int idim, int m, int np, double *split)
   fprintf(stderr,"Dimension %s, Iteration %d\n",dim,m);
 
   fprintf(stderr,"  Count:");
-  for (i = 0; i <= np; i++) fmt::print(stderr," {}",count[i]);
+  for (i = 0; i <= np; i++) utils::print(stderr," {}",count[i]);
   fprintf(stderr,"\n");
   fprintf(stderr,"  Sum:");
-  for (i = 0; i <= np; i++) fmt::print(stderr," {}",sum[i]);
+  for (i = 0; i <= np; i++) utils::print(stderr," {}",sum[i]);
   fprintf(stderr,"\n");
   fprintf(stderr,"  Target:");
-  for (i = 0; i <= np; i++) fmt::print(stderr," {}",target[i]);
+  for (i = 0; i <= np; i++) utils::print(stderr," {}",target[i]);
   fprintf(stderr,"\n");
   fprintf(stderr,"  Actual cut:");
   for (i = 0; i <= np; i++)
@@ -1358,13 +1358,13 @@ void Balance::debug_shift_output(int idim, int m, int np, double *split)
   for (i = 0; i <= np; i++) fprintf(stderr," %g",lo[i]);
   fprintf(stderr,"\n");
   fprintf(stderr,"  Low-sum:");
-  for (i = 0; i <= np; i++) fmt::print(stderr," {}",losum[i]);
+  for (i = 0; i <= np; i++) utils::print(stderr," {}",losum[i]);
   fprintf(stderr,"\n");
   fprintf(stderr,"  Hi:");
   for (i = 0; i <= np; i++) fprintf(stderr," %g",hi[i]);
   fprintf(stderr,"\n");
   fprintf(stderr,"  Hi-sum:");
-  for (i = 0; i <= np; i++) fmt::print(stderr," {}",hisum[i]);
+  for (i = 0; i <= np; i++) utils::print(stderr," {}",hisum[i]);
   fprintf(stderr,"\n");
   fprintf(stderr,"  Delta:");
   for (i = 0; i < np; i++) fprintf(stderr," %g",split[i+1]-split[i]);
diff --git a/src/bond.cpp b/src/bond.cpp
index e0ca37aa73..f5af30062e 100644
--- a/src/bond.cpp
+++ b/src/bond.cpp
@@ -388,7 +388,7 @@ void Bond::write_file(int narg, char **arg)
                      utils::current_date());
       fp = fopen(table_file.c_str(), "w");
       if (fp)
-        fmt::print(fp, "# DATE: {} UNITS: {} Created by bond_write\n", utils::current_date(),
+        utils::print(fp, "# DATE: {} UNITS: {} Created by bond_write\n", utils::current_date(),
                    update->unit_style);
     }
     if (fp == nullptr)
diff --git a/src/citeme.cpp b/src/citeme.cpp
index 56ba7fa3a4..949ae38305 100644
--- a/src/citeme.cpp
+++ b/src/citeme.cpp
@@ -19,13 +19,13 @@
 
 using namespace LAMMPS_NS;
 
-static const char cite_separator[] =
+static constexpr char cite_separator[] =
     "CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE\n\n";
 
-static const char cite_nagline[] =
+static constexpr char cite_nagline[] =
     "Your simulation uses code contributions which should be cited:\n";
 
-static const char cite_file[] = "The {} {} lists these citations in BibTeX format.\n\n";
+static constexpr char cite_file[] = "The {} {} lists these citations in BibTeX format.\n\n";
 
 // define hash function
 static std::hash<std::string> get_hash;
diff --git a/src/comm_tiled.cpp b/src/comm_tiled.cpp
index e8b5d19fa5..bc4674a6b6 100644
--- a/src/comm_tiled.cpp
+++ b/src/comm_tiled.cpp
@@ -2507,7 +2507,8 @@ void CommTiled::deallocate_swap(int n)
     memory->destroy(sendbox_multi[i]);
     memory->destroy(sendbox_multiold[i]);
 
-    delete [] maxsendlist[i];
+    if (maxsendlist)
+      delete [] maxsendlist[i];
 
     if (sendlist && sendlist[i]) {
       for (int j = 0; j < nprocmax[i]; j++) memory->destroy(sendlist[i][j]);
diff --git a/src/comm_tiled.h b/src/comm_tiled.h
index 64b80d8d18..751a74d1b7 100644
--- a/src/comm_tiled.h
+++ b/src/comm_tiled.h
@@ -153,7 +153,7 @@ class CommTiled : public Comm {
   virtual void grow_swap_send(int, int, int);    // grow swap arrays for send and recv
   void grow_swap_send_multi(int, int);           // grow multi swap arrays for send and recv
   void grow_swap_recv(int, int);
-  void deallocate_swap(int);    // deallocate swap arrays
+  void deallocate_swap(int);                     // deallocate swap arrays
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/compute_rdf.cpp b/src/compute_rdf.cpp
index 372bebbfc7..0dc7da3460 100644
--- a/src/compute_rdf.cpp
+++ b/src/compute_rdf.cpp
@@ -168,30 +168,24 @@ ComputeRDF::~ComputeRDF()
 
 void ComputeRDF::init()
 {
+  const double skin = neighbor->skin;
 
   if (!force->pair && !cutflag)
     error->all(FLERR,"Compute rdf requires a pair style or an explicit cutoff");
 
   if (cutflag) {
-    double skin = neighbor->skin;
     mycutneigh = cutoff_user + skin;
 
     double cutghost;            // as computed by Neighbor and Comm
-    if (force->pair)
-      cutghost = MAX(force->pair->cutforce+skin,comm->cutghostuser);
-    else
-      cutghost = comm->cutghostuser;
+    if (force->pair) cutghost = MAX(force->pair->cutforce+skin,comm->cutghostuser);
+    else cutghost = comm->cutghostuser;
 
     if (mycutneigh > cutghost)
-      error->all(FLERR,"Compute rdf cutoff exceeds ghost atom range - "
-                 "use comm_modify cutoff command");
-    if (force->pair && mycutneigh < force->pair->cutforce + skin)
-      if (comm->me == 0)
-        error->warning(FLERR,"Compute rdf cutoff less than neighbor cutoff - "
-                       "forcing a needless neighbor list build");
+      error->all(FLERR,"Compute rdf cutoff plus skin {} exceeds ghost atom range {} - "
+                 "use comm_modify cutoff command to increase it", mycutneigh, cutghost);
 
     delr = cutoff_user / nbin;
-  } else delr = force->pair->cutforce / nbin;
+  } delr = force->pair->cutforce / nbin;
 
   delrinv = 1.0/delr;
 
diff --git a/src/compute_reduce.cpp b/src/compute_reduce.cpp
index 40bb206bd2..6c4bafab4f 100644
--- a/src/compute_reduce.cpp
+++ b/src/compute_reduce.cpp
@@ -218,7 +218,7 @@ ComputeReduce::ComputeReduce(LAMMPS *lmp, int narg, char **arg) :
         input_mode = PERATOM;
       else if (strcmp(arg[iarg + 1], "local") == 0)
         input_mode = LOCAL;
-      iarg += 2;
+      iarg += 1;
     } else
       error->all(FLERR, "Unknown compute {} keyword: {}", style, arg[iarg]);
   }
diff --git a/src/create_atoms.cpp b/src/create_atoms.cpp
index ade6bdc3c1..b5e72393d3 100644
--- a/src/create_atoms.cpp
+++ b/src/create_atoms.cpp
@@ -103,7 +103,7 @@ void CreateAtoms::command(int narg, char **arg)
     style = REGION;
     if (narg < 3) utils::missing_cmd_args(FLERR, "create_atoms region", error);
     region = domain->get_region_by_id(arg[2]);
-    if (!region) error->all(FLERR, "Create_atoms region {} does not exist", arg[2]);
+    if (!region) error->all(FLERR, 2, "Create_atoms region {} does not exist", arg[2]);
     region->init();
     region->prematch();
     iarg = 3;
@@ -127,7 +127,7 @@ void CreateAtoms::command(int narg, char **arg)
       region = nullptr;
     else {
       region = domain->get_region_by_id(arg[4]);
-      if (!region) error->all(FLERR, "Create_atoms region {} does not exist", arg[4]);
+      if (!region) error->all(FLERR, 4, "Create_atoms region {} does not exist", arg[4]);
       region->init();
       region->prematch();
     }
@@ -138,7 +138,7 @@ void CreateAtoms::command(int narg, char **arg)
     meshfile = arg[2];
     iarg = 3;
   } else
-    error->all(FLERR, "Unknown create_atoms command option {}", arg[1]);
+    error->all(FLERR, 1, "Unknown create_atoms command option {}", arg[1]);
 
   // process optional keywords
 
diff --git a/src/dihedral_write.cpp b/src/dihedral_write.cpp
index 51041c46fc..11d283dc27 100644
--- a/src/dihedral_write.cpp
+++ b/src/dihedral_write.cpp
@@ -107,7 +107,7 @@ void DihedralWrite::command(int narg, char **arg)
                      utils::current_date());
       fp = fopen(table_file.c_str(), "w");
       if (fp)
-        fmt::print(fp, "# DATE: {} UNITS: {} Created by dihedral_write\n", utils::current_date(),
+        utils::print(fp, "# DATE: {} UNITS: {} Created by dihedral_write\n", utils::current_date(),
                    update->unit_style);
     }
     if (fp == nullptr)
@@ -169,9 +169,9 @@ void DihedralWrite::command(int narg, char **arg)
 
     // evaluate energy and force at each of N distances
 
-    fmt::print(fp, "# Dihedral potential {} for dihedral type {}: i,theta,energy,force\n",
+    utils::print(fp, "# Dihedral potential {} for dihedral type {}: i,theta,energy,force\n",
                force->dihedral_style, dtype);
-    fmt::print(fp, "\n{}\nN {} DEGREES\n\n", keyword, n);
+    utils::print(fp, "\n{}\nN {} DEGREES\n\n", keyword, n);
 
 #define GET_ENERGY(myphi, mytheta)     \
   theta = mytheta;                     \
diff --git a/src/dump_atom.cpp b/src/dump_atom.cpp
index 2238a3a81d..e81157b4ad 100644
--- a/src/dump_atom.cpp
+++ b/src/dump_atom.cpp
@@ -369,19 +369,19 @@ void DumpAtom::header_item(bigint ndump)
 {
   if (unit_flag && !unit_count) {
     ++unit_count;
-    fmt::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
+    utils::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
   }
-  if (time_flag) fmt::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
+  if (time_flag) utils::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
 
-  fmt::print(fp, "ITEM: TIMESTEP\n{}\nITEM: NUMBER OF ATOMS\n{}\n", update->ntimestep, ndump);
+  utils::print(fp, "ITEM: TIMESTEP\n{}\nITEM: NUMBER OF ATOMS\n{}\n", update->ntimestep, ndump);
 
-  fmt::print(fp,"ITEM: BOX BOUNDS {}\n"
+  utils::print(fp,"ITEM: BOX BOUNDS {}\n"
              "{:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e}\n",
              boundstr,boxxlo,boxxhi,boxylo,boxyhi,boxzlo,boxzhi);
 
-  fmt::print(fp,"ITEM: ATOMS {}\n",columns);
+  utils::print(fp,"ITEM: ATOMS {}\n",columns);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -390,19 +390,19 @@ void DumpAtom::header_item_triclinic(bigint ndump)
 {
   if (unit_flag && !unit_count) {
     ++unit_count;
-    fmt::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
+    utils::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
   }
-  if (time_flag) fmt::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
+  if (time_flag) utils::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
 
-  fmt::print(fp, "ITEM: TIMESTEP\n{}\nITEM: NUMBER OF ATOMS\n{}\n", update->ntimestep, ndump);
+  utils::print(fp, "ITEM: TIMESTEP\n{}\nITEM: NUMBER OF ATOMS\n{}\n", update->ntimestep, ndump);
 
-  fmt::print(fp,"ITEM: BOX BOUNDS xy xz yz {}\n"
+  utils::print(fp,"ITEM: BOX BOUNDS xy xz yz {}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e}\n",
              boundstr,boxxlo,boxxhi,boxxy,boxylo,boxyhi,boxxz,boxzlo,boxzhi,boxyz);
 
-  fmt::print(fp,"ITEM: ATOMS {}\n",columns);
+  utils::print(fp,"ITEM: ATOMS {}\n",columns);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -411,13 +411,13 @@ void DumpAtom::header_item_triclinic_general(bigint ndump)
 {
   if (unit_flag && !unit_count) {
     ++unit_count;
-    fmt::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
+    utils::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
   }
-  if (time_flag) fmt::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
+  if (time_flag) utils::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
 
-  fmt::print(fp,"ITEM: TIMESTEP\n{}\nITEM: NUMBER OF ATOMS\n{}\n", update->ntimestep, ndump);
+  utils::print(fp,"ITEM: TIMESTEP\n{}\nITEM: NUMBER OF ATOMS\n{}\n", update->ntimestep, ndump);
 
-  fmt::print(fp,"ITEM: BOX BOUNDS abc origin {}\n"
+  utils::print(fp,"ITEM: BOX BOUNDS abc origin {}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e} {:>1.16e}\n",
@@ -426,7 +426,7 @@ void DumpAtom::header_item_triclinic_general(bigint ndump)
              domain->bvec[0],domain->bvec[1],domain->bvec[2],domain->boxlo[1],
              domain->cvec[0],domain->cvec[1],domain->cvec[2],domain->boxlo[2]);
 
-  fmt::print(fp,"ITEM: ATOMS {}\n",columns);
+  utils::print(fp,"ITEM: ATOMS {}\n",columns);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/dump_cfg.cpp b/src/dump_cfg.cpp
index 0d22ece2c3..d756977487 100644
--- a/src/dump_cfg.cpp
+++ b/src/dump_cfg.cpp
@@ -142,7 +142,7 @@ void DumpCFG::write_header(bigint n)
       header += fmt::format("auxiliary[{}] = {}\n",i-5,keyword_user[i]);
     else
       header += fmt::format("auxiliary[{}] = {}\n",i-5,auxname[i-5]);
-  fmt::print(fp, header);
+  utils::print(fp, header);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/dump_custom.cpp b/src/dump_custom.cpp
index fdba2e5477..dd53511e09 100644
--- a/src/dump_custom.cpp
+++ b/src/dump_custom.cpp
@@ -647,21 +647,21 @@ void DumpCustom::header_item(bigint ndump)
 {
   if (unit_flag && !unit_count) {
     ++unit_count;
-    fmt::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
+    utils::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
   }
-  if (time_flag) fmt::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
+  if (time_flag) utils::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
 
-  fmt::print(fp,"ITEM: TIMESTEP\n{}\n"
+  utils::print(fp,"ITEM: TIMESTEP\n{}\n"
              "ITEM: NUMBER OF ATOMS\n{}\n",
              update->ntimestep, ndump);
 
-  fmt::print(fp,"ITEM: BOX BOUNDS {}\n"
+  utils::print(fp,"ITEM: BOX BOUNDS {}\n"
              "{:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e}\n",
              boundstr,boxxlo,boxxhi,boxylo,boxyhi,boxzlo,boxzhi);
 
-  fmt::print(fp,"ITEM: ATOMS {}\n",columns);
+  utils::print(fp,"ITEM: ATOMS {}\n",columns);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -670,21 +670,21 @@ void DumpCustom::header_item_triclinic(bigint ndump)
 {
   if (unit_flag && !unit_count) {
     ++unit_count;
-    fmt::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
+    utils::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
   }
-  if (time_flag) fmt::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
+  if (time_flag) utils::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
 
-  fmt::print(fp,"ITEM: TIMESTEP\n{}\n"
+  utils::print(fp,"ITEM: TIMESTEP\n{}\n"
              "ITEM: NUMBER OF ATOMS\n{}\n",
              update->ntimestep, ndump);
 
-  fmt::print(fp,"ITEM: BOX BOUNDS xy xz yz {}\n"
+  utils::print(fp,"ITEM: BOX BOUNDS xy xz yz {}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e}\n",
              boundstr,boxxlo,boxxhi,boxxy,boxylo,boxyhi,boxxz,boxzlo,boxzhi,boxyz);
 
-  fmt::print(fp,"ITEM: ATOMS {}\n",columns);
+  utils::print(fp,"ITEM: ATOMS {}\n",columns);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -693,13 +693,13 @@ void DumpCustom::header_item_triclinic_general(bigint ndump)
 {
   if (unit_flag && !unit_count) {
     ++unit_count;
-    fmt::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
+    utils::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
   }
-  if (time_flag) fmt::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
+  if (time_flag) utils::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
 
-  fmt::print(fp,"ITEM: TIMESTEP\n{}\nITEM: NUMBER OF ATOMS\n{}\n", update->ntimestep, ndump);
+  utils::print(fp,"ITEM: TIMESTEP\n{}\nITEM: NUMBER OF ATOMS\n{}\n", update->ntimestep, ndump);
 
-  fmt::print(fp,"ITEM: BOX BOUNDS abc origin {}\n"
+  utils::print(fp,"ITEM: BOX BOUNDS abc origin {}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e} {:>1.16e}\n",
@@ -708,7 +708,7 @@ void DumpCustom::header_item_triclinic_general(bigint ndump)
              domain->bvec[0],domain->bvec[1],domain->bvec[2],domain->boxlo[1],
              domain->cvec[0],domain->cvec[1],domain->cvec[2],domain->boxlo[2]);
 
-  fmt::print(fp,"ITEM: ATOMS {}\n",columns);
+  utils::print(fp,"ITEM: ATOMS {}\n",columns);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/dump_grid.cpp b/src/dump_grid.cpp
index b052712e95..2d3d9a105b 100644
--- a/src/dump_grid.cpp
+++ b/src/dump_grid.cpp
@@ -431,19 +431,19 @@ void DumpGrid::header_item(bigint /*ndump*/)
 {
   if (unit_flag && !unit_count) {
     ++unit_count;
-    fmt::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
+    utils::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
   }
-  if (time_flag) fmt::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
+  if (time_flag) utils::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
 
-  fmt::print(fp,"ITEM: TIMESTEP\n{}\n",update->ntimestep);
-  fmt::print(fp,"ITEM: BOX BOUNDS {}\n"
+  utils::print(fp,"ITEM: TIMESTEP\n{}\n",update->ntimestep);
+  utils::print(fp,"ITEM: BOX BOUNDS {}\n"
              "{:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e}\n",
              boundstr,boxxlo,boxxhi,boxylo,boxyhi,boxzlo,boxzhi);
-  fmt::print(fp,"ITEM: DIMENSION\n{}\n",domain->dimension);
-  fmt::print(fp,"ITEM: GRID SIZE nx ny nz\n{} {} {}\n",nxgrid,nygrid,nzgrid);
-  fmt::print(fp,"ITEM: GRID CELLS {}\n",columns);
+  utils::print(fp,"ITEM: DIMENSION\n{}\n",domain->dimension);
+  utils::print(fp,"ITEM: GRID SIZE nx ny nz\n{} {} {}\n",nxgrid,nygrid,nzgrid);
+  utils::print(fp,"ITEM: GRID CELLS {}\n",columns);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -452,19 +452,19 @@ void DumpGrid::header_item_triclinic(bigint /*ndump*/)
 {
   if (unit_flag && !unit_count) {
     ++unit_count;
-    fmt::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
+    utils::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
   }
-  if (time_flag) fmt::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
+  if (time_flag) utils::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
 
-  fmt::print(fp,"ITEM: TIMESTEP\n{}\n",update->ntimestep);
-  fmt::print(fp,"ITEM: BOX BOUNDS xy xz yz {}\n"
+  utils::print(fp,"ITEM: TIMESTEP\n{}\n",update->ntimestep);
+  utils::print(fp,"ITEM: BOX BOUNDS xy xz yz {}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e}\n"
              "{:>1.16e} {:>1.16e} {:>1.16e}\n",
              boundstr,boxxlo,boxxhi,boxxy,boxylo,boxyhi,boxxz,boxzlo,boxzhi,boxyz);
-  fmt::print(fp,"ITEM: DIMENSION\n{}\n",domain->dimension);
-  fmt::print(fp,"ITEM: GRID SIZE nx ny nz\n{} {} {}\n",nxgrid,nygrid,nzgrid);
-  fmt::print(fp,"ITEM: GRID CELLS {}\n",columns);
+  utils::print(fp,"ITEM: DIMENSION\n{}\n",domain->dimension);
+  utils::print(fp,"ITEM: GRID SIZE nx ny nz\n{} {} {}\n",nxgrid,nygrid,nzgrid);
+  utils::print(fp,"ITEM: GRID CELLS {}\n",columns);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/dump_image.cpp b/src/dump_image.cpp
index 9610ef4d9a..abf4d366d5 100644
--- a/src/dump_image.cpp
+++ b/src/dump_image.cpp
@@ -184,12 +184,12 @@ DumpImage::DumpImage(LAMMPS *lmp, int narg, char **arg) :
 
       char *id;
       int igrid,idata,index;
-      int iflag =
-        utils::check_grid_reference((char *) "Dump image",
-                                    arg[iarg+1],nevery,id,
-                                    igrid,idata,index,lmp);
+      int iflag = utils::check_grid_reference((char *) "Dump image", arg[iarg+1], nevery, id,
+                                              igrid,idata,index,lmp);
       if (iflag < 0) error->all(FLERR,"Invalid grid reference in dump image command");
 
+      delete[] id_grid_compute;
+      delete[] id_grid_fix;
       if (iflag == ArgInfo::COMPUTE) id_grid_compute = utils::strdup(id);
       else if (iflag == ArgInfo::FIX) id_grid_fix = utils::strdup(id);
       delete[] id;
@@ -252,6 +252,7 @@ DumpImage::DumpImage(LAMMPS *lmp, int narg, char **arg) :
     } else if (strcmp(arg[iarg],"view") == 0) {
       if (iarg+3 > narg) error->all(FLERR,"Illegal dump image command");
       if (utils::strmatch(arg[iarg+1],"^v_")) {
+        delete[] thetastr;
         thetastr = utils::strdup(arg[iarg+1]+2);
       } else {
         const double theta = utils::numeric(FLERR,arg[iarg+1],false,lmp);
@@ -260,6 +261,7 @@ DumpImage::DumpImage(LAMMPS *lmp, int narg, char **arg) :
         image->theta = DEG2RAD * theta;
       }
       if (utils::strmatch(arg[iarg+2],"^v_")) {
+        delete[] phistr;
         phistr = utils::strdup(arg[iarg+2]+2);
       } else {
         image->phi = DEG2RAD * utils::numeric(FLERR,arg[iarg+2],false,lmp);
@@ -272,14 +274,17 @@ DumpImage::DumpImage(LAMMPS *lmp, int narg, char **arg) :
       else if (strcmp(arg[iarg+1],"d") == 0) cflag = DYNAMIC;
       else error->all(FLERR,"Illegal dump image command");
       if (utils::strmatch(arg[iarg+2],"^v_")) {
+        delete[] cxstr;
         cxstr = utils::strdup(arg[iarg+2]+2);
         cflag = DYNAMIC;
       } else cx = utils::numeric(FLERR,arg[iarg+2],false,lmp);
       if (utils::strmatch(arg[iarg+3],"^v_")) {
+        delete[] cystr;
         cystr = utils::strdup(arg[iarg+3]+2);
         cflag = DYNAMIC;
       } else cy = utils::numeric(FLERR,arg[iarg+3],false,lmp);
       if (utils::strmatch(arg[iarg+4],"^v_")) {
+        delete[] czstr;
         czstr = utils::strdup(arg[iarg+4]+2);
         cflag = DYNAMIC;
       } else cz = utils::numeric(FLERR,arg[iarg+4],false,lmp);
@@ -288,12 +293,15 @@ DumpImage::DumpImage(LAMMPS *lmp, int narg, char **arg) :
     } else if (strcmp(arg[iarg],"up") == 0) {
       if (iarg+4 > narg) error->all(FLERR,"Illegal dump image command");
       if (utils::strmatch(arg[iarg+1],"^v_")) {
+        delete[] upxstr;
         upxstr = utils::strdup(arg[iarg+1]+2);
       } else image->up[0] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
       if (utils::strmatch(arg[iarg+2],"^v_")) {
+        delete[] upystr;
         upystr = utils::strdup(arg[iarg+2]+2);
       } else image->up[1] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
       if (utils::strmatch(arg[iarg+3],"^v_")) {
+        delete[] upzstr;
         upzstr = utils::strdup(arg[iarg+3]+2);
       } else image->up[2] = utils::numeric(FLERR,arg[iarg+3],false,lmp);
       iarg += 4;
@@ -301,6 +309,7 @@ DumpImage::DumpImage(LAMMPS *lmp, int narg, char **arg) :
     } else if (strcmp(arg[iarg],"zoom") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal dump image command");
       if (utils::strmatch(arg[iarg+1],"^v_")) {
+        delete[] zoomstr;
         zoomstr = utils::strdup(arg[iarg+1]+2);
       } else {
         double zoom = utils::numeric(FLERR,arg[iarg+1],false,lmp);
diff --git a/src/dump_local.cpp b/src/dump_local.cpp
index bcf2a3a757..7394ec4481 100644
--- a/src/dump_local.cpp
+++ b/src/dump_local.cpp
@@ -292,29 +292,29 @@ void DumpLocal::write_header(bigint ndump)
   if (me == 0) {
     if (unit_flag && !unit_count) {
       ++unit_count;
-      fmt::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
+      utils::print(fp,"ITEM: UNITS\n{}\n",update->unit_style);
     }
-    if (time_flag) fmt::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
+    if (time_flag) utils::print(fp,"ITEM: TIME\n{:.16}\n",compute_time());
 
-    fmt::print(fp,"ITEM: TIMESTEP\n{}\n"
+    utils::print(fp,"ITEM: TIMESTEP\n{}\n"
                "ITEM: NUMBER OF {}\n{}\n",
                update->ntimestep, label, ndump);
 
     if (domain->triclinic) {
-      fmt::print(fp,"ITEM: BOX BOUNDS xy xz yz {}\n"
+      utils::print(fp,"ITEM: BOX BOUNDS xy xz yz {}\n"
                  "{:>1.16e} {:>1.16e} {:>1.16e}\n"
                  "{:>1.16e} {:>1.16e} {:>1.16e}\n"
                  "{:>1.16e} {:>1.16e} {:>1.16e}\n",
                  boundstr,boxxlo,boxxhi,boxxy,boxylo,boxyhi,boxxz,boxzlo,boxzhi,boxyz);
     } else {
-      fmt::print(fp,"ITEM: BOX BOUNDS {}\n"
+      utils::print(fp,"ITEM: BOX BOUNDS {}\n"
                  "{:>1.16e} {:>1.16e}\n"
                  "{:>1.16e} {:>1.16e}\n"
                  "{:>1.16e} {:>1.16e}\n",
                  boundstr,boxxlo,boxxhi,boxylo,boxyhi,boxzlo,boxzhi);
     }
 
-    fmt::print(fp,"ITEM: {} {}\n", label, columns);
+    utils::print(fp,"ITEM: {} {}\n", label, columns);
   }
 }
 
diff --git a/src/dump_xyz.cpp b/src/dump_xyz.cpp
index 84a8ead6fd..69892888bc 100644
--- a/src/dump_xyz.cpp
+++ b/src/dump_xyz.cpp
@@ -161,8 +161,7 @@ void DumpXYZ::write_header(bigint n)
 
     auto header = fmt::format("{}\n Atoms. Timestep: {}", n, update->ntimestep);
     if (time_flag) header += fmt::format(" Time: {:.6f}", compute_time());
-    header += "\n";
-    fmt::print(fp, header);
+    utils::print(fp, header + "\n");
   }
 }
 
diff --git a/src/error.cpp b/src/error.cpp
index e591091b35..7322c2146d 100644
--- a/src/error.cpp
+++ b/src/error.cpp
@@ -103,7 +103,7 @@ void Error::universe_warn(const std::string &file, int line, const std::string &
   ++numwarn;
   if ((maxwarn != 0) && ((numwarn > maxwarn) || (allwarn > maxwarn) || (maxwarn < 0))) return;
   if (universe->uscreen)
-    fmt::print(universe->uscreen,"WARNING on proc {}: {} ({}:{})\n",
+    utils::print(universe->uscreen,"WARNING on proc {}: {} ({}:{})\n",
                universe->me,str,truncpath(file),line);
 }
 
@@ -114,37 +114,27 @@ void Error::universe_warn(const std::string &file, int line, const std::string &
    force MPI_Abort if running in multi-partition mode
 ------------------------------------------------------------------------- */
 
-void Error::all(const std::string &file, int line, const std::string &str)
+void Error::all(const std::string &file, int line, int failed, const std::string &str)
 {
   MPI_Barrier(world);
 
-  int me;
   std::string lastcmd = "(unknown)";
+  std::string mesg = "ERROR: " + str + fmt::format(" ({}:{})\n",  truncpath(file), line);
 
-  MPI_Comm_rank(world,&me);
+  // add text about the input following the error message
 
-  if (me == 0) {
-    std::string mesg = "ERROR: " + str;
-    if (input && input->line) lastcmd = input->line;
-    try {
-      mesg += fmt::format(" ({}:{})\nLast command: {}\n", truncpath(file),line,lastcmd);
-    } catch (fmt::format_error &) {
-      ; // do nothing
-    }
-    utils::logmesg(lmp,mesg);
-  }
+  if (failed > NOLASTLINE) mesg += utils::point_to_error(input, failed);
+  if (comm->me == 0) utils::logmesg(lmp,mesg);
 
   // allow commands if an exception was caught in a run
   // update may be a null pointer when catching command-line errors
 
   if (update) update->whichflag = 0;
 
-  std::string msg = fmt::format("ERROR: {} ({}:{})\n", str, truncpath(file), line);
-
   if (universe->nworlds > 1)
-    throw LAMMPSAbortException(msg, universe->uworld);
+    throw LAMMPSAbortException(mesg, universe->uworld);
   else
-    throw LAMMPSException(msg);
+    throw LAMMPSException(mesg);
 }
 
 /* ----------------------------------------------------------------------
@@ -154,15 +144,13 @@ void Error::all(const std::string &file, int line, const std::string &str)
    forces abort of entire world (and universe) if any proc in world calls
 ------------------------------------------------------------------------- */
 
-void Error::one(const std::string &file, int line, const std::string &str)
+void Error::one(const std::string &file, int line, int failed, const std::string &str)
 {
-  int me;
   std::string lastcmd = "(unknown)";
-  MPI_Comm_rank(world,&me);
 
-  if (input && input->line) lastcmd = input->line;
-  std::string mesg = fmt::format("ERROR on proc {}: {} ({}:{})\nLast command: {}\n",
-                                 me,str,truncpath(file),line,lastcmd);
+  std::string mesg = fmt::format("ERROR on proc {}: {} ({}:{})\n", comm->me, str,
+                                 truncpath(file), line);
+  if (failed > NOPOINTER) mesg += utils::point_to_error(input, failed);
   utils::logmesg(lmp,mesg);
 
   if (universe->nworlds > 1)
@@ -177,27 +165,27 @@ void Error::one(const std::string &file, int line, const std::string &str)
 }
 
 /* ----------------------------------------------------------------------
-   forward vararg version to single string version
+   forward vararg versions to single string version
 ------------------------------------------------------------------------- */
 
-void Error::_all(const std::string &file, int line, fmt::string_view format,
+void Error::_all(const std::string &file, int line, int failed, fmt::string_view format,
                  fmt::format_args args)
 {
   try {
-    all(file,line,fmt::vformat(format, args));
+    all(file, line, failed, fmt::vformat(format, args));
   } catch (fmt::format_error &e) {
-    all(file,line,e.what());
+    all(file, line, NOPOINTER, e.what());
   }
   exit(1); // to trick "smart" compilers into believing this does not return
 }
 
-void Error::_one(const std::string &file, int line, fmt::string_view format,
+void Error::_one(const std::string &file, int line, int failed, fmt::string_view format,
                  fmt::format_args args)
 {
   try {
-    one(file,line,fmt::vformat(format, args));
+    one(file, line, failed, fmt::vformat(format, args));
   } catch (fmt::format_error &e) {
-    one(file,line,e.what());
+    one(file, line, NOPOINTER, e.what());
   }
   exit(1); // to trick "smart" compilers into believing this does not return
 }
diff --git a/src/error.h b/src/error.h
index 805bd4cd0d..0c446667e0 100644
--- a/src/error.h
+++ b/src/error.h
@@ -27,18 +27,50 @@ class Error : protected Pointers {
   [[noreturn]] void universe_one(const std::string &, int, const std::string &);
   void universe_warn(const std::string &, int, const std::string &);
 
-  [[noreturn]] void all(const std::string &, int, const std::string &);
-  template <typename... Args>
-  [[noreturn]] void all(const std::string &file, int line, const std::string &format, Args &&...args)
+  static constexpr int NOPOINTER = -2;
+  static constexpr int NOLASTLINE = -3;
+
+  // regular error calls
+
+  [[noreturn]] void all(const std::string &file, int line, const std::string &str)
   {
-    _all(file, line, format, fmt::make_format_args(args...));
+    all(file, line, NOPOINTER, str);
   }
 
-  [[noreturn]] void one(const std::string &, int, const std::string &);
   template <typename... Args>
-  [[noreturn]] void one(const std::string &file, int line, const std::string &format, Args &&...args)
+  [[noreturn]] void all(const std::string &file, int line, const std::string &format,
+                        Args &&...args)
   {
-    _one(file, line, format, fmt::make_format_args(args...));
+    _all(file, line, NOPOINTER, format, fmt::make_format_args(args...));
+  }
+
+  [[noreturn]] void one(const std::string &file, int line, const std::string &str)
+  {
+    one(file, line, NOPOINTER, str);
+  }
+
+  template <typename... Args>
+  [[noreturn]] void one(const std::string &file, int line, const std::string &format,
+                        Args &&...args)
+  {
+    _one(file, line, NOPOINTER, format, fmt::make_format_args(args...));
+  }
+
+  // overloaded error calls indicating faulty argument in command line
+  [[noreturn]] void all(const std::string &, int, int, const std::string &);
+  template <typename... Args>
+  [[noreturn]] void all(const std::string &file, int line, int failed, const std::string &format,
+                        Args &&...args)
+  {
+    _all(file, line, failed, format, fmt::make_format_args(args...));
+  }
+
+  [[noreturn]] void one(const std::string &, int, int, const std::string &);
+  template <typename... Args>
+  [[noreturn]] void one(const std::string &file, int line, int failed, const std::string &format,
+                        Args &&...args)
+  {
+    _one(file, line, failed, format, fmt::make_format_args(args...));
   }
 
   void warning(const std::string &, int, const std::string &);
@@ -72,8 +104,8 @@ class Error : protected Pointers {
 
   int numwarn, maxwarn, allwarn;
   // internal versions that accept explicit fmtlib arguments
-  [[noreturn]] void _all(const std::string &, int, fmt::string_view, fmt::format_args args);
-  [[noreturn]] void _one(const std::string &, int, fmt::string_view, fmt::format_args args);
+  [[noreturn]] void _all(const std::string &, int, int, fmt::string_view, fmt::format_args args);
+  [[noreturn]] void _one(const std::string &, int, int, fmt::string_view, fmt::format_args args);
   void _warning(const std::string &, int, fmt::string_view, fmt::format_args args);
   void _message(const std::string &, int, fmt::string_view, fmt::format_args args);
 };
diff --git a/src/fix.h b/src/fix.h
index 7609caf5fe..ebf5224171 100644
--- a/src/fix.h
+++ b/src/fix.h
@@ -264,6 +264,8 @@ class Fix : protected Pointers {
 
   virtual double memory_usage() { return 0.0; }
 
+  void set_copymode(int value) { copymode = value; }
+
  protected:
   int instance_me;    // which Fix class instantiation I am
 
diff --git a/src/fix_addforce.cpp b/src/fix_addforce.cpp
index 4920d57f4a..8625080ee9 100644
--- a/src/fix_addforce.cpp
+++ b/src/fix_addforce.cpp
@@ -85,11 +85,13 @@ FixAddForce::FixAddForce(LAMMPS *lmp, int narg, char **arg) :
       if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, "fix addforce region", error);
       region = domain->get_region_by_id(arg[iarg + 1]);
       if (!region) error->all(FLERR, "Region {} for fix addforce does not exist", arg[iarg + 1]);
+      delete[] idregion;
       idregion = utils::strdup(arg[iarg + 1]);
       iarg += 2;
     } else if (strcmp(arg[iarg], "energy") == 0) {
       if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, "fix addforce energy", error);
       if (utils::strmatch(arg[iarg + 1], "^v_")) {
+        delete[] estr;
         estr = utils::strdup(arg[iarg + 1] + 2);
       } else
         error->all(FLERR, "Invalid fix addforce energy argument: {}", arg[iarg + 1]);
diff --git a/src/fix_ave_chunk.cpp b/src/fix_ave_chunk.cpp
index 6a3c2e2032..f5dc36eac9 100644
--- a/src/fix_ave_chunk.cpp
+++ b/src/fix_ave_chunk.cpp
@@ -902,7 +902,7 @@ void FixAveChunk::end_of_step()
     if (overwrite) (void) platform::fseek(fp,filepos);
     double count = 0.0;
     for (m = 0; m < nchunk; m++) count += count_total[m];
-    fmt::print(fp,"{} {} {}\n",ntimestep,nchunk,count);
+    utils::print(fp,"{} {} {}\n",ntimestep,nchunk,count);
 
     int compress = cchunk->compress;
     int *chunkID = cchunk->chunkID;
diff --git a/src/fix_ave_correlate.cpp b/src/fix_ave_correlate.cpp
index 08cd673122..a99ffd3d68 100644
--- a/src/fix_ave_correlate.cpp
+++ b/src/fix_ave_correlate.cpp
@@ -466,7 +466,7 @@ void FixAveCorrelate::end_of_step()
   if (fp && comm->me == 0) {
     clearerr(fp);
     if (overwrite) platform::fseek(fp,filepos);
-    fmt::print(fp,"{} {}\n",ntimestep,nrepeat);
+    utils::print(fp,"{} {}\n",ntimestep,nrepeat);
     for (i = 0; i < nrepeat; i++) {
       fprintf(fp,"%d %d %d",i+1,i*nevery,count[i]);
       if (count[i])
diff --git a/src/fix_ave_grid.cpp b/src/fix_ave_grid.cpp
index 1b69c5644c..471d4191e7 100644
--- a/src/fix_ave_grid.cpp
+++ b/src/fix_ave_grid.cpp
@@ -42,19 +42,20 @@ enum{DISCARD,KEEP};
 
 static constexpr int OFFSET = 16384;
 
+// clang-format on
 /* ---------------------------------------------------------------------- */
 
 FixAveGrid::FixAveGrid(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg), id_bias(nullptr), which(nullptr), argindex(nullptr), ids(nullptr),
-  value2index(nullptr), value2grid(nullptr), value2data(nullptr), grid2d(nullptr), grid3d(nullptr),
-  grid_buf1(nullptr), grid_buf2(nullptr), grid_output(nullptr), grid_sample(nullptr),
-  grid_nfreq(nullptr), grid_running(nullptr), grid_window(nullptr), grid2d_previous(nullptr),
-  grid3d_previous(nullptr), grid_sample_previous(nullptr), grid_nfreq_previous(nullptr),
-  grid_running_previous(nullptr), grid_window_previous(nullptr), bin(nullptr), skip(nullptr),
-  vresult(nullptr)
+    Fix(lmp, narg, arg), id_bias(nullptr), which(nullptr), argindex(nullptr), ids(nullptr),
+    value2index(nullptr), value2grid(nullptr), value2data(nullptr), grid2d(nullptr),
+    grid3d(nullptr), grid_buf1(nullptr), grid_buf2(nullptr), grid_output(nullptr),
+    grid_sample(nullptr), grid_nfreq(nullptr), grid_running(nullptr), grid_window(nullptr),
+    grid2d_previous(nullptr), grid3d_previous(nullptr), grid_sample_previous(nullptr),
+    grid_nfreq_previous(nullptr), grid_running_previous(nullptr), grid_window_previous(nullptr),
+    bin(nullptr), skip(nullptr), vresult(nullptr)
 {
-  if (narg < 10) utils::missing_cmd_args(FLERR,"fix ave/grid", error);
-
+  if (narg < 10) utils::missing_cmd_args(FLERR, "fix ave/grid", error);
+  // clang-format off
   pergrid_flag = 1;
   nevery = utils::inumeric(FLERR,arg[3],false,lmp);
   nrepeat = utils::inumeric(FLERR,arg[4],false,lmp);
@@ -193,7 +194,6 @@ FixAveGrid::FixAveGrid(LAMMPS *lmp, int narg, char **arg) :
   aveflag = ONE;
   nwindow = 0;
   biasflag = 0;
-  id_bias = nullptr;
   adof = domain->dimension;
   cdof = 0.0;
 
@@ -231,6 +231,7 @@ FixAveGrid::FixAveGrid(LAMMPS *lmp, int narg, char **arg) :
       if (iarg+2 > nargnew)
         error->all(FLERR,"Illegal fix ave/grid command");
       biasflag = 1;
+      delete[] id_bias;
       id_bias = utils::strdup(arg[iarg+1]);
       iarg += 2;
 
@@ -347,11 +348,7 @@ FixAveGrid::FixAveGrid(LAMMPS *lmp, int narg, char **arg) :
   // vresult for per-atom variable evaluation
 
   maxatom = 0;
-  bin = nullptr;
-  skip = nullptr;
-
   maxvar = 0;
-  vresult = nullptr;
 
   // nvalid = next step on which end_of_step does something
   // add nvalid to all computes that store invocation times
@@ -372,6 +369,7 @@ FixAveGrid::~FixAveGrid()
   delete[] argindex;
   for (int m = 0; m < nvalues; m++) delete[] ids[m];
   delete[] ids;
+  delete[] id_bias;
   delete[] value2index;
   delete[] value2grid;
   delete[] value2data;
diff --git a/src/fix_ave_histo.cpp b/src/fix_ave_histo.cpp
index 35fd3fc4f9..53db1e13c6 100644
--- a/src/fix_ave_histo.cpp
+++ b/src/fix_ave_histo.cpp
@@ -717,7 +717,7 @@ void FixAveHisto::end_of_step()
   if (fp && comm->me == 0) {
     clearerr(fp);
     if (overwrite) (void) platform::fseek(fp,filepos);
-    fmt::print(fp,"{} {} {} {} {} {}\n",ntimestep,nbins,
+    utils::print(fp,"{} {} {} {} {} {}\n",ntimestep,nbins,
             stats_total[0],stats_total[1],stats_total[2],stats_total[3]);
     if (stats_total[0] != 0.0)
       for (int i = 0; i < nbins; i++)
diff --git a/src/fix_ave_histo_weight.cpp b/src/fix_ave_histo_weight.cpp
index c6f9b6ad22..e5a47881a5 100644
--- a/src/fix_ave_histo_weight.cpp
+++ b/src/fix_ave_histo_weight.cpp
@@ -454,7 +454,7 @@ void FixAveHistoWeight::end_of_step()
   if (fp && comm->me == 0) {
     clearerr(fp);
     if (overwrite) (void) platform::fseek(fp,filepos);
-    fmt::print(fp,"{} {} {} {} {} {}\n",ntimestep,nbins,
+    utils::print(fp,"{} {} {} {} {} {}\n",ntimestep,nbins,
             stats_total[0],stats_total[1],stats_total[2],stats_total[3]);
     if (stats_total[0] != 0.0)
       for (int i = 0; i < nbins; i++)
diff --git a/src/fix_ave_time.cpp b/src/fix_ave_time.cpp
index 72ff8ab6c1..ce6587dc27 100644
--- a/src/fix_ave_time.cpp
+++ b/src/fix_ave_time.cpp
@@ -60,7 +60,9 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
   // then read options so know mode = SCALAR/VECTOR before re-reading values
 
   nvalues = 0;
-  int iarg = 6;
+  // the first six arguments have fixed positions
+  const int ioffset = 6;
+  int iarg = ioffset;
   while (iarg < narg) {
     if (utils::strmatch(arg[iarg],"^[cfv]_")) {
       nvalues++;
@@ -68,9 +70,10 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
     } else break;
   }
   if (nvalues == 0)
-    error->all(FLERR,"No values from computes, fixes, or variables used in fix ave/time command");
+    error->all(FLERR, ioffset,
+               "No values from computes, fixes, or variables used in fix ave/time command");
 
-  // parse optional keywords
+  // parse optional keywords which must follow the data
 
   options(iarg,narg,arg);
 
@@ -79,10 +82,11 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
 
   int expand = 0;
   char **earg;
-  nvalues = utils::expand_args(FLERR,nvalues,&arg[6],mode,earg,lmp);
+  int *amap = nullptr;
+  nvalues = utils::expand_args(FLERR,nvalues,&arg[ioffset],mode,earg,lmp,&amap);
   key2col.clear();
 
-  if (earg != &arg[6]) expand = 1;
+  if (earg != &arg[ioffset]) expand = 1;
   arg = earg;
 
   // parse values
@@ -97,9 +101,11 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
     key2col[arg[i]] = i;
 
     if ((val.which == ArgInfo::NONE) || (val.which == ArgInfo::UNKNOWN) || (argi.get_dim() > 1))
-      error->all(FLERR,"Invalid fix ave/time argument: {}", arg[i]);
+      error->all(FLERR, amap[i] + ioffset,"Invalid fix ave/time argument: {}", arg[i]);
 
     val.argindex = argi.get_index1();
+    if (expand) val.iarg = amap[i] + ioffset;
+    else val.iarg = i + ioffset;
     val.varlen = 0;
     val.offcol = 0;
     val.id = argi.get_name();
@@ -115,16 +121,16 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
   for (int i = 0; i < noff; i++) {
     if (offlist[i] < 1 || offlist[i] > nvalues)
       error->all(FLERR,"Invalid fix ave/time off column: {}", offlist[i]);
-    values[offlist[i]-1].offcol = 1;
+    values[offlist[i] - 1].offcol = 1;
   }
 
   // setup and error check
   // for fix inputs, check that fix frequency is acceptable
   // set variable_length if any compute is variable length
 
-  if (nevery <= 0) error->all(FLERR,"Illegal fix ave/time nevery value: {}", nevery);
-  if (nrepeat <= 0) error->all(FLERR,"Illegal fix ave/time nrepeat value: {}", nrepeat);
-  if (nfreq <= 0) error->all(FLERR,"Illegal fix ave/time nfreq value: {}", nfreq);
+  if (nevery <= 0) error->all(FLERR, 3, "Illegal fix ave/time nevery value: {}", nevery);
+  if (nrepeat <= 0) error->all(FLERR, 4, "Illegal fix ave/time nrepeat value: {}", nrepeat);
+  if (nfreq <= 0) error->all(FLERR, 5, "Illegal fix ave/time nfreq value: {}", nfreq);
   if (nfreq % nevery || nrepeat*nevery > nfreq)
     error->all(FLERR,"Inconsistent fix ave/time nevery/nrepeat/nfreq values");
   if (ave != RUNNING && overwrite)
@@ -134,25 +140,29 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
 
     if ((val.which == ArgInfo::COMPUTE) && (mode == SCALAR)) {
       val.val.c = modify->get_compute_by_id(val.id);
-      if (!val.val.c) error->all(FLERR,"Compute ID {} for fix ave/time does not exist", val.id);
+      if (!val.val.c)
+        error->all(FLERR, val.iarg, "Compute ID {} for fix ave/time does not exist", val.id);
       if (val.argindex == 0 && (val.val.c->scalar_flag == 0))
-        error->all(FLERR,"Fix ave/time compute {} does not calculate a scalar", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time compute {} does not calculate a scalar", val.id);
       if (val.argindex && (val.val.c->vector_flag == 0))
-        error->all(FLERR,"Fix ave/time compute {} does not calculate a vector", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time compute {} does not calculate a vector", val.id);
       if (val.argindex && (val.argindex > val.val.c->size_vector) &&
           (val.val.c->size_vector_variable == 0))
-        error->all(FLERR, "Fix ave/time compute {} vector is accessed out-of-range", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time compute {} vector is accessed out-of-range",
+                   val.id);
       if (val.argindex && val.val.c->size_vector_variable) val.varlen = 1;
 
     } else if ((val.which == ArgInfo::COMPUTE) && (mode == VECTOR)) {
       val.val.c = modify->get_compute_by_id(val.id);
-      if (!val.val.c) error->all(FLERR,"Compute ID {} for fix ave/time does not exist", val.id);
+      if (!val.val.c)
+        error->all(FLERR, val.iarg, "Compute ID {} for fix ave/time does not exist", val.id);
       if ((val.argindex == 0) && (val.val.c->vector_flag == 0))
-        error->all(FLERR,"Fix ave/time compute {} does not calculate a vector", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time compute {} does not calculate a vector", val.id);
       if (val.argindex && (val.val.c->array_flag == 0))
-        error->all(FLERR,"Fix ave/time compute {} does not calculate an array", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time compute {} does not calculate an array", val.id);
       if (val.argindex && (val.argindex > val.val.c->size_array_cols))
-        error->all(FLERR,"Fix ave/time compute {} array is accessed out-of-range", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time compute {} array is accessed out-of-range",
+                   val.id);
       if ((val.argindex == 0) && (val.val.c->size_vector_variable)) val.varlen = 1;
       if (val.argindex && (val.val.c->size_array_rows_variable)) val.varlen = 1;
 
@@ -160,47 +170,54 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
       val.val.f = modify->get_fix_by_id(val.id);
       if (!val.val.f) error->all(FLERR,"Fix ID {} for fix ave/time does not exist", val.id);
       if ((val.argindex == 0) && (val.val.f->scalar_flag == 0))
-        error->all(FLERR,"Fix ave/time fix {} does not calculate a scalar", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time fix {} does not calculate a scalar", val.id);
       if (val.argindex && (val.val.f->vector_flag == 0))
-        error->all(FLERR,"Fix ave/time fix {} does not calculate a vector", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time fix {} does not calculate a vector", val.id);
       if (val.argindex && (val.val.f->size_vector_variable))
-        error->all(FLERR,"Fix ave/time fix {} vector cannot be variable length", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time fix {} vector cannot be variable length", val.id);
       if (val.argindex && (val.argindex > val.val.f->size_vector))
-        error->all(FLERR,"Fix ave/time fix {} vector is accessed out-of-range", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time fix {} vector is accessed out-of-range", val.id);
       if (nevery % val.val.f->global_freq)
-        error->all(FLERR, "Fix {} for fix ave/time not computed at compatible time", val.id);
+        error->all(FLERR, val.iarg, "Fix {} for fix ave/time not computed at compatible time",
+                   val.id);
 
     } else if ((val.which == ArgInfo::FIX) && (mode == VECTOR)) {
       val.val.f = modify->get_fix_by_id(val.id);
-      if (!val.val.f) error->all(FLERR,"Fix ID {} for fix ave/time does not exist", val.id);
+      if (!val.val.f)
+        error->all(FLERR, val.iarg, "Fix ID {} for fix ave/time does not exist", val.id);
       if ((val.argindex == 0) && (val.val.f->vector_flag == 0))
-        error->all(FLERR,"Fix ave/time fix {} does not calculate a vector", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time fix {} does not calculate a vector", val.id);
       if (val.argindex && (val.val.f->array_flag == 0))
-        error->all(FLERR,"Fix ave/time fix {} does not calculate an array", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time fix {} does not calculate an array", val.id);
       if (val.argindex && (val.val.f->size_array_rows_variable))
-        error->all(FLERR,"Fix ave/time fix {} array cannot have variable row length", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time fix {} array cannot have variable row length",
+                   val.id);
       if (val.argindex && (val.argindex > val.val.f->size_array_cols))
-        error->all(FLERR,"Fix ave/time fix {} array is accessed out-of-range", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time fix {} array is accessed out-of-range", val.id);
       if (nevery % val.val.f->global_freq)
-        error->all(FLERR, "Fix {} for fix ave/time not computed at compatible time", val.id);
+        error->all(FLERR, val.iarg, "Fix {} for fix ave/time not computed at compatible time",
+                   val.id);
 
     } else if ((val.which == ArgInfo::VARIABLE) && (mode == SCALAR)) {
       int ivariable = input->variable->find(val.id.c_str());
       if (ivariable < 0)
-        error->all(FLERR,"Variable name {} for fix ave/time does not exist", val.id);
+        error->all(FLERR, val.iarg, "Variable name {} for fix ave/time does not exist", val.id);
       if ((val.argindex == 0) && (input->variable->equalstyle(ivariable) == 0))
-        error->all(FLERR,"Fix ave/time variable {} is not equal-style variable", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time variable {} is not equal-style variable", val.id);
       if ((val.argindex) && (input->variable->vectorstyle(ivariable) == 0))
-        error->all(FLERR,"Fix ave/time variable {} is not vector-style variable", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time variable {} is not vector-style variable",
+                   val.id);
 
     } else if ((val.which == ArgInfo::VARIABLE) && (mode == VECTOR)) {
       int ivariable = input->variable->find(val.id.c_str());
       if (ivariable < 0)
-        error->all(FLERR,"Variable name {} for fix ave/time does not exist", val.id);
+        error->all(FLERR, val.iarg, "Variable name {} for fix ave/time does not exist", val.id);
       if ((val.argindex == 0) && (input->variable->vectorstyle(ivariable) == 0))
-        error->all(FLERR,"Fix ave/time variable {} is not vector-style variable", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time variable {} is not vector-style variable",
+                   val.id);
       if (val.argindex)
-        error->all(FLERR,"Fix ave/time mode vector variable {} cannot be indexed", val.id);
+        error->all(FLERR, val.iarg, "Fix ave/time mode vector variable {} cannot be indexed",
+                   val.id);
       val.varlen = 1;
     }
   }
@@ -258,7 +275,9 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
       fprintf(fp,"\n");
     }
     if (yaml_flag) fputs("---\n",fp);
-    if (ferror(fp)) error->one(FLERR,"Error writing file header: {}", utils::getsyserror());
+    if (ferror(fp))
+      error->one(FLERR, Error::NOLASTLINE, "Error writing fix ave/time ID {} file header: {}",
+                 id, utils::getsyserror());
     filepos = platform::ftell(fp);
   }
 
@@ -272,6 +291,7 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
   if (expand) {
     for (int i = 0; i < nvalues; i++) delete[] earg[i];
     memory->sfree(earg);
+    memory->sfree(amap);
   }
 
   // allocate memory for averaging
@@ -377,12 +397,12 @@ FixAveTime::FixAveTime(LAMMPS *lmp, int narg, char **arg) :
           extvalue = 0;
         }
         if (extvalue == -1)
-          error->all(FLERR,"Fix ave/time cannot set output array intensive/extensive "
-                     "from these inputs");
+          error->all(FLERR, Error::NOLASTLINE, "Fix ave/time cannot set output array "
+                     "intensive/extensive from these inputs");
         if (extarray < -1) extarray = extvalue;
         else if (extvalue != extarray)
-          error->all(FLERR,"Fix ave/time cannot set output array intensive/extensive "
-                     "from these inputs");
+          error->all(FLERR, Error::NOLASTLINE, "Fix ave/time cannot set output array "
+                     "intensive/extensive from these inputs");
       }
     }
   }
@@ -462,15 +482,17 @@ void FixAveTime::init()
     if (val.which == ArgInfo::COMPUTE) {
       val.val.c = modify->get_compute_by_id(val.id);
       if (!val.val.c)
-        error->all(FLERR,"Compute ID {} for fix ave/time does not exist", val.id);
+        error->all(FLERR, Error::NOLASTLINE, "Compute ID {} for fix ave/time does not exist",
+                   val.id);
     } else if (val.which == ArgInfo::FIX) {
       val.val.f = modify->get_fix_by_id(val.id);
       if (!val.val.f)
-        error->all(FLERR,"Fix ID {} for fix ave/time does not exist", val.id);
+        error->all(FLERR, Error::NOLASTLINE, "Fix ID {} for fix ave/time does not exist", val.id);
     } else if (val.which == ArgInfo::VARIABLE) {
       val.val.v = input->variable->find(val.id.c_str());
       if (val.val.v < 0)
-        error->all(FLERR,"Variable name {} for fix ave/time does not exist", val.id);
+        error->all(FLERR, Error::NOLASTLINE, "Variable name {} for fix ave/time does not exist",
+                   val.id);
     }
   }
 
@@ -645,24 +667,26 @@ void FixAveTime::invoke_scalar(bigint ntimestep)
       if (!yaml_header || overwrite) {
         yaml_header = true;
         fputs("keywords: ['Step', ", fp);
-        for (const auto &val : values) fmt::print(fp, "'{}', ", val.keyword);
+        for (const auto &val : values) utils::print(fp, "'{}', ", val.keyword);
         fputs("]\ndata:\n", fp);
       }
-      fmt::print(fp, "  - [{}, ", ntimestep);
-      for (i = 0; i < nvalues; i++) fmt::print(fp,"{}, ",vector_total[i]/norm);
+      utils::print(fp, "  - [{}, ", ntimestep);
+      for (i = 0; i < nvalues; i++) utils::print(fp,"{}, ",vector_total[i]/norm);
       fputs("]\n", fp);
     } else {
-      fmt::print(fp,"{}",ntimestep);
+      utils::print(fp,"{}",ntimestep);
       for (i = 0; i < nvalues; i++) fprintf(fp,format,vector_total[i]/norm);
       fprintf(fp,"\n");
-      if (ferror(fp)) error->one(FLERR,"Error writing out time averaged data");
+      if (ferror(fp))
+        error->one(FLERR, Error::NOLASTLINE, "Error writing out time averaged data: {}",
+                   utils::getsyserror());
     }
     fflush(fp);
 
     if (overwrite) {
       bigint fileend = platform::ftell(fp);
       if ((fileend > 0) && (platform::ftruncate(fp,fileend)))
-        error->warning(FLERR,"Error while tuncating output: {}", utils::getsyserror());
+        error->warning(FLERR, "Error while tuncating output: {}", utils::getsyserror());
     }
   }
 }
@@ -767,7 +791,8 @@ void FixAveTime::invoke_vector(bigint ntimestep)
       double *varvec;
       int nvec = input->variable->compute_vector(val.val.v,&varvec);
       if (nvec != nrows)
-        error->all(FLERR,"Fix ave/time vector-style variable {} changed length", val.id);
+        error->all(FLERR, Error::NOLASTLINE, "Fix ave/time vector-style variable {} changed length",
+                   val.id);
       for (int i = 0; i < nrows; i++)
         column[i] = varvec[i];
     }
@@ -860,17 +885,17 @@ void FixAveTime::invoke_vector(bigint ntimestep)
       if (!yaml_header || overwrite) {
         yaml_header = true;
         fputs("keywords: [", fp);
-        for (const auto &val : values) fmt::print(fp, "'{}', ", val.keyword);
+        for (const auto &val : values) utils::print(fp, "'{}', ", val.keyword);
         fputs("]\ndata:\n", fp);
       }
-      fmt::print(fp, "  {}:\n", ntimestep);
+      utils::print(fp, "  {}:\n", ntimestep);
       for (int i = 0; i < nrows; i++) {
         fputs("  - [", fp);
-        for (int j = 0; j < nvalues; j++) fmt::print(fp,"{}, ",array_total[i][j]/norm);
+        for (int j = 0; j < nvalues; j++) utils::print(fp,"{}, ",array_total[i][j]/norm);
         fputs("]\n", fp);
       }
     } else {
-      fmt::print(fp,"{} {}\n",ntimestep,nrows);
+      utils::print(fp,"{} {}\n",ntimestep,nrows);
       for (int i = 0; i < nrows; i++) {
         fprintf(fp,"%d",i+1);
         for (int j = 0; j < nvalues; j++) fprintf(fp,format,array_total[i][j]/norm);
@@ -881,7 +906,7 @@ void FixAveTime::invoke_vector(bigint ntimestep)
     if (overwrite) {
       bigint fileend = platform::ftell(fp);
       if ((fileend > 0) && (platform::ftruncate(fp,fileend)))
-        error->warning(FLERR,"Error while tuncating output: {}", utils::getsyserror());
+        error->warning(FLERR, "Error while tuncating output: {}", utils::getsyserror());
     }
   }
 }
@@ -912,7 +937,7 @@ int FixAveTime::column_length(int dynamic)
       }
       if (length == 0) length = lengthone;
       else if (lengthone != length)
-        error->all(FLERR,"Fix ave/time columns are inconsistent lengths");
+        error->all(FLERR, Error::NOLASTLINE, "Fix ave/time columns have inconsistent lengths");
     }
   }
 
@@ -935,10 +960,10 @@ int FixAveTime::column_length(int dynamic)
       if (all_variable_length) {
         if (length == 0) length = lengthone;
         else if (lengthone != length)
-          error->all(FLERR,"Fix ave/time columns are inconsistent lengths");
+          error->all(FLERR, Error::NOLASTLINE, "Fix ave/time columns have inconsistent lengths");
       } else {
         if (lengthone != nrows)
-          error->all(FLERR,"Fix ave/time columns are inconsistent lengths");
+          error->all(FLERR, Error::NOLASTLINE, "Fix ave/time columns have inconsistent lengths");
       }
     }
   }
@@ -1002,7 +1027,7 @@ int FixAveTime::modify_param(int narg, char **arg)
       }
     }
     if ((icol < 0) || (icol >= (int) values.size()))
-      error->all(FLERR, "Thermo_modify colname column {} invalid", arg[1]);
+      error->all(FLERR, 1 + 1, "Thermo_modify colname column {} invalid", arg[1]);
     values[icol].keyword = arg[2];
     return 3;
   }
@@ -1042,7 +1067,7 @@ void FixAveTime::options(int iarg, int narg, char **arg)
         if (strcmp(arg[iarg],"file") == 0) fp = fopen(arg[iarg+1],"w");
         else fp = fopen(arg[iarg+1],"a");
         if (fp == nullptr)
-          error->one(FLERR,"Cannot open fix ave/time file {}: {}",
+          error->one(FLERR, iarg+1, "Cannot open fix ave/time file {}: {}",
                      arg[iarg+1], utils::getsyserror());
       }
       iarg += 2;
@@ -1051,12 +1076,13 @@ void FixAveTime::options(int iarg, int narg, char **arg)
       if (strcmp(arg[iarg+1],"one") == 0) ave = ONE;
       else if (strcmp(arg[iarg+1],"running") == 0) ave = RUNNING;
       else if (strcmp(arg[iarg+1],"window") == 0) ave = WINDOW;
-      else error->all(FLERR,"Unknown fix ave/time ave keyword {}", arg[iarg+1]);
+      else error->all(FLERR, iarg+1, "Unknown fix ave/time ave keyword {}", arg[iarg+1]);
       if (ave == WINDOW) {
         if (iarg+3 > narg) utils::missing_cmd_args(FLERR, "fix ave/time ave window", error);
         nwindow = utils::inumeric(FLERR,arg[iarg+2],false,lmp);
         if (nwindow <= 0)
-          error->all(FLERR,"Illegal fix ave/time ave window argument {}; must be > 0", nwindow);
+          error->all(FLERR, iarg+2, "Illegal fix ave/time ave window argument {}; must be > 0",
+                     nwindow);
       }
       iarg += 2;
       if (ave == WINDOW) iarg++;
@@ -1068,7 +1094,7 @@ void FixAveTime::options(int iarg, int narg, char **arg)
       if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/time mode", error);
       if (strcmp(arg[iarg+1],"scalar") == 0) mode = SCALAR;
       else if (strcmp(arg[iarg+1],"vector") == 0) mode = VECTOR;
-      else error->all(FLERR,"Unknown fix ave/time mode {}", arg[iarg+1]);
+      else error->all(FLERR,iarg+1,"Unknown fix ave/time mode {}", arg[iarg+1]);
       iarg += 2;
     } else if (strcmp(arg[iarg],"off") == 0) {
       if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/time off", error);
diff --git a/src/fix_ave_time.h b/src/fix_ave_time.h
index 5b25a68ab5..5aedc4443d 100644
--- a/src/fix_ave_time.h
+++ b/src/fix_ave_time.h
@@ -43,6 +43,7 @@ class FixAveTime : public Fix {
   struct value_t {
     int which;       // type of data: COMPUTE, FIX, VARIABLE
     int argindex;    // 1-based index if data is vector, else 0
+    int iarg;        // argument index in original argument list
     int varlen;      // 1 if value is from variable-length compute
     int offcol;
     std::string id;         // compute/fix/variable ID
diff --git a/src/fix_langevin.cpp b/src/fix_langevin.cpp
index 4258e3359f..60a55bbbb4 100644
--- a/src/fix_langevin.cpp
+++ b/src/fix_langevin.cpp
@@ -191,6 +191,8 @@ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) :
 
 FixLangevin::~FixLangevin()
 {
+  if (copymode) return;
+
   delete random;
   delete[] tstr;
   delete[] gfactor1;
@@ -509,7 +511,7 @@ void FixLangevin::post_force(int /*vflag*/)
             else          post_force_templated<1,0,0,0,0,0>();
   else
     if (gjfflag)
-      if (tallyflag  || osflag)
+      if (tallyflag || osflag)
         if (tbiasflag == BIAS)
           if (rmass)
             if (zeroflag) post_force_templated<0,1,1,1,1,1>();
diff --git a/src/fix_print.cpp b/src/fix_print.cpp
index ccef03c3ae..1d60addfa4 100644
--- a/src/fix_print.cpp
+++ b/src/fix_print.cpp
@@ -38,7 +38,7 @@ FixPrint::FixPrint(LAMMPS *lmp, int narg, char **arg) :
     nevery = 1;
   } else {
     nevery = utils::inumeric(FLERR, arg[3], false, lmp);
-    if (nevery <= 0) error->all(FLERR, "Illegal fix print nevery value {}; must be > 0", nevery);
+    if (nevery <= 0) error->all(FLERR, 3, "Illegal fix print nevery value {}; must be > 0", nevery);
   }
 
   text = utils::strdup(arg[4]);
@@ -121,12 +121,15 @@ void FixPrint::init()
   if (var_print) {
     ivar_print = input->variable->find(var_print);
     if (ivar_print < 0)
-      error->all(FLERR, "Variable {} for fix print timestep does not exist", var_print);
+      error->all(FLERR, Error::NOLASTLINE, "Variable {} for fix print timestep does not exist",
+                 var_print);
     if (!input->variable->equalstyle(ivar_print))
-      error->all(FLERR, "Variable {} for fix print timestep is invalid style", var_print);
+      error->all(FLERR, Error::NOLASTLINE, "Variable {} for fix print timestep is invalid style",
+                 var_print);
     next_print = static_cast<bigint>(input->variable->compute_equal(ivar_print));
     if (next_print <= update->ntimestep)
-      error->all(FLERR, "Fix print timestep variable {} returned a bad timestep: {}", var_print,
+      error->all(FLERR, Error::NOLASTLINE,
+                 "Fix print timestep variable {} returned a bad timestep: {}", var_print,
                  next_print);
   } else {
     if (update->ntimestep % nevery)
@@ -178,7 +181,7 @@ void FixPrint::end_of_step()
   if (comm->me == 0) {
     if (screenflag) utils::logmesg(lmp, std::string(copy) + "\n");
     if (fp) {
-      fmt::print(fp, "{}\n", copy);
+      utils::print(fp, "{}\n", copy);
       fflush(fp);
     }
   }
diff --git a/src/fix_property_atom.cpp b/src/fix_property_atom.cpp
index f88d6551b4..d1cc65adad 100644
--- a/src/fix_property_atom.cpp
+++ b/src/fix_property_atom.cpp
@@ -543,7 +543,7 @@ void FixPropertyAtom::write_data_section(int /*mth*/, FILE *fp, int n, double **
         icol += ncol;
       }
     }
-    fmt::print(fp, line + "\n");
+    utils::print(fp, line + "\n");
   }
 }
 
diff --git a/src/fix_restrain.cpp b/src/fix_restrain.cpp
index cc95fc93f3..9b157ee90f 100644
--- a/src/fix_restrain.cpp
+++ b/src/fix_restrain.cpp
@@ -616,7 +616,7 @@ void FixRestrain::restrain_dihedral(int m)
                                     me,update->ntimestep,atom->tag[i1],
                                     atom->tag[i2],atom->tag[i3],atom->tag[i4]);
       error->warning(FLERR,str);
-      fmt::print(screen,"  1st atom: {} {} {} {}\n"
+      utils::print(screen,"  1st atom: {} {} {} {}\n"
                  "  2nd atom: {} {} {} {}\n"
                  "  3rd atom: {} {} {} {}\n"
                  "  4th atom: {} {} {} {}\n",
diff --git a/src/fmt/args.h b/src/fmt/args.h
index 3ff4788074..b77a2d0661 100644
--- a/src/fmt/args.h
+++ b/src/fmt/args.h
@@ -8,15 +8,14 @@
 #ifndef FMT_ARGS_H_
 #define FMT_ARGS_H_
 
-#ifndef FMT_MODULE
-#  include <functional>  // std::reference_wrapper
-#  include <memory>      // std::unique_ptr
-#  include <vector>
-#endif
+#include <functional>  // std::reference_wrapper
+#include <memory>      // std::unique_ptr
+#include <vector>
 
 #include "format.h"  // std_string_view
 
 FMT_BEGIN_NAMESPACE
+
 namespace detail {
 
 template <typename T> struct is_reference_wrapper : std::false_type {};
@@ -29,18 +28,15 @@ auto unwrap(const std::reference_wrapper<T>& v) -> const T& {
   return static_cast<const T&>(v);
 }
 
-// node is defined outside dynamic_arg_list to workaround a C2504 bug in MSVC
-// 2022 (v17.10.0).
-//
-// Workaround for clang's -Wweak-vtables. Unlike for regular classes, for
-// templates it doesn't complain about inability to deduce single translation
-// unit for placing vtable. So node is made a fake template.
-template <typename = void> struct node {
-  virtual ~node() = default;
-  std::unique_ptr<node<>> next;
-};
-
 class dynamic_arg_list {
+  // Workaround for clang's -Wweak-vtables. Unlike for regular classes, for
+  // templates it doesn't complain about inability to deduce single translation
+  // unit for placing vtable. So storage_node_base is made a fake template.
+  template <typename = void> struct node {
+    virtual ~node() = default;
+    std::unique_ptr<node<>> next;
+  };
+
   template <typename T> struct typed_node : node<> {
     T value;
 
@@ -66,18 +62,28 @@ class dynamic_arg_list {
 }  // namespace detail
 
 /**
- * A dynamic list of formatting arguments with storage.
- *
- * It can be implicitly converted into `fmt::basic_format_args` for passing
- * into type-erased formatting functions such as `fmt::vformat`.
+  \rst
+  A dynamic version of `fmt::format_arg_store`.
+  It's equipped with a storage to potentially temporary objects which lifetimes
+  could be shorter than the format arguments object.
+
+  It can be implicitly converted into `~fmt::basic_format_args` for passing
+  into type-erased formatting functions such as `~fmt::vformat`.
+  \endrst
  */
-template <typename Context> class dynamic_format_arg_store {
+template <typename Context>
+class dynamic_format_arg_store
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+    // Workaround a GCC template argument substitution bug.
+    : public basic_format_args<Context>
+#endif
+{
  private:
   using char_type = typename Context::char_type;
 
   template <typename T> struct need_copy {
     static constexpr detail::type mapped_type =
-        detail::mapped_type_constant<T, char_type>::value;
+        detail::mapped_type_constant<T, Context>::value;
 
     enum {
       value = !(detail::is_reference_wrapper<T>::value ||
@@ -90,7 +96,7 @@ template <typename Context> class dynamic_format_arg_store {
   };
 
   template <typename T>
-  using stored_t = conditional_t<
+  using stored_type = conditional_t<
       std::is_convertible<T, std::basic_string<char_type>>::value &&
           !detail::is_reference_wrapper<T>::value,
       std::basic_string<char_type>, T>;
@@ -105,72 +111,80 @@ template <typename Context> class dynamic_format_arg_store {
 
   friend class basic_format_args<Context>;
 
+  auto get_types() const -> unsigned long long {
+    return detail::is_unpacked_bit | data_.size() |
+           (named_info_.empty()
+                ? 0ULL
+                : static_cast<unsigned long long>(detail::has_named_args_bit));
+  }
+
   auto data() const -> const basic_format_arg<Context>* {
     return named_info_.empty() ? data_.data() : data_.data() + 1;
   }
 
   template <typename T> void emplace_arg(const T& arg) {
-    data_.emplace_back(arg);
+    data_.emplace_back(detail::make_arg<Context>(arg));
   }
 
   template <typename T>
   void emplace_arg(const detail::named_arg<char_type, T>& arg) {
-    if (named_info_.empty())
-      data_.insert(data_.begin(), basic_format_arg<Context>(nullptr, 0));
-    data_.emplace_back(detail::unwrap(arg.value));
+    if (named_info_.empty()) {
+      constexpr const detail::named_arg_info<char_type>* zero_ptr{nullptr};
+      data_.insert(data_.begin(), {zero_ptr, 0});
+    }
+    data_.emplace_back(detail::make_arg<Context>(detail::unwrap(arg.value)));
     auto pop_one = [](std::vector<basic_format_arg<Context>>* data) {
       data->pop_back();
     };
     std::unique_ptr<std::vector<basic_format_arg<Context>>, decltype(pop_one)>
         guard{&data_, pop_one};
     named_info_.push_back({arg.name, static_cast<int>(data_.size() - 2u)});
-    data_[0] = {named_info_.data(), named_info_.size()};
+    data_[0].value_.named_args = {named_info_.data(), named_info_.size()};
     guard.release();
   }
 
  public:
   constexpr dynamic_format_arg_store() = default;
 
-  operator basic_format_args<Context>() const {
-    return basic_format_args<Context>(data(), static_cast<int>(data_.size()),
-                                      !named_info_.empty());
-  }
-
   /**
-   * Adds an argument into the dynamic store for later passing to a formatting
-   * function.
-   *
-   * Note that custom types and string types (but not string views) are copied
-   * into the store dynamically allocating memory if necessary.
-   *
-   * **Example**:
-   *
-   *     fmt::dynamic_format_arg_store<fmt::format_context> store;
-   *     store.push_back(42);
-   *     store.push_back("abc");
-   *     store.push_back(1.5f);
-   *     std::string result = fmt::vformat("{} and {} and {}", store);
-   */
+    \rst
+    Adds an argument into the dynamic store for later passing to a formatting
+    function.
+
+    Note that custom types and string types (but not string views) are copied
+    into the store dynamically allocating memory if necessary.
+
+    **Example**::
+
+      fmt::dynamic_format_arg_store<fmt::format_context> store;
+      store.push_back(42);
+      store.push_back("abc");
+      store.push_back(1.5f);
+      std::string result = fmt::vformat("{} and {} and {}", store);
+    \endrst
+  */
   template <typename T> void push_back(const T& arg) {
     if (detail::const_check(need_copy<T>::value))
-      emplace_arg(dynamic_args_.push<stored_t<T>>(arg));
+      emplace_arg(dynamic_args_.push<stored_type<T>>(arg));
     else
       emplace_arg(detail::unwrap(arg));
   }
 
   /**
-   * Adds a reference to the argument into the dynamic store for later passing
-   * to a formatting function.
-   *
-   * **Example**:
-   *
-   *     fmt::dynamic_format_arg_store<fmt::format_context> store;
-   *     char band[] = "Rolling Stones";
-   *     store.push_back(std::cref(band));
-   *     band[9] = 'c'; // Changing str affects the output.
-   *     std::string result = fmt::vformat("{}", store);
-   *     // result == "Rolling Scones"
-   */
+    \rst
+    Adds a reference to the argument into the dynamic store for later passing to
+    a formatting function.
+
+    **Example**::
+
+      fmt::dynamic_format_arg_store<fmt::format_context> store;
+      char band[] = "Rolling Stones";
+      store.push_back(std::cref(band));
+      band[9] = 'c'; // Changing str affects the output.
+      std::string result = fmt::vformat("{}", store);
+      // result == "Rolling Scones"
+    \endrst
+  */
   template <typename T> void push_back(std::reference_wrapper<T> arg) {
     static_assert(
         need_copy<T>::value,
@@ -179,40 +193,41 @@ template <typename Context> class dynamic_format_arg_store {
   }
 
   /**
-   * Adds named argument into the dynamic store for later passing to a
-   * formatting function. `std::reference_wrapper` is supported to avoid
-   * copying of the argument. The name is always copied into the store.
-   */
+    Adds named argument into the dynamic store for later passing to a formatting
+    function. ``std::reference_wrapper`` is supported to avoid copying of the
+    argument. The name is always copied into the store.
+  */
   template <typename T>
   void push_back(const detail::named_arg<char_type, T>& arg) {
     const char_type* arg_name =
         dynamic_args_.push<std::basic_string<char_type>>(arg.name).c_str();
     if (detail::const_check(need_copy<T>::value)) {
       emplace_arg(
-          fmt::arg(arg_name, dynamic_args_.push<stored_t<T>>(arg.value)));
+          fmt::arg(arg_name, dynamic_args_.push<stored_type<T>>(arg.value)));
     } else {
       emplace_arg(fmt::arg(arg_name, arg.value));
     }
   }
 
-  /// Erase all elements from the store.
+  /** Erase all elements from the store */
   void clear() {
     data_.clear();
     named_info_.clear();
-    dynamic_args_ = {};
+    dynamic_args_ = detail::dynamic_arg_list();
   }
 
-  /// Reserves space to store at least `new_cap` arguments including
-  /// `new_cap_named` named arguments.
+  /**
+    \rst
+    Reserves space to store at least *new_cap* arguments including
+    *new_cap_named* named arguments.
+    \endrst
+  */
   void reserve(size_t new_cap, size_t new_cap_named) {
     FMT_ASSERT(new_cap >= new_cap_named,
-               "set of arguments includes set of named arguments");
+               "Set of arguments includes set of named arguments");
     data_.reserve(new_cap);
     named_info_.reserve(new_cap_named);
   }
-
-  /// Returns the number of elements in the store.
-  size_t size() const noexcept { return data_.size(); }
 };
 
 FMT_END_NAMESPACE
diff --git a/src/fmt/base.h b/src/fmt/base.h
deleted file mode 100644
index efa957d8d2..0000000000
--- a/src/fmt/base.h
+++ /dev/null
@@ -1,2970 +0,0 @@
-// Formatting library for C++ - the base API for char/UTF-8
-//
-// Copyright (c) 2012 - present, Victor Zverovich
-// All rights reserved.
-//
-// For the license information refer to format.h.
-
-#ifndef FMT_BASE_H_
-#define FMT_BASE_H_
-
-#if defined(FMT_IMPORT_STD) && !defined(FMT_MODULE)
-#  define FMT_MODULE
-#endif
-
-#ifndef FMT_MODULE
-#  include <limits.h>  // CHAR_BIT
-#  include <stdio.h>   // FILE
-#  include <string.h>  // memcmp
-
-#  include <type_traits>  // std::enable_if
-#endif
-
-// The fmt library version in the form major * 10000 + minor * 100 + patch.
-#define FMT_VERSION 110102
-
-// Detect compiler versions.
-#if defined(__clang__) && !defined(__ibmxl__)
-#  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
-#else
-#  define FMT_CLANG_VERSION 0
-#endif
-#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
-#  define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-#else
-#  define FMT_GCC_VERSION 0
-#endif
-#if defined(__ICL)
-#  define FMT_ICC_VERSION __ICL
-#elif defined(__INTEL_COMPILER)
-#  define FMT_ICC_VERSION __INTEL_COMPILER
-#else
-#  define FMT_ICC_VERSION 0
-#endif
-#if defined(_MSC_VER)
-#  define FMT_MSC_VERSION _MSC_VER
-#else
-#  define FMT_MSC_VERSION 0
-#endif
-
-// Detect standard library versions.
-#ifdef _GLIBCXX_RELEASE
-#  define FMT_GLIBCXX_RELEASE _GLIBCXX_RELEASE
-#else
-#  define FMT_GLIBCXX_RELEASE 0
-#endif
-#ifdef _LIBCPP_VERSION
-#  define FMT_LIBCPP_VERSION _LIBCPP_VERSION
-#else
-#  define FMT_LIBCPP_VERSION 0
-#endif
-
-#ifdef _MSVC_LANG
-#  define FMT_CPLUSPLUS _MSVC_LANG
-#else
-#  define FMT_CPLUSPLUS __cplusplus
-#endif
-
-// Detect __has_*.
-#ifdef __has_feature
-#  define FMT_HAS_FEATURE(x) __has_feature(x)
-#else
-#  define FMT_HAS_FEATURE(x) 0
-#endif
-#ifdef __has_include
-#  define FMT_HAS_INCLUDE(x) __has_include(x)
-#else
-#  define FMT_HAS_INCLUDE(x) 0
-#endif
-#ifdef __has_builtin
-#  define FMT_HAS_BUILTIN(x) __has_builtin(x)
-#else
-#  define FMT_HAS_BUILTIN(x) 0
-#endif
-#ifdef __has_cpp_attribute
-#  define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
-#else
-#  define FMT_HAS_CPP_ATTRIBUTE(x) 0
-#endif
-
-#define FMT_HAS_CPP14_ATTRIBUTE(attribute) \
-  (FMT_CPLUSPLUS >= 201402L && FMT_HAS_CPP_ATTRIBUTE(attribute))
-
-#define FMT_HAS_CPP17_ATTRIBUTE(attribute) \
-  (FMT_CPLUSPLUS >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute))
-
-// Detect C++14 relaxed constexpr.
-#ifdef FMT_USE_CONSTEXPR
-// Use the provided definition.
-#elif FMT_GCC_VERSION >= 600 && FMT_CPLUSPLUS >= 201402L
-// GCC only allows throw in constexpr since version 6:
-// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67371.
-#  define FMT_USE_CONSTEXPR 1
-#elif FMT_ICC_VERSION
-#  define FMT_USE_CONSTEXPR 0  // https://github.com/fmtlib/fmt/issues/1628
-#elif FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VERSION >= 1912
-#  define FMT_USE_CONSTEXPR 1
-#else
-#  define FMT_USE_CONSTEXPR 0
-#endif
-#if FMT_USE_CONSTEXPR
-#  define FMT_CONSTEXPR constexpr
-#else
-#  define FMT_CONSTEXPR
-#endif
-
-// Detect consteval, C++20 constexpr extensions and std::is_constant_evaluated.
-// LAMMPS customization. Using consteval is not compatible with how we use libfmt
-#if 1
-#  define FMT_USE_CONSTEVAL 0
-#elif !defined(__cpp_lib_is_constant_evaluated)
-#  define FMT_USE_CONSTEVAL 0
-#elif FMT_CPLUSPLUS < 201709L
-#  define FMT_USE_CONSTEVAL 0
-#elif FMT_GLIBCXX_RELEASE && FMT_GLIBCXX_RELEASE < 10
-#  define FMT_USE_CONSTEVAL 0
-#elif FMT_LIBCPP_VERSION && FMT_LIBCPP_VERSION < 10000
-#  define FMT_USE_CONSTEVAL 0
-#elif defined(__apple_build_version__) && __apple_build_version__ < 14000029L
-#  define FMT_USE_CONSTEVAL 0  // consteval is broken in Apple clang < 14.
-#elif FMT_MSC_VERSION && FMT_MSC_VERSION < 1929
-#  define FMT_USE_CONSTEVAL 0  // consteval is broken in MSVC VS2019 < 16.10.
-#elif defined(__cpp_consteval)
-#  define FMT_USE_CONSTEVAL 1
-#elif FMT_GCC_VERSION >= 1002 || FMT_CLANG_VERSION >= 1101
-#  define FMT_USE_CONSTEVAL 1
-#else
-#  define FMT_USE_CONSTEVAL 0
-#endif
-#if FMT_USE_CONSTEVAL
-#  define FMT_CONSTEVAL consteval
-#  define FMT_CONSTEXPR20 constexpr
-#else
-#  define FMT_CONSTEVAL
-#  define FMT_CONSTEXPR20
-#endif
-
-// Check if exceptions are disabled.
-#ifdef FMT_USE_EXCEPTIONS
-// Use the provided definition.
-#elif defined(__GNUC__) && !defined(__EXCEPTIONS)
-#  define FMT_USE_EXCEPTIONS 0
-#elif defined(__clang__) && !defined(__cpp_exceptions)
-#  define FMT_USE_EXCEPTIONS 0
-#elif FMT_MSC_VERSION && !_HAS_EXCEPTIONS
-#  define FMT_USE_EXCEPTIONS 0
-#else
-#  define FMT_USE_EXCEPTIONS 1
-#endif
-#if FMT_USE_EXCEPTIONS
-#  define FMT_TRY try
-#  define FMT_CATCH(x) catch (x)
-#else
-#  define FMT_TRY if (true)
-#  define FMT_CATCH(x) if (false)
-#endif
-
-#ifdef FMT_NO_UNIQUE_ADDRESS
-// Use the provided definition.
-#elif FMT_CPLUSPLUS < 202002L
-// Not supported.
-#elif FMT_HAS_CPP_ATTRIBUTE(no_unique_address)
-#  define FMT_NO_UNIQUE_ADDRESS [[no_unique_address]]
-// VS2019 v16.10 and later except clang-cl (https://reviews.llvm.org/D110485).
-#elif FMT_MSC_VERSION >= 1929 && !FMT_CLANG_VERSION
-#  define FMT_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]]
-#endif
-#ifndef FMT_NO_UNIQUE_ADDRESS
-#  define FMT_NO_UNIQUE_ADDRESS
-#endif
-
-#if FMT_HAS_CPP17_ATTRIBUTE(fallthrough)
-#  define FMT_FALLTHROUGH [[fallthrough]]
-#elif defined(__clang__)
-#  define FMT_FALLTHROUGH [[clang::fallthrough]]
-#elif FMT_GCC_VERSION >= 700 && \
-    (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520)
-#  define FMT_FALLTHROUGH [[gnu::fallthrough]]
-#else
-#  define FMT_FALLTHROUGH
-#endif
-
-// Disable [[noreturn]] on MSVC/NVCC because of bogus unreachable code warnings.
-#if FMT_HAS_CPP_ATTRIBUTE(noreturn) && !FMT_MSC_VERSION && !defined(__NVCC__)
-#  define FMT_NORETURN [[noreturn]]
-#else
-#  define FMT_NORETURN
-#endif
-
-#ifdef FMT_NODISCARD
-// Use the provided definition.
-#elif FMT_HAS_CPP17_ATTRIBUTE(nodiscard)
-#  define FMT_NODISCARD [[nodiscard]]
-#else
-#  define FMT_NODISCARD
-#endif
-
-#ifdef FMT_DEPRECATED
-// Use the provided definition.
-#elif FMT_HAS_CPP14_ATTRIBUTE(deprecated)
-#  define FMT_DEPRECATED [[deprecated]]
-#else
-#  define FMT_DEPRECATED /* deprecated */
-#endif
-
-#ifdef FMT_ALWAYS_INLINE
-// Use the provided definition.
-#elif FMT_GCC_VERSION || FMT_CLANG_VERSION
-#  define FMT_ALWAYS_INLINE inline __attribute__((always_inline))
-#else
-#  define FMT_ALWAYS_INLINE inline
-#endif
-// A version of FMT_ALWAYS_INLINE to prevent code bloat in debug mode.
-#ifdef NDEBUG
-#  define FMT_INLINE FMT_ALWAYS_INLINE
-#else
-#  define FMT_INLINE inline
-#endif
-
-#if FMT_GCC_VERSION || FMT_CLANG_VERSION
-#  define FMT_VISIBILITY(value) __attribute__((visibility(value)))
-#else
-#  define FMT_VISIBILITY(value)
-#endif
-
-// Detect pragmas.
-#define FMT_PRAGMA_IMPL(x) _Pragma(#x)
-#if FMT_GCC_VERSION >= 504 && !defined(__NVCOMPILER)
-// Workaround a _Pragma bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59884
-// and an nvhpc warning: https://github.com/fmtlib/fmt/pull/2582.
-#  define FMT_PRAGMA_GCC(x) FMT_PRAGMA_IMPL(GCC x)
-#else
-#  define FMT_PRAGMA_GCC(x)
-#endif
-#if FMT_CLANG_VERSION
-#  define FMT_PRAGMA_CLANG(x) FMT_PRAGMA_IMPL(clang x)
-#else
-#  define FMT_PRAGMA_CLANG(x)
-#endif
-#if FMT_MSC_VERSION
-#  define FMT_MSC_WARNING(...) __pragma(warning(__VA_ARGS__))
-#else
-#  define FMT_MSC_WARNING(...)
-#endif
-
-// LAMMPS customization
-// use 'v11_lmp' namespace instead of 'v11' so that our
-// bundled copy does not collide with linking other code
-// using system wide installations which may be using
-// a different version.
-
-#ifndef FMT_BEGIN_NAMESPACE
-#  define FMT_BEGIN_NAMESPACE \
-    namespace fmt {           \
-    inline namespace v11_lmp {
-#  define FMT_END_NAMESPACE \
-    }                       \
-    }
-#endif
-
-#ifndef FMT_EXPORT
-#  define FMT_EXPORT
-#  define FMT_BEGIN_EXPORT
-#  define FMT_END_EXPORT
-#endif
-
-#ifdef _WIN32
-#  define FMT_WIN32 1
-#else
-#  define FMT_WIN32 0
-#endif
-
-#if !defined(FMT_HEADER_ONLY) && FMT_WIN32
-#  if defined(FMT_LIB_EXPORT)
-#    define FMT_API __declspec(dllexport)
-#  elif defined(FMT_SHARED)
-#    define FMT_API __declspec(dllimport)
-#  endif
-#elif defined(FMT_LIB_EXPORT) || defined(FMT_SHARED)
-#  define FMT_API FMT_VISIBILITY("default")
-#endif
-#ifndef FMT_API
-#  define FMT_API
-#endif
-
-#ifndef FMT_OPTIMIZE_SIZE
-#  define FMT_OPTIMIZE_SIZE 0
-#endif
-
-// FMT_BUILTIN_TYPE=0 may result in smaller library size at the cost of higher
-// per-call binary size by passing built-in types through the extension API.
-#ifndef FMT_BUILTIN_TYPES
-#  define FMT_BUILTIN_TYPES 1
-#endif
-
-#define FMT_APPLY_VARIADIC(expr) \
-  using ignore = int[];          \
-  (void)ignore { 0, (expr, 0)... }
-
-// Enable minimal optimizations for more compact code in debug mode.
-FMT_PRAGMA_GCC(push_options)
-#if !defined(__OPTIMIZE__) && !defined(__CUDACC__)
-FMT_PRAGMA_GCC(optimize("Og"))
-#endif
-FMT_PRAGMA_CLANG(diagnostic push)
-
-FMT_BEGIN_NAMESPACE
-
-// Implementations of enable_if_t and other metafunctions for older systems.
-template <bool B, typename T = void>
-using enable_if_t = typename std::enable_if<B, T>::type;
-template <bool B, typename T, typename F>
-using conditional_t = typename std::conditional<B, T, F>::type;
-template <bool B> using bool_constant = std::integral_constant<bool, B>;
-template <typename T>
-using remove_reference_t = typename std::remove_reference<T>::type;
-template <typename T>
-using remove_const_t = typename std::remove_const<T>::type;
-template <typename T>
-using remove_cvref_t = typename std::remove_cv<remove_reference_t<T>>::type;
-template <typename T>
-using make_unsigned_t = typename std::make_unsigned<T>::type;
-template <typename T>
-using underlying_t = typename std::underlying_type<T>::type;
-template <typename T> using decay_t = typename std::decay<T>::type;
-using nullptr_t = decltype(nullptr);
-
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
-// A workaround for gcc 4.9 to make void_t work in a SFINAE context.
-template <typename...> struct void_t_impl {
-  using type = void;
-};
-template <typename... T> using void_t = typename void_t_impl<T...>::type;
-#else
-template <typename...> using void_t = void;
-#endif
-
-struct monostate {
-  constexpr monostate() {}
-};
-
-// An enable_if helper to be used in template parameters which results in much
-// shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed
-// to workaround a bug in MSVC 2019 (see #1140 and #1186).
-#ifdef FMT_DOC
-#  define FMT_ENABLE_IF(...)
-#else
-#  define FMT_ENABLE_IF(...) fmt::enable_if_t<(__VA_ARGS__), int> = 0
-#endif
-
-template <typename T> constexpr auto min_of(T a, T b) -> T {
-  return a < b ? a : b;
-}
-template <typename T> constexpr auto max_of(T a, T b) -> T {
-  return a > b ? a : b;
-}
-
-namespace detail {
-// Suppresses "unused variable" warnings with the method described in
-// https://herbsutter.com/2009/10/18/mailbag-shutting-up-compiler-warnings/.
-// (void)var does not work on many Intel compilers.
-template <typename... T> FMT_CONSTEXPR void ignore_unused(const T&...) {}
-
-constexpr auto is_constant_evaluated(bool default_value = false) noexcept
-    -> bool {
-// Workaround for incompatibility between clang 14 and libstdc++ consteval-based
-// std::is_constant_evaluated: https://github.com/fmtlib/fmt/issues/3247.
-#if FMT_CPLUSPLUS >= 202002L && FMT_GLIBCXX_RELEASE >= 12 && \
-    (FMT_CLANG_VERSION >= 1400 && FMT_CLANG_VERSION < 1500)
-  ignore_unused(default_value);
-  return __builtin_is_constant_evaluated();
-#elif defined(__cpp_lib_is_constant_evaluated)
-  ignore_unused(default_value);
-  return std::is_constant_evaluated();
-#else
-  return default_value;
-#endif
-}
-
-// Suppresses "conditional expression is constant" warnings.
-template <typename T> FMT_ALWAYS_INLINE constexpr auto const_check(T val) -> T {
-  return val;
-}
-
-FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
-                                      const char* message);
-
-#if defined(FMT_ASSERT)
-// Use the provided definition.
-#elif defined(NDEBUG)
-// FMT_ASSERT is not empty to avoid -Wempty-body.
-#  define FMT_ASSERT(condition, message) \
-    fmt::detail::ignore_unused((condition), (message))
-#else
-#  define FMT_ASSERT(condition, message)                                    \
-    ((condition) /* void() fails with -Winvalid-constexpr on clang 4.0.1 */ \
-         ? (void)0                                                          \
-         : fmt::detail::assert_fail(__FILE__, __LINE__, (message)))
-#endif
-
-#ifdef FMT_USE_INT128
-// Use the provided definition.
-#elif defined(__SIZEOF_INT128__) && !defined(__NVCC__) && \
-    !(FMT_CLANG_VERSION && FMT_MSC_VERSION)
-#  define FMT_USE_INT128 1
-using int128_opt = __int128_t;  // An optional native 128-bit integer.
-using uint128_opt = __uint128_t;
-inline auto map(int128_opt x) -> int128_opt { return x; }
-inline auto map(uint128_opt x) -> uint128_opt { return x; }
-#else
-#  define FMT_USE_INT128 0
-#endif
-#if !FMT_USE_INT128
-enum class int128_opt {};
-enum class uint128_opt {};
-// Reduce template instantiations.
-inline auto map(int128_opt) -> monostate { return {}; }
-inline auto map(uint128_opt) -> monostate { return {}; }
-#endif
-
-#ifndef FMT_USE_BITINT
-#  define FMT_USE_BITINT (FMT_CLANG_VERSION >= 1500)
-#endif
-
-#if FMT_USE_BITINT
-FMT_PRAGMA_CLANG(diagnostic ignored "-Wbit-int-extension")
-template <int N> using bitint = _BitInt(N);
-template <int N> using ubitint = unsigned _BitInt(N);
-#else
-template <int N> struct bitint {};
-template <int N> struct ubitint {};
-#endif  // FMT_USE_BITINT
-
-// Casts a nonnegative integer to unsigned.
-template <typename Int>
-FMT_CONSTEXPR auto to_unsigned(Int value) -> make_unsigned_t<Int> {
-#if 0
-  // LAMMPS customization: disable assertion to avoid bogus warnings
-  FMT_ASSERT(std::is_unsigned<Int>::value || value >= 0, "negative value");
-#endif
-  return static_cast<make_unsigned_t<Int>>(value);
-}
-
-template <typename Char>
-using unsigned_char = conditional_t<sizeof(Char) == 1, unsigned char, unsigned>;
-
-// A heuristic to detect std::string and std::[experimental::]string_view.
-// It is mainly used to avoid dependency on <[experimental/]string_view>.
-template <typename T, typename Enable = void>
-struct is_std_string_like : std::false_type {};
-template <typename T>
-struct is_std_string_like<T, void_t<decltype(std::declval<T>().find_first_of(
-                                 typename T::value_type(), 0))>>
-    : std::is_convertible<decltype(std::declval<T>().data()),
-                          const typename T::value_type*> {};
-
-// Check if the literal encoding is UTF-8.
-enum { is_utf8_enabled = "\u00A7"[1] == '\xA7' };
-enum { use_utf8 = !FMT_WIN32 || is_utf8_enabled };
-
-#ifndef FMT_UNICODE
-#  define FMT_UNICODE 1
-#endif
-
-static_assert(!FMT_UNICODE || use_utf8,
-              "Unicode support requires compiling with /utf-8");
-
-template <typename T> constexpr const char* narrow(const T*) { return nullptr; }
-constexpr FMT_ALWAYS_INLINE const char* narrow(const char* s) { return s; }
-
-template <typename Char>
-FMT_CONSTEXPR auto compare(const Char* s1, const Char* s2, std::size_t n)
-    -> int {
-  if (!is_constant_evaluated() && sizeof(Char) == 1) return memcmp(s1, s2, n);
-  for (; n != 0; ++s1, ++s2, --n) {
-    if (*s1 < *s2) return -1;
-    if (*s1 > *s2) return 1;
-  }
-  return 0;
-}
-
-namespace adl {
-using namespace std;
-
-template <typename Container>
-auto invoke_back_inserter()
-    -> decltype(back_inserter(std::declval<Container&>()));
-}  // namespace adl
-
-template <typename It, typename Enable = std::true_type>
-struct is_back_insert_iterator : std::false_type {};
-
-template <typename It>
-struct is_back_insert_iterator<
-    It, bool_constant<std::is_same<
-            decltype(adl::invoke_back_inserter<typename It::container_type>()),
-            It>::value>> : std::true_type {};
-
-// Extracts a reference to the container from *insert_iterator.
-template <typename OutputIt>
-inline FMT_CONSTEXPR20 auto get_container(OutputIt it) ->
-    typename OutputIt::container_type& {
-  struct accessor : OutputIt {
-    FMT_CONSTEXPR20 accessor(OutputIt base) : OutputIt(base) {}
-    using OutputIt::container;
-  };
-  return *accessor(it).container;
-}
-}  // namespace detail
-
-// Parsing-related public API and forward declarations.
-FMT_BEGIN_EXPORT
-
-/**
- * An implementation of `std::basic_string_view` for pre-C++17. It provides a
- * subset of the API. `fmt::basic_string_view` is used for format strings even
- * if `std::basic_string_view` is available to prevent issues when a library is
- * compiled with a different `-std` option than the client code (which is not
- * recommended).
- */
-template <typename Char> class basic_string_view {
- private:
-  const Char* data_;
-  size_t size_;
-
- public:
-  using value_type = Char;
-  using iterator = const Char*;
-
-  constexpr basic_string_view() noexcept : data_(nullptr), size_(0) {}
-
-  /// Constructs a string reference object from a C string and a size.
-  constexpr basic_string_view(const Char* s, size_t count) noexcept
-      : data_(s), size_(count) {}
-
-  constexpr basic_string_view(nullptr_t) = delete;
-
-  /// Constructs a string reference object from a C string.
-#if FMT_GCC_VERSION
-  FMT_ALWAYS_INLINE
-#endif
-  FMT_CONSTEXPR20 basic_string_view(const Char* s) : data_(s) {
-#if FMT_HAS_BUILTIN(__buitin_strlen) || FMT_GCC_VERSION || FMT_CLANG_VERSION
-    if (std::is_same<Char, char>::value) {
-      size_ = __builtin_strlen(detail::narrow(s));
-      return;
-    }
-#endif
-    size_t len = 0;
-    while (*s++) ++len;
-    size_ = len;
-  }
-
-  /// Constructs a string reference from a `std::basic_string` or a
-  /// `std::basic_string_view` object.
-  template <typename S,
-            FMT_ENABLE_IF(detail::is_std_string_like<S>::value&& std::is_same<
-                          typename S::value_type, Char>::value)>
-  FMT_CONSTEXPR basic_string_view(const S& s) noexcept
-      : data_(s.data()), size_(s.size()) {}
-
-  /// Returns a pointer to the string data.
-  constexpr auto data() const noexcept -> const Char* { return data_; }
-
-  /// Returns the string size.
-  constexpr auto size() const noexcept -> size_t { return size_; }
-
-  constexpr auto begin() const noexcept -> iterator { return data_; }
-  constexpr auto end() const noexcept -> iterator { return data_ + size_; }
-
-  constexpr auto operator[](size_t pos) const noexcept -> const Char& {
-    return data_[pos];
-  }
-
-  FMT_CONSTEXPR void remove_prefix(size_t n) noexcept {
-    data_ += n;
-    size_ -= n;
-  }
-
-  FMT_CONSTEXPR auto starts_with(basic_string_view<Char> sv) const noexcept
-      -> bool {
-    return size_ >= sv.size_ && detail::compare(data_, sv.data_, sv.size_) == 0;
-  }
-  FMT_CONSTEXPR auto starts_with(Char c) const noexcept -> bool {
-    return size_ >= 1 && *data_ == c;
-  }
-  FMT_CONSTEXPR auto starts_with(const Char* s) const -> bool {
-    return starts_with(basic_string_view<Char>(s));
-  }
-
-  // Lexicographically compare this string reference to other.
-  FMT_CONSTEXPR auto compare(basic_string_view other) const -> int {
-    int result =
-        detail::compare(data_, other.data_, min_of(size_, other.size_));
-    if (result != 0) return result;
-    return size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1);
-  }
-
-  FMT_CONSTEXPR friend auto operator==(basic_string_view lhs,
-                                       basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) == 0;
-  }
-  friend auto operator!=(basic_string_view lhs, basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) != 0;
-  }
-  friend auto operator<(basic_string_view lhs, basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) < 0;
-  }
-  friend auto operator<=(basic_string_view lhs, basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) <= 0;
-  }
-  friend auto operator>(basic_string_view lhs, basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) > 0;
-  }
-  friend auto operator>=(basic_string_view lhs, basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) >= 0;
-  }
-};
-
-using string_view = basic_string_view<char>;
-
-/// Specifies if `T` is an extended character type. Can be specialized by users.
-template <typename T> struct is_xchar : std::false_type {};
-template <> struct is_xchar<wchar_t> : std::true_type {};
-template <> struct is_xchar<char16_t> : std::true_type {};
-template <> struct is_xchar<char32_t> : std::true_type {};
-#ifdef __cpp_char8_t
-template <> struct is_xchar<char8_t> : std::true_type {};
-#endif
-
-// DEPRECATED! Will be replaced with an alias to prevent specializations.
-template <typename T> struct is_char : is_xchar<T> {};
-template <> struct is_char<char> : std::true_type {};
-
-template <typename T> class basic_appender;
-using appender = basic_appender<char>;
-
-// Checks whether T is a container with contiguous storage.
-template <typename T> struct is_contiguous : std::false_type {};
-
-class context;
-template <typename OutputIt, typename Char> class generic_context;
-template <typename Char> class parse_context;
-
-// Longer aliases for C++20 compatibility.
-template <typename Char> using basic_format_parse_context = parse_context<Char>;
-using format_parse_context = parse_context<char>;
-template <typename OutputIt, typename Char>
-using basic_format_context =
-    conditional_t<std::is_same<OutputIt, appender>::value, context,
-                  generic_context<OutputIt, Char>>;
-using format_context = context;
-
-template <typename Char>
-using buffered_context =
-    conditional_t<std::is_same<Char, char>::value, context,
-                  generic_context<basic_appender<Char>, Char>>;
-
-template <typename Context> class basic_format_arg;
-template <typename Context> class basic_format_args;
-
-// A separate type would result in shorter symbols but break ABI compatibility
-// between clang and gcc on ARM (#1919).
-using format_args = basic_format_args<context>;
-
-// A formatter for objects of type T.
-template <typename T, typename Char = char, typename Enable = void>
-struct formatter {
-  // A deleted default constructor indicates a disabled formatter.
-  formatter() = delete;
-};
-
-/// Reports a format error at compile time or, via a `format_error` exception,
-/// at runtime.
-// This function is intentionally not constexpr to give a compile-time error.
-FMT_NORETURN FMT_API void report_error(const char* message);
-
-enum class presentation_type : unsigned char {
-  // Common specifiers:
-  none = 0,
-  debug = 1,   // '?'
-  string = 2,  // 's' (string, bool)
-
-  // Integral, bool and character specifiers:
-  dec = 3,  // 'd'
-  hex,      // 'x' or 'X'
-  oct,      // 'o'
-  bin,      // 'b' or 'B'
-  chr,      // 'c'
-
-  // String and pointer specifiers:
-  pointer = 3,  // 'p'
-
-  // Floating-point specifiers:
-  exp = 1,  // 'e' or 'E' (1 since there is no FP debug presentation)
-  fixed,    // 'f' or 'F'
-  general,  // 'g' or 'G'
-  hexfloat  // 'a' or 'A'
-};
-
-enum class align { none, left, right, center, numeric };
-enum class sign { none, minus, plus, space };
-enum class arg_id_kind { none, index, name };
-
-// Basic format specifiers for built-in and string types.
-class basic_specs {
- private:
-  // Data is arranged as follows:
-  //
-  //  0                   1                   2                   3
-  //  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-  // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-  // |type |align| w | p | s |u|#|L|  f  |          unused           |
-  // +-----+-----+---+---+---+-+-+-+-----+---------------------------+
-  //
-  //   w - dynamic width info
-  //   p - dynamic precision info
-  //   s - sign
-  //   u - uppercase (e.g. 'X' for 'x')
-  //   # - alternate form ('#')
-  //   L - localized
-  //   f - fill size
-  //
-  // Bitfields are not used because of compiler bugs such as gcc bug 61414.
-  enum : unsigned {
-    type_mask = 0x00007,
-    align_mask = 0x00038,
-    width_mask = 0x000C0,
-    precision_mask = 0x00300,
-    sign_mask = 0x00C00,
-    uppercase_mask = 0x01000,
-    alternate_mask = 0x02000,
-    localized_mask = 0x04000,
-    fill_size_mask = 0x38000,
-
-    align_shift = 3,
-    width_shift = 6,
-    precision_shift = 8,
-    sign_shift = 10,
-    fill_size_shift = 15,
-
-    max_fill_size = 4
-  };
-
-  size_t data_ = 1 << fill_size_shift;
-
-  // Character (code unit) type is erased to prevent template bloat.
-  char fill_data_[max_fill_size] = {' '};
-
-  FMT_CONSTEXPR void set_fill_size(size_t size) {
-    data_ = (data_ & ~fill_size_mask) | (size << fill_size_shift);
-  }
-
- public:
-  constexpr auto type() const -> presentation_type {
-    return static_cast<presentation_type>(data_ & type_mask);
-  }
-  FMT_CONSTEXPR void set_type(presentation_type t) {
-    data_ = (data_ & ~type_mask) | static_cast<unsigned>(t);
-  }
-
-  constexpr auto align() const -> align {
-    return static_cast<fmt::align>((data_ & align_mask) >> align_shift);
-  }
-  FMT_CONSTEXPR void set_align(fmt::align a) {
-    data_ = (data_ & ~align_mask) | (static_cast<unsigned>(a) << align_shift);
-  }
-
-  constexpr auto dynamic_width() const -> arg_id_kind {
-    return static_cast<arg_id_kind>((data_ & width_mask) >> width_shift);
-  }
-  FMT_CONSTEXPR void set_dynamic_width(arg_id_kind w) {
-    data_ = (data_ & ~width_mask) | (static_cast<unsigned>(w) << width_shift);
-  }
-
-  FMT_CONSTEXPR auto dynamic_precision() const -> arg_id_kind {
-    return static_cast<arg_id_kind>((data_ & precision_mask) >>
-                                    precision_shift);
-  }
-  FMT_CONSTEXPR void set_dynamic_precision(arg_id_kind p) {
-    data_ = (data_ & ~precision_mask) |
-            (static_cast<unsigned>(p) << precision_shift);
-  }
-
-  constexpr bool dynamic() const {
-    return (data_ & (width_mask | precision_mask)) != 0;
-  }
-
-  constexpr auto sign() const -> sign {
-    return static_cast<fmt::sign>((data_ & sign_mask) >> sign_shift);
-  }
-  FMT_CONSTEXPR void set_sign(fmt::sign s) {
-    data_ = (data_ & ~sign_mask) | (static_cast<unsigned>(s) << sign_shift);
-  }
-
-  constexpr auto upper() const -> bool { return (data_ & uppercase_mask) != 0; }
-  FMT_CONSTEXPR void set_upper() { data_ |= uppercase_mask; }
-
-  constexpr auto alt() const -> bool { return (data_ & alternate_mask) != 0; }
-  FMT_CONSTEXPR void set_alt() { data_ |= alternate_mask; }
-  FMT_CONSTEXPR void clear_alt() { data_ &= ~alternate_mask; }
-
-  constexpr auto localized() const -> bool {
-    return (data_ & localized_mask) != 0;
-  }
-  FMT_CONSTEXPR void set_localized() { data_ |= localized_mask; }
-
-  constexpr auto fill_size() const -> size_t {
-    return (data_ & fill_size_mask) >> fill_size_shift;
-  }
-
-  template <typename Char, FMT_ENABLE_IF(std::is_same<Char, char>::value)>
-  constexpr auto fill() const -> const Char* {
-    return fill_data_;
-  }
-  template <typename Char, FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
-  constexpr auto fill() const -> const Char* {
-    return nullptr;
-  }
-
-  template <typename Char> constexpr auto fill_unit() const -> Char {
-    using uchar = unsigned char;
-    return static_cast<Char>(static_cast<uchar>(fill_data_[0]) |
-                             (static_cast<uchar>(fill_data_[1]) << 8) |
-                             (static_cast<uchar>(fill_data_[2]) << 16));
-  }
-
-  FMT_CONSTEXPR void set_fill(char c) {
-    fill_data_[0] = c;
-    set_fill_size(1);
-  }
-
-  template <typename Char>
-  FMT_CONSTEXPR void set_fill(basic_string_view<Char> s) {
-    auto size = s.size();
-    set_fill_size(size);
-    if (size == 1) {
-      unsigned uchar = static_cast<detail::unsigned_char<Char>>(s[0]);
-      fill_data_[0] = static_cast<char>(uchar);
-      fill_data_[1] = static_cast<char>(uchar >> 8);
-      fill_data_[2] = static_cast<char>(uchar >> 16);
-      return;
-    }
-    FMT_ASSERT(size <= max_fill_size, "invalid fill");
-    for (size_t i = 0; i < size; ++i)
-      fill_data_[i & 3] = static_cast<char>(s[i]);
-  }
-
-  FMT_CONSTEXPR void copy_fill_from(const basic_specs& specs) {
-    set_fill_size(specs.fill_size());
-    for (size_t i = 0; i < max_fill_size; ++i)
-      fill_data_[i] = specs.fill_data_[i];
-  }
-};
-
-// Format specifiers for built-in and string types.
-struct format_specs : basic_specs {
-  int width;
-  int precision;
-
-  constexpr format_specs() : width(0), precision(-1) {}
-};
-
-/**
- * Parsing context consisting of a format string range being parsed and an
- * argument counter for automatic indexing.
- */
-template <typename Char = char> class parse_context {
- private:
-  basic_string_view<Char> fmt_;
-  int next_arg_id_;
-
-  enum { use_constexpr_cast = !FMT_GCC_VERSION || FMT_GCC_VERSION >= 1200 };
-
-  FMT_CONSTEXPR void do_check_arg_id(int arg_id);
-
- public:
-  using char_type = Char;
-  using iterator = const Char*;
-
-  constexpr explicit parse_context(basic_string_view<Char> fmt,
-                                   int next_arg_id = 0)
-      : fmt_(fmt), next_arg_id_(next_arg_id) {}
-
-  /// Returns an iterator to the beginning of the format string range being
-  /// parsed.
-  constexpr auto begin() const noexcept -> iterator { return fmt_.begin(); }
-
-  /// Returns an iterator past the end of the format string range being parsed.
-  constexpr auto end() const noexcept -> iterator { return fmt_.end(); }
-
-  /// Advances the begin iterator to `it`.
-  FMT_CONSTEXPR void advance_to(iterator it) {
-    fmt_.remove_prefix(detail::to_unsigned(it - begin()));
-  }
-
-  /// Reports an error if using the manual argument indexing; otherwise returns
-  /// the next argument index and switches to the automatic indexing.
-  FMT_CONSTEXPR auto next_arg_id() -> int {
-    if (next_arg_id_ < 0) {
-      report_error("cannot switch from manual to automatic argument indexing");
-      return 0;
-    }
-    int id = next_arg_id_++;
-    do_check_arg_id(id);
-    return id;
-  }
-
-  /// Reports an error if using the automatic argument indexing; otherwise
-  /// switches to the manual indexing.
-  FMT_CONSTEXPR void check_arg_id(int id) {
-    if (next_arg_id_ > 0) {
-      report_error("cannot switch from automatic to manual argument indexing");
-      return;
-    }
-    next_arg_id_ = -1;
-    do_check_arg_id(id);
-  }
-  FMT_CONSTEXPR void check_arg_id(basic_string_view<Char>) {
-    next_arg_id_ = -1;
-  }
-  FMT_CONSTEXPR void check_dynamic_spec(int arg_id);
-};
-
-FMT_END_EXPORT
-
-namespace detail {
-
-// Constructs fmt::basic_string_view<Char> from types implicitly convertible
-// to it, deducing Char. Explicitly convertible types such as the ones returned
-// from FMT_STRING are intentionally excluded.
-template <typename Char, FMT_ENABLE_IF(is_char<Char>::value)>
-constexpr auto to_string_view(const Char* s) -> basic_string_view<Char> {
-  return s;
-}
-template <typename T, FMT_ENABLE_IF(is_std_string_like<T>::value)>
-constexpr auto to_string_view(const T& s)
-    -> basic_string_view<typename T::value_type> {
-  return s;
-}
-template <typename Char>
-constexpr auto to_string_view(basic_string_view<Char> s)
-    -> basic_string_view<Char> {
-  return s;
-}
-
-template <typename T, typename Enable = void>
-struct has_to_string_view : std::false_type {};
-// detail:: is intentional since to_string_view is not an extension point.
-template <typename T>
-struct has_to_string_view<
-    T, void_t<decltype(detail::to_string_view(std::declval<T>()))>>
-    : std::true_type {};
-
-/// String's character (code unit) type. detail:: is intentional to prevent ADL.
-template <typename S,
-          typename V = decltype(detail::to_string_view(std::declval<S>()))>
-using char_t = typename V::value_type;
-
-enum class type {
-  none_type,
-  // Integer types should go first,
-  int_type,
-  uint_type,
-  long_long_type,
-  ulong_long_type,
-  int128_type,
-  uint128_type,
-  bool_type,
-  char_type,
-  last_integer_type = char_type,
-  // followed by floating-point types.
-  float_type,
-  double_type,
-  long_double_type,
-  last_numeric_type = long_double_type,
-  cstring_type,
-  string_type,
-  pointer_type,
-  custom_type
-};
-
-// Maps core type T to the corresponding type enum constant.
-template <typename T, typename Char>
-struct type_constant : std::integral_constant<type, type::custom_type> {};
-
-#define FMT_TYPE_CONSTANT(Type, constant) \
-  template <typename Char>                \
-  struct type_constant<Type, Char>        \
-      : std::integral_constant<type, type::constant> {}
-
-FMT_TYPE_CONSTANT(int, int_type);
-FMT_TYPE_CONSTANT(unsigned, uint_type);
-FMT_TYPE_CONSTANT(long long, long_long_type);
-FMT_TYPE_CONSTANT(unsigned long long, ulong_long_type);
-FMT_TYPE_CONSTANT(int128_opt, int128_type);
-FMT_TYPE_CONSTANT(uint128_opt, uint128_type);
-FMT_TYPE_CONSTANT(bool, bool_type);
-FMT_TYPE_CONSTANT(Char, char_type);
-FMT_TYPE_CONSTANT(float, float_type);
-FMT_TYPE_CONSTANT(double, double_type);
-FMT_TYPE_CONSTANT(long double, long_double_type);
-FMT_TYPE_CONSTANT(const Char*, cstring_type);
-FMT_TYPE_CONSTANT(basic_string_view<Char>, string_type);
-FMT_TYPE_CONSTANT(const void*, pointer_type);
-
-constexpr auto is_integral_type(type t) -> bool {
-  return t > type::none_type && t <= type::last_integer_type;
-}
-constexpr auto is_arithmetic_type(type t) -> bool {
-  return t > type::none_type && t <= type::last_numeric_type;
-}
-
-constexpr auto set(type rhs) -> int { return 1 << static_cast<int>(rhs); }
-constexpr auto in(type t, int set) -> bool {
-  return ((set >> static_cast<int>(t)) & 1) != 0;
-}
-
-// Bitsets of types.
-enum {
-  sint_set =
-      set(type::int_type) | set(type::long_long_type) | set(type::int128_type),
-  uint_set = set(type::uint_type) | set(type::ulong_long_type) |
-             set(type::uint128_type),
-  bool_set = set(type::bool_type),
-  char_set = set(type::char_type),
-  float_set = set(type::float_type) | set(type::double_type) |
-              set(type::long_double_type),
-  string_set = set(type::string_type),
-  cstring_set = set(type::cstring_type),
-  pointer_set = set(type::pointer_type)
-};
-
-struct view {};
-
-template <typename Char, typename T> struct named_arg;
-template <typename T> struct is_named_arg : std::false_type {};
-template <typename T> struct is_static_named_arg : std::false_type {};
-
-template <typename Char, typename T>
-struct is_named_arg<named_arg<Char, T>> : std::true_type {};
-
-template <typename Char, typename T> struct named_arg : view {
-  const Char* name;
-  const T& value;
-
-  named_arg(const Char* n, const T& v) : name(n), value(v) {}
-  static_assert(!is_named_arg<T>::value, "nested named arguments");
-};
-
-template <bool B = false> constexpr auto count() -> int { return B ? 1 : 0; }
-template <bool B1, bool B2, bool... Tail> constexpr auto count() -> int {
-  return (B1 ? 1 : 0) + count<B2, Tail...>();
-}
-
-template <typename... Args> constexpr auto count_named_args() -> int {
-  return count<is_named_arg<Args>::value...>();
-}
-template <typename... Args> constexpr auto count_static_named_args() -> int {
-  return count<is_static_named_arg<Args>::value...>();
-}
-
-template <typename Char> struct named_arg_info {
-  const Char* name;
-  int id;
-};
-
-template <typename Char, typename T, FMT_ENABLE_IF(!is_named_arg<T>::value)>
-void init_named_arg(named_arg_info<Char>*, int& arg_index, int&, const T&) {
-  ++arg_index;
-}
-template <typename Char, typename T, FMT_ENABLE_IF(is_named_arg<T>::value)>
-void init_named_arg(named_arg_info<Char>* named_args, int& arg_index,
-                    int& named_arg_index, const T& arg) {
-  named_args[named_arg_index++] = {arg.name, arg_index++};
-}
-
-template <typename T, typename Char,
-          FMT_ENABLE_IF(!is_static_named_arg<T>::value)>
-FMT_CONSTEXPR void init_static_named_arg(named_arg_info<Char>*, int& arg_index,
-                                         int&) {
-  ++arg_index;
-}
-template <typename T, typename Char,
-          FMT_ENABLE_IF(is_static_named_arg<T>::value)>
-FMT_CONSTEXPR void init_static_named_arg(named_arg_info<Char>* named_args,
-                                         int& arg_index, int& named_arg_index) {
-  named_args[named_arg_index++] = {T::name, arg_index++};
-}
-
-// To minimize the number of types we need to deal with, long is translated
-// either to int or to long long depending on its size.
-enum { long_short = sizeof(long) == sizeof(int) };
-using long_type = conditional_t<long_short, int, long long>;
-using ulong_type = conditional_t<long_short, unsigned, unsigned long long>;
-
-template <typename T>
-using format_as_result =
-    remove_cvref_t<decltype(format_as(std::declval<const T&>()))>;
-template <typename T>
-using format_as_member_result =
-    remove_cvref_t<decltype(formatter<T>::format_as(std::declval<const T&>()))>;
-
-template <typename T, typename Enable = std::true_type>
-struct use_format_as : std::false_type {};
-// format_as member is only used to avoid injection into the std namespace.
-template <typename T, typename Enable = std::true_type>
-struct use_format_as_member : std::false_type {};
-
-// Only map owning types because mapping views can be unsafe.
-template <typename T>
-struct use_format_as<
-    T, bool_constant<std::is_arithmetic<format_as_result<T>>::value>>
-    : std::true_type {};
-template <typename T>
-struct use_format_as_member<
-    T, bool_constant<std::is_arithmetic<format_as_member_result<T>>::value>>
-    : std::true_type {};
-
-template <typename T, typename U = remove_const_t<T>>
-using use_formatter =
-    bool_constant<(std::is_class<T>::value || std::is_enum<T>::value ||
-                   std::is_union<T>::value || std::is_array<T>::value) &&
-                  !has_to_string_view<T>::value && !is_named_arg<T>::value &&
-                  !use_format_as<T>::value && !use_format_as_member<T>::value>;
-
-template <typename Char, typename T, typename U = remove_const_t<T>>
-auto has_formatter_impl(T* p, buffered_context<Char>* ctx = nullptr)
-    -> decltype(formatter<U, Char>().format(*p, *ctx), std::true_type());
-template <typename Char> auto has_formatter_impl(...) -> std::false_type;
-
-// T can be const-qualified to check if it is const-formattable.
-template <typename T, typename Char> constexpr auto has_formatter() -> bool {
-  return decltype(has_formatter_impl<Char>(static_cast<T*>(nullptr)))::value;
-}
-
-// Maps formatting argument types to natively supported types or user-defined
-// types with formatters. Returns void on errors to be SFINAE-friendly.
-template <typename Char> struct type_mapper {
-  static auto map(signed char) -> int;
-  static auto map(unsigned char) -> unsigned;
-  static auto map(short) -> int;
-  static auto map(unsigned short) -> unsigned;
-  static auto map(int) -> int;
-  static auto map(unsigned) -> unsigned;
-  static auto map(long) -> long_type;
-  static auto map(unsigned long) -> ulong_type;
-  static auto map(long long) -> long long;
-  static auto map(unsigned long long) -> unsigned long long;
-  static auto map(int128_opt) -> int128_opt;
-  static auto map(uint128_opt) -> uint128_opt;
-  static auto map(bool) -> bool;
-
-  template <int N>
-  static auto map(bitint<N>) -> conditional_t<N <= 64, long long, void>;
-  template <int N>
-  static auto map(ubitint<N>)
-      -> conditional_t<N <= 64, unsigned long long, void>;
-
-  template <typename T, FMT_ENABLE_IF(is_char<T>::value)>
-  static auto map(T) -> conditional_t<
-      std::is_same<T, char>::value || std::is_same<T, Char>::value, Char, void>;
-
-  static auto map(float) -> float;
-  static auto map(double) -> double;
-  static auto map(long double) -> long double;
-
-  static auto map(Char*) -> const Char*;
-  static auto map(const Char*) -> const Char*;
-  template <typename T, typename C = char_t<T>,
-            FMT_ENABLE_IF(!std::is_pointer<T>::value)>
-  static auto map(const T&) -> conditional_t<std::is_same<C, Char>::value,
-                                             basic_string_view<C>, void>;
-
-  static auto map(void*) -> const void*;
-  static auto map(const void*) -> const void*;
-  static auto map(volatile void*) -> const void*;
-  static auto map(const volatile void*) -> const void*;
-  static auto map(nullptr_t) -> const void*;
-  template <typename T, FMT_ENABLE_IF(std::is_pointer<T>::value ||
-                                      std::is_member_pointer<T>::value)>
-  static auto map(const T&) -> void;
-
-  template <typename T, FMT_ENABLE_IF(use_format_as<T>::value)>
-  static auto map(const T& x) -> decltype(map(format_as(x)));
-  template <typename T, FMT_ENABLE_IF(use_format_as_member<T>::value)>
-  static auto map(const T& x) -> decltype(map(formatter<T>::format_as(x)));
-
-  template <typename T, FMT_ENABLE_IF(use_formatter<T>::value)>
-  static auto map(T&) -> conditional_t<has_formatter<T, Char>(), T&, void>;
-
-  template <typename T, FMT_ENABLE_IF(is_named_arg<T>::value)>
-  static auto map(const T& named_arg) -> decltype(map(named_arg.value));
-};
-
-// detail:: is used to workaround a bug in MSVC 2017.
-template <typename T, typename Char>
-using mapped_t = decltype(detail::type_mapper<Char>::map(std::declval<T&>()));
-
-// A type constant after applying type_mapper.
-template <typename T, typename Char = char>
-using mapped_type_constant = type_constant<mapped_t<T, Char>, Char>;
-
-template <typename T, typename Context,
-          type TYPE =
-              mapped_type_constant<T, typename Context::char_type>::value>
-using stored_type_constant = std::integral_constant<
-    type, Context::builtin_types || TYPE == type::int_type ? TYPE
-                                                           : type::custom_type>;
-// A parse context with extra data used only in compile-time checks.
-template <typename Char>
-class compile_parse_context : public parse_context<Char> {
- private:
-  int num_args_;
-  const type* types_;
-  using base = parse_context<Char>;
-
- public:
-  FMT_CONSTEXPR explicit compile_parse_context(basic_string_view<Char> fmt,
-                                               int num_args, const type* types,
-                                               int next_arg_id = 0)
-      : base(fmt, next_arg_id), num_args_(num_args), types_(types) {}
-
-  constexpr auto num_args() const -> int { return num_args_; }
-  constexpr auto arg_type(int id) const -> type { return types_[id]; }
-
-  FMT_CONSTEXPR auto next_arg_id() -> int {
-    int id = base::next_arg_id();
-    if (id >= num_args_) report_error("argument not found");
-    return id;
-  }
-
-  FMT_CONSTEXPR void check_arg_id(int id) {
-    base::check_arg_id(id);
-    if (id >= num_args_) report_error("argument not found");
-  }
-  using base::check_arg_id;
-
-  FMT_CONSTEXPR void check_dynamic_spec(int arg_id) {
-    ignore_unused(arg_id);
-    if (arg_id < num_args_ && types_ && !is_integral_type(types_[arg_id]))
-      report_error("width/precision is not integer");
-  }
-};
-
-// An argument reference.
-template <typename Char> union arg_ref {
-  FMT_CONSTEXPR arg_ref(int idx = 0) : index(idx) {}
-  FMT_CONSTEXPR arg_ref(basic_string_view<Char> n) : name(n) {}
-
-  int index;
-  basic_string_view<Char> name;
-};
-
-// Format specifiers with width and precision resolved at formatting rather
-// than parsing time to allow reusing the same parsed specifiers with
-// different sets of arguments (precompilation of format strings).
-template <typename Char = char> struct dynamic_format_specs : format_specs {
-  arg_ref<Char> width_ref;
-  arg_ref<Char> precision_ref;
-};
-
-// Converts a character to ASCII. Returns '\0' on conversion failure.
-template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value)>
-constexpr auto to_ascii(Char c) -> char {
-  return c <= 0xff ? static_cast<char>(c) : '\0';
-}
-
-// Returns the number of code units in a code point or 1 on error.
-template <typename Char>
-FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
-  if (const_check(sizeof(Char) != 1)) return 1;
-  auto c = static_cast<unsigned char>(*begin);
-  return static_cast<int>((0x3a55000000000000ull >> (2 * (c >> 3))) & 3) + 1;
-}
-
-// Parses the range [begin, end) as an unsigned integer. This function assumes
-// that the range is non-empty and the first character is a digit.
-template <typename Char>
-FMT_CONSTEXPR auto parse_nonnegative_int(const Char*& begin, const Char* end,
-                                         int error_value) noexcept -> int {
-  FMT_ASSERT(begin != end && '0' <= *begin && *begin <= '9', "");
-  unsigned value = 0, prev = 0;
-  auto p = begin;
-  do {
-    prev = value;
-    value = value * 10 + unsigned(*p - '0');
-    ++p;
-  } while (p != end && '0' <= *p && *p <= '9');
-  auto num_digits = p - begin;
-  begin = p;
-  int digits10 = static_cast<int>(sizeof(int) * CHAR_BIT * 3 / 10);
-  if (num_digits <= digits10) return static_cast<int>(value);
-  // Check for overflow.
-  unsigned max = INT_MAX;
-  return num_digits == digits10 + 1 &&
-                 prev * 10ull + unsigned(p[-1] - '0') <= max
-             ? static_cast<int>(value)
-             : error_value;
-}
-
-FMT_CONSTEXPR inline auto parse_align(char c) -> align {
-  switch (c) {
-  case '<': return align::left;
-  case '>': return align::right;
-  case '^': return align::center;
-  }
-  return align::none;
-}
-
-template <typename Char> constexpr auto is_name_start(Char c) -> bool {
-  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_';
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR auto parse_arg_id(const Char* begin, const Char* end,
-                                Handler&& handler) -> const Char* {
-  Char c = *begin;
-  if (c >= '0' && c <= '9') {
-    int index = 0;
-    if (c != '0')
-      index = parse_nonnegative_int(begin, end, INT_MAX);
-    else
-      ++begin;
-    if (begin == end || (*begin != '}' && *begin != ':'))
-      report_error("invalid format string");
-    else
-      handler.on_index(index);
-    return begin;
-  }
-  if (FMT_OPTIMIZE_SIZE > 1 || !is_name_start(c)) {
-    report_error("invalid format string");
-    return begin;
-  }
-  auto it = begin;
-  do {
-    ++it;
-  } while (it != end && (is_name_start(*it) || ('0' <= *it && *it <= '9')));
-  handler.on_name({begin, to_unsigned(it - begin)});
-  return it;
-}
-
-template <typename Char> struct dynamic_spec_handler {
-  parse_context<Char>& ctx;
-  arg_ref<Char>& ref;
-  arg_id_kind& kind;
-
-  FMT_CONSTEXPR void on_index(int id) {
-    ref = id;
-    kind = arg_id_kind::index;
-    ctx.check_arg_id(id);
-    ctx.check_dynamic_spec(id);
-  }
-  FMT_CONSTEXPR void on_name(basic_string_view<Char> id) {
-    ref = id;
-    kind = arg_id_kind::name;
-    ctx.check_arg_id(id);
-  }
-};
-
-template <typename Char> struct parse_dynamic_spec_result {
-  const Char* end;
-  arg_id_kind kind;
-};
-
-// Parses integer | "{" [arg_id] "}".
-template <typename Char>
-FMT_CONSTEXPR auto parse_dynamic_spec(const Char* begin, const Char* end,
-                                      int& value, arg_ref<Char>& ref,
-                                      parse_context<Char>& ctx)
-    -> parse_dynamic_spec_result<Char> {
-  FMT_ASSERT(begin != end, "");
-  auto kind = arg_id_kind::none;
-  if ('0' <= *begin && *begin <= '9') {
-    int val = parse_nonnegative_int(begin, end, -1);
-    if (val == -1) report_error("number is too big");
-    value = val;
-  } else {
-    if (*begin == '{') {
-      ++begin;
-      if (begin != end) {
-        Char c = *begin;
-        if (c == '}' || c == ':') {
-          int id = ctx.next_arg_id();
-          ref = id;
-          kind = arg_id_kind::index;
-          ctx.check_dynamic_spec(id);
-        } else {
-          begin = parse_arg_id(begin, end,
-                               dynamic_spec_handler<Char>{ctx, ref, kind});
-        }
-      }
-      if (begin != end && *begin == '}') return {++begin, kind};
-    }
-    report_error("invalid format string");
-  }
-  return {begin, kind};
-}
-
-template <typename Char>
-FMT_CONSTEXPR auto parse_width(const Char* begin, const Char* end,
-                               format_specs& specs, arg_ref<Char>& width_ref,
-                               parse_context<Char>& ctx) -> const Char* {
-  auto result = parse_dynamic_spec(begin, end, specs.width, width_ref, ctx);
-  specs.set_dynamic_width(result.kind);
-  return result.end;
-}
-
-template <typename Char>
-FMT_CONSTEXPR auto parse_precision(const Char* begin, const Char* end,
-                                   format_specs& specs,
-                                   arg_ref<Char>& precision_ref,
-                                   parse_context<Char>& ctx) -> const Char* {
-  ++begin;
-  if (begin == end) {
-    report_error("invalid precision");
-    return begin;
-  }
-  auto result =
-      parse_dynamic_spec(begin, end, specs.precision, precision_ref, ctx);
-  specs.set_dynamic_precision(result.kind);
-  return result.end;
-}
-
-enum class state { start, align, sign, hash, zero, width, precision, locale };
-
-// Parses standard format specifiers.
-template <typename Char>
-FMT_CONSTEXPR auto parse_format_specs(const Char* begin, const Char* end,
-                                      dynamic_format_specs<Char>& specs,
-                                      parse_context<Char>& ctx, type arg_type)
-    -> const Char* {
-  auto c = '\0';
-  if (end - begin > 1) {
-    auto next = to_ascii(begin[1]);
-    c = parse_align(next) == align::none ? to_ascii(*begin) : '\0';
-  } else {
-    if (begin == end) return begin;
-    c = to_ascii(*begin);
-  }
-
-  struct {
-    state current_state = state::start;
-    FMT_CONSTEXPR void operator()(state s, bool valid = true) {
-      if (current_state >= s || !valid)
-        report_error("invalid format specifier");
-      current_state = s;
-    }
-  } enter_state;
-
-  using pres = presentation_type;
-  constexpr auto integral_set = sint_set | uint_set | bool_set | char_set;
-  struct {
-    const Char*& begin;
-    format_specs& specs;
-    type arg_type;
-
-    FMT_CONSTEXPR auto operator()(pres pres_type, int set) -> const Char* {
-      if (!in(arg_type, set)) report_error("invalid format specifier");
-      specs.set_type(pres_type);
-      return begin + 1;
-    }
-  } parse_presentation_type{begin, specs, arg_type};
-
-  for (;;) {
-    switch (c) {
-    case '<':
-    case '>':
-    case '^':
-      enter_state(state::align);
-      specs.set_align(parse_align(c));
-      ++begin;
-      break;
-    case '+':
-    case ' ':
-      specs.set_sign(c == ' ' ? sign::space : sign::plus);
-      FMT_FALLTHROUGH;
-    case '-':
-      enter_state(state::sign, in(arg_type, sint_set | float_set));
-      ++begin;
-      break;
-    case '#':
-      enter_state(state::hash, is_arithmetic_type(arg_type));
-      specs.set_alt();
-      ++begin;
-      break;
-    case '0':
-      enter_state(state::zero);
-      if (!is_arithmetic_type(arg_type))
-        report_error("format specifier requires numeric argument");
-      if (specs.align() == align::none) {
-        // Ignore 0 if align is specified for compatibility with std::format.
-        specs.set_align(align::numeric);
-        specs.set_fill('0');
-      }
-      ++begin;
-      break;
-      // clang-format off
-    case '1': case '2': case '3': case '4': case '5':
-    case '6': case '7': case '8': case '9': case '{':
-      // clang-format on
-      enter_state(state::width);
-      begin = parse_width(begin, end, specs, specs.width_ref, ctx);
-      break;
-    case '.':
-      enter_state(state::precision,
-                  in(arg_type, float_set | string_set | cstring_set));
-      begin = parse_precision(begin, end, specs, specs.precision_ref, ctx);
-      break;
-    case 'L':
-      enter_state(state::locale, is_arithmetic_type(arg_type));
-      specs.set_localized();
-      ++begin;
-      break;
-    case 'd': return parse_presentation_type(pres::dec, integral_set);
-    case 'X': specs.set_upper(); FMT_FALLTHROUGH;
-    case 'x': return parse_presentation_type(pres::hex, integral_set);
-    case 'o': return parse_presentation_type(pres::oct, integral_set);
-    case 'B': specs.set_upper(); FMT_FALLTHROUGH;
-    case 'b': return parse_presentation_type(pres::bin, integral_set);
-    case 'E': specs.set_upper(); FMT_FALLTHROUGH;
-    case 'e': return parse_presentation_type(pres::exp, float_set);
-    case 'F': specs.set_upper(); FMT_FALLTHROUGH;
-    case 'f': return parse_presentation_type(pres::fixed, float_set);
-    case 'G': specs.set_upper(); FMT_FALLTHROUGH;
-    case 'g': return parse_presentation_type(pres::general, float_set);
-    case 'A': specs.set_upper(); FMT_FALLTHROUGH;
-    case 'a': return parse_presentation_type(pres::hexfloat, float_set);
-    case 'c':
-      if (arg_type == type::bool_type) report_error("invalid format specifier");
-      return parse_presentation_type(pres::chr, integral_set);
-    case 's':
-      return parse_presentation_type(pres::string,
-                                     bool_set | string_set | cstring_set);
-    case 'p':
-      return parse_presentation_type(pres::pointer, pointer_set | cstring_set);
-    case '?':
-      return parse_presentation_type(pres::debug,
-                                     char_set | string_set | cstring_set);
-    case '}': return begin;
-    default:  {
-      if (*begin == '}') return begin;
-      // Parse fill and alignment.
-      auto fill_end = begin + code_point_length(begin);
-      if (end - fill_end <= 0) {
-        report_error("invalid format specifier");
-        return begin;
-      }
-      if (*begin == '{') {
-        report_error("invalid fill character '{'");
-        return begin;
-      }
-      auto alignment = parse_align(to_ascii(*fill_end));
-      enter_state(state::align, alignment != align::none);
-      specs.set_fill(
-          basic_string_view<Char>(begin, to_unsigned(fill_end - begin)));
-      specs.set_align(alignment);
-      begin = fill_end + 1;
-    }
-    }
-    if (begin == end) return begin;
-    c = to_ascii(*begin);
-  }
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR FMT_INLINE auto parse_replacement_field(const Char* begin,
-                                                      const Char* end,
-                                                      Handler&& handler)
-    -> const Char* {
-  ++begin;
-  if (begin == end) {
-    handler.on_error("invalid format string");
-    return end;
-  }
-  int arg_id = 0;
-  switch (*begin) {
-  case '}':
-    handler.on_replacement_field(handler.on_arg_id(), begin);
-    return begin + 1;
-  case '{': handler.on_text(begin, begin + 1); return begin + 1;
-  case ':': arg_id = handler.on_arg_id(); break;
-  default:  {
-    struct id_adapter {
-      Handler& handler;
-      int arg_id;
-
-      FMT_CONSTEXPR void on_index(int id) { arg_id = handler.on_arg_id(id); }
-      FMT_CONSTEXPR void on_name(basic_string_view<Char> id) {
-        arg_id = handler.on_arg_id(id);
-      }
-    } adapter = {handler, 0};
-    begin = parse_arg_id(begin, end, adapter);
-    arg_id = adapter.arg_id;
-    Char c = begin != end ? *begin : Char();
-    if (c == '}') {
-      handler.on_replacement_field(arg_id, begin);
-      return begin + 1;
-    }
-    if (c != ':') {
-      handler.on_error("missing '}' in format string");
-      return end;
-    }
-    break;
-  }
-  }
-  begin = handler.on_format_specs(arg_id, begin + 1, end);
-  if (begin == end || *begin != '}')
-    return handler.on_error("unknown format specifier"), end;
-  return begin + 1;
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR void parse_format_string(basic_string_view<Char> fmt,
-                                       Handler&& handler) {
-  auto begin = fmt.data(), end = begin + fmt.size();
-  auto p = begin;
-  while (p != end) {
-    auto c = *p++;
-    if (c == '{') {
-      handler.on_text(begin, p - 1);
-      begin = p = parse_replacement_field(p - 1, end, handler);
-    } else if (c == '}') {
-      if (p == end || *p != '}')
-        return handler.on_error("unmatched '}' in format string");
-      handler.on_text(begin, p);
-      begin = ++p;
-    }
-  }
-  handler.on_text(begin, end);
-}
-
-// Checks char specs and returns true iff the presentation type is char-like.
-FMT_CONSTEXPR inline auto check_char_specs(const format_specs& specs) -> bool {
-  auto type = specs.type();
-  if (type != presentation_type::none && type != presentation_type::chr &&
-      type != presentation_type::debug) {
-    return false;
-  }
-  if (specs.align() == align::numeric || specs.sign() != sign::none ||
-      specs.alt()) {
-    report_error("invalid format specifier for char");
-  }
-  return true;
-}
-
-// A base class for compile-time strings.
-struct compile_string {};
-
-template <typename T, typename Char>
-FMT_VISIBILITY("hidden")  // Suppress an ld warning on macOS (#3769).
-FMT_CONSTEXPR auto invoke_parse(parse_context<Char>& ctx) -> const Char* {
-  using mapped_type = remove_cvref_t<mapped_t<T, Char>>;
-  constexpr bool formattable =
-      std::is_constructible<formatter<mapped_type, Char>>::value;
-  if (!formattable) return ctx.begin();  // Error is reported in the value ctor.
-  using formatted_type = conditional_t<formattable, mapped_type, int>;
-  return formatter<formatted_type, Char>().parse(ctx);
-}
-
-template <typename... T> struct arg_pack {};
-
-template <typename Char, int NUM_ARGS, int NUM_NAMED_ARGS, bool DYNAMIC_NAMES>
-class format_string_checker {
- private:
-  type types_[max_of(1, NUM_ARGS)];
-  named_arg_info<Char> named_args_[max_of(1, NUM_NAMED_ARGS)];
-  compile_parse_context<Char> context_;
-
-  using parse_func = auto (*)(parse_context<Char>&) -> const Char*;
-  parse_func parse_funcs_[max_of(1, NUM_ARGS)];
-
- public:
-  template <typename... T>
-  FMT_CONSTEXPR explicit format_string_checker(basic_string_view<Char> fmt,
-                                               arg_pack<T...>)
-      : types_{mapped_type_constant<T, Char>::value...},
-        named_args_{},
-        context_(fmt, NUM_ARGS, types_),
-        parse_funcs_{&invoke_parse<T, Char>...} {
-    int arg_index = 0, named_arg_index = 0;
-    FMT_APPLY_VARIADIC(
-        init_static_named_arg<T>(named_args_, arg_index, named_arg_index));
-    ignore_unused(arg_index, named_arg_index);
-  }
-
-  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
-
-  FMT_CONSTEXPR auto on_arg_id() -> int { return context_.next_arg_id(); }
-  FMT_CONSTEXPR auto on_arg_id(int id) -> int {
-    context_.check_arg_id(id);
-    return id;
-  }
-  FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
-    for (int i = 0; i < NUM_NAMED_ARGS; ++i) {
-      if (named_args_[i].name == id) return named_args_[i].id;
-    }
-    if (!DYNAMIC_NAMES) on_error("argument not found");
-    return -1;
-  }
-
-  FMT_CONSTEXPR void on_replacement_field(int id, const Char* begin) {
-    on_format_specs(id, begin, begin);  // Call parse() on empty specs.
-  }
-
-  FMT_CONSTEXPR auto on_format_specs(int id, const Char* begin, const Char* end)
-      -> const Char* {
-    context_.advance_to(begin);
-    if (id >= 0 && id < NUM_ARGS) return parse_funcs_[id](context_);
-    while (begin != end && *begin != '}') ++begin;
-    return begin;
-  }
-
-  FMT_NORETURN FMT_CONSTEXPR void on_error(const char* message) {
-    report_error(message);
-  }
-};
-
-/// A contiguous memory buffer with an optional growing ability. It is an
-/// internal class and shouldn't be used directly, only via `memory_buffer`.
-template <typename T> class buffer {
- private:
-  T* ptr_;
-  size_t size_;
-  size_t capacity_;
-
-  using grow_fun = void (*)(buffer& buf, size_t capacity);
-  grow_fun grow_;
-
- protected:
-  // Don't initialize ptr_ since it is not accessed to save a few cycles.
-  FMT_MSC_WARNING(suppress : 26495)
-  FMT_CONSTEXPR buffer(grow_fun grow, size_t sz) noexcept
-      : size_(sz), capacity_(sz), grow_(grow) {}
-
-  constexpr buffer(grow_fun grow, T* p = nullptr, size_t sz = 0,
-                   size_t cap = 0) noexcept
-      : ptr_(p), size_(sz), capacity_(cap), grow_(grow) {}
-
-  FMT_CONSTEXPR20 ~buffer() = default;
-  buffer(buffer&&) = default;
-
-  /// Sets the buffer data and capacity.
-  FMT_CONSTEXPR void set(T* buf_data, size_t buf_capacity) noexcept {
-    ptr_ = buf_data;
-    capacity_ = buf_capacity;
-  }
-
- public:
-  using value_type = T;
-  using const_reference = const T&;
-
-  buffer(const buffer&) = delete;
-  void operator=(const buffer&) = delete;
-
-  auto begin() noexcept -> T* { return ptr_; }
-  auto end() noexcept -> T* { return ptr_ + size_; }
-
-  auto begin() const noexcept -> const T* { return ptr_; }
-  auto end() const noexcept -> const T* { return ptr_ + size_; }
-
-  /// Returns the size of this buffer.
-  constexpr auto size() const noexcept -> size_t { return size_; }
-
-  /// Returns the capacity of this buffer.
-  constexpr auto capacity() const noexcept -> size_t { return capacity_; }
-
-  /// Returns a pointer to the buffer data (not null-terminated).
-  FMT_CONSTEXPR auto data() noexcept -> T* { return ptr_; }
-  FMT_CONSTEXPR auto data() const noexcept -> const T* { return ptr_; }
-
-  /// Clears this buffer.
-  FMT_CONSTEXPR void clear() { size_ = 0; }
-
-  // Tries resizing the buffer to contain `count` elements. If T is a POD type
-  // the new elements may not be initialized.
-  FMT_CONSTEXPR void try_resize(size_t count) {
-    try_reserve(count);
-    size_ = min_of(count, capacity_);
-  }
-
-  // Tries increasing the buffer capacity to `new_capacity`. It can increase the
-  // capacity by a smaller amount than requested but guarantees there is space
-  // for at least one additional element either by increasing the capacity or by
-  // flushing the buffer if it is full.
-  FMT_CONSTEXPR void try_reserve(size_t new_capacity) {
-    if (new_capacity > capacity_) grow_(*this, new_capacity);
-  }
-
-  FMT_CONSTEXPR void push_back(const T& value) {
-    try_reserve(size_ + 1);
-    ptr_[size_++] = value;
-  }
-
-  /// Appends data to the end of the buffer.
-  template <typename U>
-// Workaround for MSVC2019 to fix error C2893: Failed to specialize function
-// template 'void fmt::v11::detail::buffer<T>::append(const U *,const U *)'.
-#if !FMT_MSC_VERSION || FMT_MSC_VERSION >= 1940
-  FMT_CONSTEXPR20
-#endif
-      void
-      append(const U* begin, const U* end) {
-    while (begin != end) {
-      auto count = to_unsigned(end - begin);
-      try_reserve(size_ + count);
-      auto free_cap = capacity_ - size_;
-      if (free_cap < count) count = free_cap;
-      // A loop is faster than memcpy on small sizes.
-      T* out = ptr_ + size_;
-      for (size_t i = 0; i < count; ++i) out[i] = begin[i];
-      size_ += count;
-      begin += count;
-    }
-  }
-
-  template <typename Idx> FMT_CONSTEXPR auto operator[](Idx index) -> T& {
-    return ptr_[index];
-  }
-  template <typename Idx>
-  FMT_CONSTEXPR auto operator[](Idx index) const -> const T& {
-    return ptr_[index];
-  }
-};
-
-struct buffer_traits {
-  constexpr explicit buffer_traits(size_t) {}
-  constexpr auto count() const -> size_t { return 0; }
-  constexpr auto limit(size_t size) const -> size_t { return size; }
-};
-
-class fixed_buffer_traits {
- private:
-  size_t count_ = 0;
-  size_t limit_;
-
- public:
-  constexpr explicit fixed_buffer_traits(size_t limit) : limit_(limit) {}
-  constexpr auto count() const -> size_t { return count_; }
-  FMT_CONSTEXPR auto limit(size_t size) -> size_t {
-    size_t n = limit_ > count_ ? limit_ - count_ : 0;
-    count_ += size;
-    return min_of(size, n);
-  }
-};
-
-// A buffer that writes to an output iterator when flushed.
-template <typename OutputIt, typename T, typename Traits = buffer_traits>
-class iterator_buffer : public Traits, public buffer<T> {
- private:
-  OutputIt out_;
-  enum { buffer_size = 256 };
-  T data_[buffer_size];
-
-  static FMT_CONSTEXPR void grow(buffer<T>& buf, size_t) {
-    if (buf.size() == buffer_size) static_cast<iterator_buffer&>(buf).flush();
-  }
-
-  void flush() {
-    auto size = this->size();
-    this->clear();
-    const T* begin = data_;
-    const T* end = begin + this->limit(size);
-    while (begin != end) *out_++ = *begin++;
-  }
-
- public:
-  explicit iterator_buffer(OutputIt out, size_t n = buffer_size)
-      : Traits(n), buffer<T>(grow, data_, 0, buffer_size), out_(out) {}
-  iterator_buffer(iterator_buffer&& other) noexcept
-      : Traits(other),
-        buffer<T>(grow, data_, 0, buffer_size),
-        out_(other.out_) {}
-  ~iterator_buffer() {
-    // Don't crash if flush fails during unwinding.
-    FMT_TRY { flush(); }
-    FMT_CATCH(...) {}
-  }
-
-  auto out() -> OutputIt {
-    flush();
-    return out_;
-  }
-  auto count() const -> size_t { return Traits::count() + this->size(); }
-};
-
-template <typename T>
-class iterator_buffer<T*, T, fixed_buffer_traits> : public fixed_buffer_traits,
-                                                    public buffer<T> {
- private:
-  T* out_;
-  enum { buffer_size = 256 };
-  T data_[buffer_size];
-
-  static FMT_CONSTEXPR void grow(buffer<T>& buf, size_t) {
-    if (buf.size() == buf.capacity())
-      static_cast<iterator_buffer&>(buf).flush();
-  }
-
-  void flush() {
-    size_t n = this->limit(this->size());
-    if (this->data() == out_) {
-      out_ += n;
-      this->set(data_, buffer_size);
-    }
-    this->clear();
-  }
-
- public:
-  explicit iterator_buffer(T* out, size_t n = buffer_size)
-      : fixed_buffer_traits(n), buffer<T>(grow, out, 0, n), out_(out) {}
-  iterator_buffer(iterator_buffer&& other) noexcept
-      : fixed_buffer_traits(other),
-        buffer<T>(static_cast<iterator_buffer&&>(other)),
-        out_(other.out_) {
-    if (this->data() != out_) {
-      this->set(data_, buffer_size);
-      this->clear();
-    }
-  }
-  ~iterator_buffer() { flush(); }
-
-  auto out() -> T* {
-    flush();
-    return out_;
-  }
-  auto count() const -> size_t {
-    return fixed_buffer_traits::count() + this->size();
-  }
-};
-
-template <typename T> class iterator_buffer<T*, T> : public buffer<T> {
- public:
-  explicit iterator_buffer(T* out, size_t = 0)
-      : buffer<T>([](buffer<T>&, size_t) {}, out, 0, ~size_t()) {}
-
-  auto out() -> T* { return &*this->end(); }
-};
-
-template <typename Container>
-class container_buffer : public buffer<typename Container::value_type> {
- private:
-  using value_type = typename Container::value_type;
-
-  static FMT_CONSTEXPR void grow(buffer<value_type>& buf, size_t capacity) {
-    auto& self = static_cast<container_buffer&>(buf);
-    self.container.resize(capacity);
-    self.set(&self.container[0], capacity);
-  }
-
- public:
-  Container& container;
-
-  explicit container_buffer(Container& c)
-      : buffer<value_type>(grow, c.size()), container(c) {}
-};
-
-// A buffer that writes to a container with the contiguous storage.
-template <typename OutputIt>
-class iterator_buffer<
-    OutputIt,
-    enable_if_t<is_back_insert_iterator<OutputIt>::value &&
-                    is_contiguous<typename OutputIt::container_type>::value,
-                typename OutputIt::container_type::value_type>>
-    : public container_buffer<typename OutputIt::container_type> {
- private:
-  using base = container_buffer<typename OutputIt::container_type>;
-
- public:
-  explicit iterator_buffer(typename OutputIt::container_type& c) : base(c) {}
-  explicit iterator_buffer(OutputIt out, size_t = 0)
-      : base(get_container(out)) {}
-
-  auto out() -> OutputIt { return OutputIt(this->container); }
-};
-
-// A buffer that counts the number of code units written discarding the output.
-template <typename T = char> class counting_buffer : public buffer<T> {
- private:
-  enum { buffer_size = 256 };
-  T data_[buffer_size];
-  size_t count_ = 0;
-
-  static FMT_CONSTEXPR void grow(buffer<T>& buf, size_t) {
-    if (buf.size() != buffer_size) return;
-    static_cast<counting_buffer&>(buf).count_ += buf.size();
-    buf.clear();
-  }
-
- public:
-  FMT_CONSTEXPR counting_buffer() : buffer<T>(grow, data_, 0, buffer_size) {}
-
-  constexpr auto count() const noexcept -> size_t {
-    return count_ + this->size();
-  }
-};
-
-template <typename T>
-struct is_back_insert_iterator<basic_appender<T>> : std::true_type {};
-
-template <typename OutputIt, typename InputIt, typename = void>
-struct has_back_insert_iterator_container_append : std::false_type {};
-template <typename OutputIt, typename InputIt>
-struct has_back_insert_iterator_container_append<
-    OutputIt, InputIt,
-    void_t<decltype(get_container(std::declval<OutputIt>())
-                        .append(std::declval<InputIt>(),
-                                std::declval<InputIt>()))>> : std::true_type {};
-
-// An optimized version of std::copy with the output value type (T).
-template <typename T, typename InputIt, typename OutputIt,
-          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value&&
-                            has_back_insert_iterator_container_append<
-                                OutputIt, InputIt>::value)>
-FMT_CONSTEXPR20 auto copy(InputIt begin, InputIt end, OutputIt out)
-    -> OutputIt {
-  get_container(out).append(begin, end);
-  return out;
-}
-
-template <typename T, typename InputIt, typename OutputIt,
-          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value &&
-                        !has_back_insert_iterator_container_append<
-                            OutputIt, InputIt>::value)>
-FMT_CONSTEXPR20 auto copy(InputIt begin, InputIt end, OutputIt out)
-    -> OutputIt {
-  auto& c = get_container(out);
-  c.insert(c.end(), begin, end);
-  return out;
-}
-
-template <typename T, typename InputIt, typename OutputIt,
-          FMT_ENABLE_IF(!is_back_insert_iterator<OutputIt>::value)>
-FMT_CONSTEXPR auto copy(InputIt begin, InputIt end, OutputIt out) -> OutputIt {
-  while (begin != end) *out++ = static_cast<T>(*begin++);
-  return out;
-}
-
-template <typename T, typename V, typename OutputIt>
-FMT_CONSTEXPR auto copy(basic_string_view<V> s, OutputIt out) -> OutputIt {
-  return copy<T>(s.begin(), s.end(), out);
-}
-
-template <typename It, typename Enable = std::true_type>
-struct is_buffer_appender : std::false_type {};
-template <typename It>
-struct is_buffer_appender<
-    It, bool_constant<
-            is_back_insert_iterator<It>::value &&
-            std::is_base_of<buffer<typename It::container_type::value_type>,
-                            typename It::container_type>::value>>
-    : std::true_type {};
-
-// Maps an output iterator to a buffer.
-template <typename T, typename OutputIt,
-          FMT_ENABLE_IF(!is_buffer_appender<OutputIt>::value)>
-auto get_buffer(OutputIt out) -> iterator_buffer<OutputIt, T> {
-  return iterator_buffer<OutputIt, T>(out);
-}
-template <typename T, typename OutputIt,
-          FMT_ENABLE_IF(is_buffer_appender<OutputIt>::value)>
-auto get_buffer(OutputIt out) -> buffer<T>& {
-  return get_container(out);
-}
-
-template <typename Buf, typename OutputIt>
-auto get_iterator(Buf& buf, OutputIt) -> decltype(buf.out()) {
-  return buf.out();
-}
-template <typename T, typename OutputIt>
-auto get_iterator(buffer<T>&, OutputIt out) -> OutputIt {
-  return out;
-}
-
-// This type is intentionally undefined, only used for errors.
-template <typename T, typename Char> struct type_is_unformattable_for;
-
-template <typename Char> struct string_value {
-  const Char* data;
-  size_t size;
-  auto str() const -> basic_string_view<Char> { return {data, size}; }
-};
-
-template <typename Context> struct custom_value {
-  using char_type = typename Context::char_type;
-  void* value;
-  void (*format)(void* arg, parse_context<char_type>& parse_ctx, Context& ctx);
-};
-
-template <typename Char> struct named_arg_value {
-  const named_arg_info<Char>* data;
-  size_t size;
-};
-
-struct custom_tag {};
-
-#if !FMT_BUILTIN_TYPES
-#  define FMT_BUILTIN , monostate
-#else
-#  define FMT_BUILTIN
-#endif
-
-// A formatting argument value.
-template <typename Context> class value {
- public:
-  using char_type = typename Context::char_type;
-
-  union {
-    monostate no_value;
-    int int_value;
-    unsigned uint_value;
-    long long long_long_value;
-    unsigned long long ulong_long_value;
-    int128_opt int128_value;
-    uint128_opt uint128_value;
-    bool bool_value;
-    char_type char_value;
-    float float_value;
-    double double_value;
-    long double long_double_value;
-    const void* pointer;
-    string_value<char_type> string;
-    custom_value<Context> custom;
-    named_arg_value<char_type> named_args;
-  };
-
-  constexpr FMT_INLINE value() : no_value() {}
-  constexpr FMT_INLINE value(signed char x) : int_value(x) {}
-  constexpr FMT_INLINE value(unsigned char x FMT_BUILTIN) : uint_value(x) {}
-  constexpr FMT_INLINE value(signed short x) : int_value(x) {}
-  constexpr FMT_INLINE value(unsigned short x FMT_BUILTIN) : uint_value(x) {}
-  constexpr FMT_INLINE value(int x) : int_value(x) {}
-  constexpr FMT_INLINE value(unsigned x FMT_BUILTIN) : uint_value(x) {}
-  FMT_CONSTEXPR FMT_INLINE value(long x FMT_BUILTIN) : value(long_type(x)) {}
-  FMT_CONSTEXPR FMT_INLINE value(unsigned long x FMT_BUILTIN)
-      : value(ulong_type(x)) {}
-  constexpr FMT_INLINE value(long long x FMT_BUILTIN) : long_long_value(x) {}
-  constexpr FMT_INLINE value(unsigned long long x FMT_BUILTIN)
-      : ulong_long_value(x) {}
-  FMT_INLINE value(int128_opt x FMT_BUILTIN) : int128_value(x) {}
-  FMT_INLINE value(uint128_opt x FMT_BUILTIN) : uint128_value(x) {}
-  constexpr FMT_INLINE value(bool x FMT_BUILTIN) : bool_value(x) {}
-
-  template <int N>
-  constexpr FMT_INLINE value(bitint<N> x FMT_BUILTIN) : long_long_value(x) {
-    static_assert(N <= 64, "unsupported _BitInt");
-  }
-  template <int N>
-  constexpr FMT_INLINE value(ubitint<N> x FMT_BUILTIN) : ulong_long_value(x) {
-    static_assert(N <= 64, "unsupported _BitInt");
-  }
-
-  template <typename T, FMT_ENABLE_IF(is_char<T>::value)>
-  constexpr FMT_INLINE value(T x FMT_BUILTIN) : char_value(x) {
-    static_assert(
-        std::is_same<T, char>::value || std::is_same<T, char_type>::value,
-        "mixing character types is disallowed");
-  }
-
-  constexpr FMT_INLINE value(float x FMT_BUILTIN) : float_value(x) {}
-  constexpr FMT_INLINE value(double x FMT_BUILTIN) : double_value(x) {}
-  FMT_INLINE value(long double x FMT_BUILTIN) : long_double_value(x) {}
-
-  FMT_CONSTEXPR FMT_INLINE value(char_type* x FMT_BUILTIN) {
-    string.data = x;
-    if (is_constant_evaluated()) string.size = 0;
-  }
-  FMT_CONSTEXPR FMT_INLINE value(const char_type* x FMT_BUILTIN) {
-    string.data = x;
-    if (is_constant_evaluated()) string.size = 0;
-  }
-  template <typename T, typename C = char_t<T>,
-            FMT_ENABLE_IF(!std::is_pointer<T>::value)>
-  FMT_CONSTEXPR value(const T& x FMT_BUILTIN) {
-    static_assert(std::is_same<C, char_type>::value,
-                  "mixing character types is disallowed");
-    auto sv = to_string_view(x);
-    string.data = sv.data();
-    string.size = sv.size();
-  }
-  FMT_INLINE value(void* x FMT_BUILTIN) : pointer(x) {}
-  FMT_INLINE value(const void* x FMT_BUILTIN) : pointer(x) {}
-  FMT_INLINE value(volatile void* x FMT_BUILTIN)
-      : pointer(const_cast<const void*>(x)) {}
-  FMT_INLINE value(const volatile void* x FMT_BUILTIN)
-      : pointer(const_cast<const void*>(x)) {}
-  FMT_INLINE value(nullptr_t) : pointer(nullptr) {}
-
-  template <typename T, FMT_ENABLE_IF(std::is_pointer<T>::value ||
-                                      std::is_member_pointer<T>::value)>
-  value(const T&) {
-    // Formatting of arbitrary pointers is disallowed. If you want to format a
-    // pointer cast it to `void*` or `const void*`. In particular, this forbids
-    // formatting of `[const] volatile char*` printed as bool by iostreams.
-    static_assert(sizeof(T) == 0,
-                  "formatting of non-void pointers is disallowed");
-  }
-
-  template <typename T, FMT_ENABLE_IF(use_format_as<T>::value)>
-  value(const T& x) : value(format_as(x)) {}
-  template <typename T, FMT_ENABLE_IF(use_format_as_member<T>::value)>
-  value(const T& x) : value(formatter<T>::format_as(x)) {}
-
-  template <typename T, FMT_ENABLE_IF(is_named_arg<T>::value)>
-  value(const T& named_arg) : value(named_arg.value) {}
-
-  template <typename T,
-            FMT_ENABLE_IF(use_formatter<T>::value || !FMT_BUILTIN_TYPES)>
-  FMT_CONSTEXPR20 FMT_INLINE value(T& x) : value(x, custom_tag()) {}
-
-  FMT_ALWAYS_INLINE value(const named_arg_info<char_type>* args, size_t size)
-      : named_args{args, size} {}
-
- private:
-  template <typename T, FMT_ENABLE_IF(has_formatter<T, char_type>())>
-  FMT_CONSTEXPR value(T& x, custom_tag) {
-    using value_type = remove_const_t<T>;
-    // T may overload operator& e.g. std::vector<bool>::reference in libc++.
-    if (!is_constant_evaluated()) {
-      custom.value =
-          const_cast<char*>(&reinterpret_cast<const volatile char&>(x));
-    } else {
-      custom.value = nullptr;
-#if defined(__cpp_if_constexpr)
-      if constexpr (std::is_same<decltype(&x), remove_reference_t<T>*>::value)
-        custom.value = const_cast<value_type*>(&x);
-#endif
-    }
-    custom.format = format_custom<value_type, formatter<value_type, char_type>>;
-  }
-
-  template <typename T, FMT_ENABLE_IF(!has_formatter<T, char_type>())>
-  FMT_CONSTEXPR value(const T&, custom_tag) {
-    // Cannot format an argument; to make type T formattable provide a
-    // formatter<T> specialization: https://fmt.dev/latest/api.html#udt.
-    type_is_unformattable_for<T, char_type> _;
-  }
-
-  // Formats an argument of a custom type, such as a user-defined class.
-  template <typename T, typename Formatter>
-  static void format_custom(void* arg, parse_context<char_type>& parse_ctx,
-                            Context& ctx) {
-    auto f = Formatter();
-    parse_ctx.advance_to(f.parse(parse_ctx));
-    using qualified_type =
-        conditional_t<has_formatter<const T, char_type>(), const T, T>;
-    // format must be const for compatibility with std::format and compilation.
-    const auto& cf = f;
-    ctx.advance_to(cf.format(*static_cast<qualified_type*>(arg), ctx));
-  }
-};
-
-enum { packed_arg_bits = 4 };
-// Maximum number of arguments with packed types.
-enum { max_packed_args = 62 / packed_arg_bits };
-enum : unsigned long long { is_unpacked_bit = 1ULL << 63 };
-enum : unsigned long long { has_named_args_bit = 1ULL << 62 };
-
-template <typename It, typename T, typename Enable = void>
-struct is_output_iterator : std::false_type {};
-
-template <> struct is_output_iterator<appender, char> : std::true_type {};
-
-template <typename It, typename T>
-struct is_output_iterator<
-    It, T,
-    void_t<decltype(*std::declval<decay_t<It>&>()++ = std::declval<T>())>>
-    : std::true_type {};
-
-#ifndef FMT_USE_LOCALE
-#  define FMT_USE_LOCALE (FMT_OPTIMIZE_SIZE <= 1)
-#endif
-
-// A type-erased reference to an std::locale to avoid a heavy <locale> include.
-struct locale_ref {
-#if FMT_USE_LOCALE
- private:
-  const void* locale_;  // A type-erased pointer to std::locale.
-
- public:
-  constexpr locale_ref() : locale_(nullptr) {}
-  template <typename Locale> locale_ref(const Locale& loc);
-
-  inline explicit operator bool() const noexcept { return locale_ != nullptr; }
-#endif  // FMT_USE_LOCALE
-
-  template <typename Locale> auto get() const -> Locale;
-};
-
-template <typename> constexpr auto encode_types() -> unsigned long long {
-  return 0;
-}
-
-template <typename Context, typename Arg, typename... Args>
-constexpr auto encode_types() -> unsigned long long {
-  return static_cast<unsigned>(stored_type_constant<Arg, Context>::value) |
-         (encode_types<Context, Args...>() << packed_arg_bits);
-}
-
-template <typename Context, typename... T, size_t NUM_ARGS = sizeof...(T)>
-constexpr auto make_descriptor() -> unsigned long long {
-  return NUM_ARGS <= max_packed_args ? encode_types<Context, T...>()
-                                     : is_unpacked_bit | NUM_ARGS;
-}
-
-template <typename Context, int NUM_ARGS>
-using arg_t = conditional_t<NUM_ARGS <= max_packed_args, value<Context>,
-                            basic_format_arg<Context>>;
-
-template <typename Context, int NUM_ARGS, int NUM_NAMED_ARGS,
-          unsigned long long DESC>
-struct named_arg_store {
-  // args_[0].named_args points to named_args to avoid bloating format_args.
-  arg_t<Context, NUM_ARGS> args[1 + NUM_ARGS];
-  named_arg_info<typename Context::char_type> named_args[NUM_NAMED_ARGS];
-
-  template <typename... T>
-  FMT_CONSTEXPR FMT_ALWAYS_INLINE named_arg_store(T&... values)
-      : args{{named_args, NUM_NAMED_ARGS}, values...} {
-    int arg_index = 0, named_arg_index = 0;
-    FMT_APPLY_VARIADIC(
-        init_named_arg(named_args, arg_index, named_arg_index, values));
-  }
-
-  named_arg_store(named_arg_store&& rhs) {
-    args[0] = {named_args, NUM_NAMED_ARGS};
-    for (size_t i = 1; i < sizeof(args) / sizeof(*args); ++i)
-      args[i] = rhs.args[i];
-    for (size_t i = 0; i < NUM_NAMED_ARGS; ++i)
-      named_args[i] = rhs.named_args[i];
-  }
-
-  named_arg_store(const named_arg_store& rhs) = delete;
-  named_arg_store& operator=(const named_arg_store& rhs) = delete;
-  named_arg_store& operator=(named_arg_store&& rhs) = delete;
-  operator const arg_t<Context, NUM_ARGS>*() const { return args + 1; }
-};
-
-// An array of references to arguments. It can be implicitly converted to
-// `basic_format_args` for passing into type-erased formatting functions
-// such as `vformat`. It is a plain struct to reduce binary size in debug mode.
-template <typename Context, int NUM_ARGS, int NUM_NAMED_ARGS,
-          unsigned long long DESC>
-struct format_arg_store {
-  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
-  using type =
-      conditional_t<NUM_NAMED_ARGS == 0,
-                    arg_t<Context, NUM_ARGS>[max_of(1, NUM_ARGS)],
-                    named_arg_store<Context, NUM_ARGS, NUM_NAMED_ARGS, DESC>>;
-  type args;
-};
-
-// TYPE can be different from type_constant<T>, e.g. for __float128.
-template <typename T, typename Char, type TYPE> struct native_formatter {
- private:
-  dynamic_format_specs<Char> specs_;
-
- public:
-  using nonlocking = void;
-
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    if (ctx.begin() == ctx.end() || *ctx.begin() == '}') return ctx.begin();
-    auto end = parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx, TYPE);
-    if (const_check(TYPE == type::char_type)) check_char_specs(specs_);
-    return end;
-  }
-
-  template <type U = TYPE,
-            FMT_ENABLE_IF(U == type::string_type || U == type::cstring_type ||
-                          U == type::char_type)>
-  FMT_CONSTEXPR void set_debug_format(bool set = true) {
-    specs_.set_type(set ? presentation_type::debug : presentation_type::none);
-  }
-
-  FMT_PRAGMA_CLANG(diagnostic ignored "-Wundefined-inline")
-  template <typename FormatContext>
-  FMT_CONSTEXPR auto format(const T& val, FormatContext& ctx) const
-      -> decltype(ctx.out());
-};
-
-template <typename T, typename Enable = void>
-struct locking
-    : bool_constant<mapped_type_constant<T>::value == type::custom_type> {};
-template <typename T>
-struct locking<T, void_t<typename formatter<remove_cvref_t<T>>::nonlocking>>
-    : std::false_type {};
-
-template <typename T = int> FMT_CONSTEXPR inline auto is_locking() -> bool {
-  return locking<T>::value;
-}
-template <typename T1, typename T2, typename... Tail>
-FMT_CONSTEXPR inline auto is_locking() -> bool {
-  return locking<T1>::value || is_locking<T2, Tail...>();
-}
-
-FMT_API void vformat_to(buffer<char>& buf, string_view fmt, format_args args,
-                        locale_ref loc = {});
-
-#if FMT_WIN32
-FMT_API void vprint_mojibake(FILE*, string_view, format_args, bool);
-#else  // format_args is passed by reference since it is defined later.
-inline void vprint_mojibake(FILE*, string_view, const format_args&, bool) {}
-#endif
-}  // namespace detail
-
-// The main public API.
-
-template <typename Char>
-FMT_CONSTEXPR void parse_context<Char>::do_check_arg_id(int arg_id) {
-  // Argument id is only checked at compile time during parsing because
-  // formatting has its own validation.
-  if (detail::is_constant_evaluated() && use_constexpr_cast) {
-    auto ctx = static_cast<detail::compile_parse_context<Char>*>(this);
-    if (arg_id >= ctx->num_args()) report_error("argument not found");
-  }
-}
-
-template <typename Char>
-FMT_CONSTEXPR void parse_context<Char>::check_dynamic_spec(int arg_id) {
-  using detail::compile_parse_context;
-  if (detail::is_constant_evaluated() && use_constexpr_cast)
-    static_cast<compile_parse_context<Char>*>(this)->check_dynamic_spec(arg_id);
-}
-
-FMT_BEGIN_EXPORT
-
-// An output iterator that appends to a buffer. It is used instead of
-// back_insert_iterator to reduce symbol sizes and avoid <iterator> dependency.
-template <typename T> class basic_appender {
- protected:
-  detail::buffer<T>* container;
-
- public:
-  using container_type = detail::buffer<T>;
-
-  FMT_CONSTEXPR basic_appender(detail::buffer<T>& buf) : container(&buf) {}
-
-  FMT_CONSTEXPR20 auto operator=(T c) -> basic_appender& {
-    container->push_back(c);
-    return *this;
-  }
-  FMT_CONSTEXPR20 auto operator*() -> basic_appender& { return *this; }
-  FMT_CONSTEXPR20 auto operator++() -> basic_appender& { return *this; }
-  FMT_CONSTEXPR20 auto operator++(int) -> basic_appender { return *this; }
-};
-
-// A formatting argument. Context is a template parameter for the compiled API
-// where output can be unbuffered.
-template <typename Context> class basic_format_arg {
- private:
-  detail::value<Context> value_;
-  detail::type type_;
-
-  friend class basic_format_args<Context>;
-
-  using char_type = typename Context::char_type;
-
- public:
-  class handle {
-   private:
-    detail::custom_value<Context> custom_;
-
-   public:
-    explicit handle(detail::custom_value<Context> custom) : custom_(custom) {}
-
-    void format(parse_context<char_type>& parse_ctx, Context& ctx) const {
-      custom_.format(custom_.value, parse_ctx, ctx);
-    }
-  };
-
-  constexpr basic_format_arg() : type_(detail::type::none_type) {}
-  basic_format_arg(const detail::named_arg_info<char_type>* args, size_t size)
-      : value_(args, size) {}
-  template <typename T>
-  basic_format_arg(T&& val)
-      : value_(val), type_(detail::stored_type_constant<T, Context>::value) {}
-
-  constexpr explicit operator bool() const noexcept {
-    return type_ != detail::type::none_type;
-  }
-  auto type() const -> detail::type { return type_; }
-
-  /**
-   * Visits an argument dispatching to the appropriate visit method based on
-   * the argument type. For example, if the argument type is `double` then
-   * `vis(value)` will be called with the value of type `double`.
-   */
-  template <typename Visitor>
-  FMT_CONSTEXPR FMT_INLINE auto visit(Visitor&& vis) const -> decltype(vis(0)) {
-    using detail::map;
-    switch (type_) {
-    case detail::type::none_type:        break;
-    case detail::type::int_type:         return vis(value_.int_value);
-    case detail::type::uint_type:        return vis(value_.uint_value);
-    case detail::type::long_long_type:   return vis(value_.long_long_value);
-    case detail::type::ulong_long_type:  return vis(value_.ulong_long_value);
-    case detail::type::int128_type:      return vis(map(value_.int128_value));
-    case detail::type::uint128_type:     return vis(map(value_.uint128_value));
-    case detail::type::bool_type:        return vis(value_.bool_value);
-    case detail::type::char_type:        return vis(value_.char_value);
-    case detail::type::float_type:       return vis(value_.float_value);
-    case detail::type::double_type:      return vis(value_.double_value);
-    case detail::type::long_double_type: return vis(value_.long_double_value);
-    case detail::type::cstring_type:     return vis(value_.string.data);
-    case detail::type::string_type:      return vis(value_.string.str());
-    case detail::type::pointer_type:     return vis(value_.pointer);
-    case detail::type::custom_type:      return vis(handle(value_.custom));
-    }
-    return vis(monostate());
-  }
-
-  auto format_custom(const char_type* parse_begin,
-                     parse_context<char_type>& parse_ctx, Context& ctx)
-      -> bool {
-    if (type_ != detail::type::custom_type) return false;
-    parse_ctx.advance_to(parse_begin);
-    value_.custom.format(value_.custom.value, parse_ctx, ctx);
-    return true;
-  }
-};
-
-/**
- * A view of a collection of formatting arguments. To avoid lifetime issues it
- * should only be used as a parameter type in type-erased functions such as
- * `vformat`:
- *
- *     void vlog(fmt::string_view fmt, fmt::format_args args);  // OK
- *     fmt::format_args args = fmt::make_format_args();  // Dangling reference
- */
-template <typename Context> class basic_format_args {
- private:
-  // A descriptor that contains information about formatting arguments.
-  // If the number of arguments is less or equal to max_packed_args then
-  // argument types are passed in the descriptor. This reduces binary code size
-  // per formatting function call.
-  unsigned long long desc_;
-  union {
-    // If is_packed() returns true then argument values are stored in values_;
-    // otherwise they are stored in args_. This is done to improve cache
-    // locality and reduce compiled code size since storing larger objects
-    // may require more code (at least on x86-64) even if the same amount of
-    // data is actually copied to stack. It saves ~10% on the bloat test.
-    const detail::value<Context>* values_;
-    const basic_format_arg<Context>* args_;
-  };
-
-  constexpr auto is_packed() const -> bool {
-    return (desc_ & detail::is_unpacked_bit) == 0;
-  }
-  constexpr auto has_named_args() const -> bool {
-    return (desc_ & detail::has_named_args_bit) != 0;
-  }
-
-  FMT_CONSTEXPR auto type(int index) const -> detail::type {
-    int shift = index * detail::packed_arg_bits;
-    unsigned mask = (1 << detail::packed_arg_bits) - 1;
-    return static_cast<detail::type>((desc_ >> shift) & mask);
-  }
-
-  template <int NUM_ARGS, int NUM_NAMED_ARGS, unsigned long long DESC>
-  using store =
-      detail::format_arg_store<Context, NUM_ARGS, NUM_NAMED_ARGS, DESC>;
-
- public:
-  using format_arg = basic_format_arg<Context>;
-
-  constexpr basic_format_args() : desc_(0), args_(nullptr) {}
-
-  /// Constructs a `basic_format_args` object from `format_arg_store`.
-  template <int NUM_ARGS, int NUM_NAMED_ARGS, unsigned long long DESC,
-            FMT_ENABLE_IF(NUM_ARGS <= detail::max_packed_args)>
-  constexpr FMT_ALWAYS_INLINE basic_format_args(
-      const store<NUM_ARGS, NUM_NAMED_ARGS, DESC>& s)
-      : desc_(DESC | (NUM_NAMED_ARGS != 0 ? +detail::has_named_args_bit : 0)),
-        values_(s.args) {}
-
-  template <int NUM_ARGS, int NUM_NAMED_ARGS, unsigned long long DESC,
-            FMT_ENABLE_IF(NUM_ARGS > detail::max_packed_args)>
-  constexpr basic_format_args(const store<NUM_ARGS, NUM_NAMED_ARGS, DESC>& s)
-      : desc_(DESC | (NUM_NAMED_ARGS != 0 ? +detail::has_named_args_bit : 0)),
-        args_(s.args) {}
-
-  /// Constructs a `basic_format_args` object from a dynamic list of arguments.
-  constexpr basic_format_args(const format_arg* args, int count,
-                              bool has_named = false)
-      : desc_(detail::is_unpacked_bit | detail::to_unsigned(count) |
-              (has_named ? +detail::has_named_args_bit : 0)),
-        args_(args) {}
-
-  /// Returns the argument with the specified id.
-  FMT_CONSTEXPR auto get(int id) const -> format_arg {
-    auto arg = format_arg();
-    if (!is_packed()) {
-      if (id < max_size()) arg = args_[id];
-      return arg;
-    }
-    if (static_cast<unsigned>(id) >= detail::max_packed_args) return arg;
-    arg.type_ = type(id);
-    if (arg.type_ != detail::type::none_type) arg.value_ = values_[id];
-    return arg;
-  }
-
-  template <typename Char>
-  auto get(basic_string_view<Char> name) const -> format_arg {
-    int id = get_id(name);
-    return id >= 0 ? get(id) : format_arg();
-  }
-
-  template <typename Char>
-  FMT_CONSTEXPR auto get_id(basic_string_view<Char> name) const -> int {
-    if (!has_named_args()) return -1;
-    const auto& named_args =
-        (is_packed() ? values_[-1] : args_[-1].value_).named_args;
-    for (size_t i = 0; i < named_args.size; ++i) {
-      if (named_args.data[i].name == name) return named_args.data[i].id;
-    }
-    return -1;
-  }
-
-  auto max_size() const -> int {
-    unsigned long long max_packed = detail::max_packed_args;
-    return static_cast<int>(is_packed() ? max_packed
-                                        : desc_ & ~detail::is_unpacked_bit);
-  }
-};
-
-// A formatting context.
-class context {
- private:
-  appender out_;
-  format_args args_;
-  FMT_NO_UNIQUE_ADDRESS detail::locale_ref loc_;
-
- public:
-  /// The character type for the output.
-  using char_type = char;
-
-  using iterator = appender;
-  using format_arg = basic_format_arg<context>;
-  using parse_context_type FMT_DEPRECATED = parse_context<>;
-  template <typename T> using formatter_type FMT_DEPRECATED = formatter<T>;
-  enum { builtin_types = FMT_BUILTIN_TYPES };
-
-  /// Constructs a `context` object. References to the arguments are stored
-  /// in the object so make sure they have appropriate lifetimes.
-  FMT_CONSTEXPR context(iterator out, format_args args,
-                        detail::locale_ref loc = {})
-      : out_(out), args_(args), loc_(loc) {}
-  context(context&&) = default;
-  context(const context&) = delete;
-  void operator=(const context&) = delete;
-
-  FMT_CONSTEXPR auto arg(int id) const -> format_arg { return args_.get(id); }
-  inline auto arg(string_view name) const -> format_arg {
-    return args_.get(name);
-  }
-  FMT_CONSTEXPR auto arg_id(string_view name) const -> int {
-    return args_.get_id(name);
-  }
-
-  // Returns an iterator to the beginning of the output range.
-  FMT_CONSTEXPR auto out() const -> iterator { return out_; }
-
-  // Advances the begin iterator to `it`.
-  FMT_CONSTEXPR void advance_to(iterator) {}
-
-  FMT_CONSTEXPR auto locale() const -> detail::locale_ref { return loc_; }
-};
-
-template <typename Char = char> struct runtime_format_string {
-  basic_string_view<Char> str;
-};
-
-/**
- * Creates a runtime format string.
- *
- * **Example**:
- *
- *     // Check format string at runtime instead of compile-time.
- *     fmt::print(fmt::runtime("{:d}"), "I am not a number");
- */
-inline auto runtime(string_view s) -> runtime_format_string<> { return {{s}}; }
-
-/// A compile-time format string. Use `format_string` in the public API to
-/// prevent type deduction.
-template <typename... T> struct fstring {
- private:
-  static constexpr int num_static_named_args =
-      detail::count_static_named_args<T...>();
-
-  using checker = detail::format_string_checker<
-      char, static_cast<int>(sizeof...(T)), num_static_named_args,
-      num_static_named_args != detail::count_named_args<T...>()>;
-
-  using arg_pack = detail::arg_pack<T...>;
-
- public:
-  string_view str;
-  using t = fstring;
-
-  // Reports a compile-time error if S is not a valid format string for T.
-  template <size_t N>
-  FMT_CONSTEVAL FMT_ALWAYS_INLINE fstring(const char (&s)[N]) : str(s, N - 1) {
-    using namespace detail;
-    static_assert(count<(std::is_base_of<view, remove_reference_t<T>>::value &&
-                         std::is_reference<T>::value)...>() == 0,
-                  "passing views as lvalues is disallowed");
-    if (FMT_USE_CONSTEVAL) parse_format_string<char>(s, checker(s, arg_pack()));
-#ifdef FMT_ENFORCE_COMPILE_STRING
-    static_assert(
-        FMT_USE_CONSTEVAL && sizeof(s) != 0,
-        "FMT_ENFORCE_COMPILE_STRING requires format strings to use FMT_STRING");
-#endif
-  }
-  template <typename S,
-            FMT_ENABLE_IF(std::is_convertible<const S&, string_view>::value)>
-  FMT_CONSTEVAL FMT_ALWAYS_INLINE fstring(const S& s) : str(s) {
-    auto sv = string_view(str);
-    if (FMT_USE_CONSTEVAL)
-      detail::parse_format_string<char>(sv, checker(sv, arg_pack()));
-#ifdef FMT_ENFORCE_COMPILE_STRING
-    static_assert(
-        FMT_USE_CONSTEVAL && sizeof(s) != 0,
-        "FMT_ENFORCE_COMPILE_STRING requires format strings to use FMT_STRING");
-#endif
-  }
-  template <typename S,
-            FMT_ENABLE_IF(std::is_base_of<detail::compile_string, S>::value&&
-                              std::is_same<typename S::char_type, char>::value)>
-  FMT_ALWAYS_INLINE fstring(const S&) : str(S()) {
-    FMT_CONSTEXPR auto sv = string_view(S());
-    FMT_CONSTEXPR int ignore =
-        (parse_format_string(sv, checker(sv, arg_pack())), 0);
-    detail::ignore_unused(ignore);
-  }
-  fstring(runtime_format_string<> fmt) : str(fmt.str) {}
-
-  // Returning by reference generates better code in debug mode.
-  FMT_ALWAYS_INLINE operator const string_view&() const { return str; }
-  auto get() const -> string_view { return str; }
-};
-
-template <typename... T> using format_string = typename fstring<T...>::t;
-
-template <typename T, typename Char = char>
-using is_formattable = bool_constant<!std::is_same<
-    detail::mapped_t<conditional_t<std::is_void<T>::value, int*, T>, Char>,
-    void>::value>;
-#ifdef __cpp_concepts
-template <typename T, typename Char = char>
-concept formattable = is_formattable<remove_reference_t<T>, Char>::value;
-#endif
-
-template <typename T, typename Char>
-using has_formatter FMT_DEPRECATED = std::is_constructible<formatter<T, Char>>;
-
-// A formatter specialization for natively supported types.
-template <typename T, typename Char>
-struct formatter<T, Char,
-                 enable_if_t<detail::type_constant<T, Char>::value !=
-                             detail::type::custom_type>>
-    : detail::native_formatter<T, Char, detail::type_constant<T, Char>::value> {
-};
-
-/**
- * Constructs an object that stores references to arguments and can be
- * implicitly converted to `format_args`. `Context` can be omitted in which case
- * it defaults to `context`. See `arg` for lifetime considerations.
- */
-// Take arguments by lvalue references to avoid some lifetime issues, e.g.
-//   auto args = make_format_args(std::string());
-template <typename Context = context, typename... T,
-          int NUM_ARGS = sizeof...(T),
-          int NUM_NAMED_ARGS = detail::count_named_args<T...>(),
-          unsigned long long DESC = detail::make_descriptor<Context, T...>()>
-constexpr FMT_ALWAYS_INLINE auto make_format_args(T&... args)
-    -> detail::format_arg_store<Context, NUM_ARGS, NUM_NAMED_ARGS, DESC> {
-  // Suppress warnings for pathological types convertible to detail::value.
-  FMT_PRAGMA_GCC(diagnostic ignored "-Wconversion")
-  return {{args...}};
-}
-
-template <typename... T>
-using vargs =
-    detail::format_arg_store<context, sizeof...(T),
-                             detail::count_named_args<T...>(),
-                             detail::make_descriptor<context, T...>()>;
-
-/**
- * Returns a named argument to be used in a formatting function.
- * It should only be used in a call to a formatting function.
- *
- * **Example**:
- *
- *     fmt::print("The answer is {answer}.", fmt::arg("answer", 42));
- */
-template <typename Char, typename T>
-inline auto arg(const Char* name, const T& arg) -> detail::named_arg<Char, T> {
-  return {name, arg};
-}
-
-/// Formats a string and writes the output to `out`.
-template <typename OutputIt,
-          FMT_ENABLE_IF(detail::is_output_iterator<remove_cvref_t<OutputIt>,
-                                                   char>::value)>
-auto vformat_to(OutputIt&& out, string_view fmt, format_args args)
-    -> remove_cvref_t<OutputIt> {
-  auto&& buf = detail::get_buffer<char>(out);
-  detail::vformat_to(buf, fmt, args, {});
-  return detail::get_iterator(buf, out);
-}
-
-/**
- * Formats `args` according to specifications in `fmt`, writes the result to
- * the output iterator `out` and returns the iterator past the end of the output
- * range. `format_to` does not append a terminating null character.
- *
- * **Example**:
- *
- *     auto out = std::vector<char>();
- *     fmt::format_to(std::back_inserter(out), "{}", 42);
- */
-template <typename OutputIt, typename... T,
-          FMT_ENABLE_IF(detail::is_output_iterator<remove_cvref_t<OutputIt>,
-                                                   char>::value)>
-FMT_INLINE auto format_to(OutputIt&& out, format_string<T...> fmt, T&&... args)
-    -> remove_cvref_t<OutputIt> {
-  return vformat_to(out, fmt.str, vargs<T...>{{args...}});
-}
-
-template <typename OutputIt> struct format_to_n_result {
-  /// Iterator past the end of the output range.
-  OutputIt out;
-  /// Total (not truncated) output size.
-  size_t size;
-};
-
-template <typename OutputIt, typename... T,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
-auto vformat_to_n(OutputIt out, size_t n, string_view fmt, format_args args)
-    -> format_to_n_result<OutputIt> {
-  using traits = detail::fixed_buffer_traits;
-  auto buf = detail::iterator_buffer<OutputIt, char, traits>(out, n);
-  detail::vformat_to(buf, fmt, args, {});
-  return {buf.out(), buf.count()};
-}
-
-/**
- * Formats `args` according to specifications in `fmt`, writes up to `n`
- * characters of the result to the output iterator `out` and returns the total
- * (not truncated) output size and the iterator past the end of the output
- * range. `format_to_n` does not append a terminating null character.
- */
-template <typename OutputIt, typename... T,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
-FMT_INLINE auto format_to_n(OutputIt out, size_t n, format_string<T...> fmt,
-                            T&&... args) -> format_to_n_result<OutputIt> {
-  return vformat_to_n(out, n, fmt.str, vargs<T...>{{args...}});
-}
-
-struct format_to_result {
-  /// Pointer to just after the last successful write in the array.
-  char* out;
-  /// Specifies if the output was truncated.
-  bool truncated;
-
-  FMT_CONSTEXPR operator char*() const {
-    // Report truncation to prevent silent data loss.
-    if (truncated) report_error("output is truncated");
-    return out;
-  }
-};
-
-template <size_t N>
-auto vformat_to(char (&out)[N], string_view fmt, format_args args)
-    -> format_to_result {
-  auto result = vformat_to_n(out, N, fmt, args);
-  return {result.out, result.size > N};
-}
-
-template <size_t N, typename... T>
-FMT_INLINE auto format_to(char (&out)[N], format_string<T...> fmt, T&&... args)
-    -> format_to_result {
-  auto result = vformat_to_n(out, N, fmt.str, vargs<T...>{{args...}});
-  return {result.out, result.size > N};
-}
-
-/// Returns the number of chars in the output of `format(fmt, args...)`.
-template <typename... T>
-FMT_NODISCARD FMT_INLINE auto formatted_size(format_string<T...> fmt,
-                                             T&&... args) -> size_t {
-  auto buf = detail::counting_buffer<>();
-  detail::vformat_to(buf, fmt.str, vargs<T...>{{args...}}, {});
-  return buf.count();
-}
-
-FMT_API void vprint(string_view fmt, format_args args);
-FMT_API void vprint(FILE* f, string_view fmt, format_args args);
-FMT_API void vprintln(FILE* f, string_view fmt, format_args args);
-FMT_API void vprint_buffered(FILE* f, string_view fmt, format_args args);
-
-/**
- * Formats `args` according to specifications in `fmt` and writes the output
- * to `stdout`.
- *
- * **Example**:
- *
- *     fmt::print("The answer is {}.", 42);
- */
-template <typename... T>
-FMT_INLINE void print(format_string<T...> fmt, T&&... args) {
-  vargs<T...> va = {{args...}};
-  if (detail::const_check(!detail::use_utf8))
-    return detail::vprint_mojibake(stdout, fmt.str, va, false);
-  return detail::is_locking<T...>() ? vprint_buffered(stdout, fmt.str, va)
-                                    : vprint(fmt.str, va);
-}
-
-/**
- * Formats `args` according to specifications in `fmt` and writes the
- * output to the file `f`.
- *
- * **Example**:
- *
- *     fmt::print(stderr, "Don't {}!", "panic");
- */
-template <typename... T>
-FMT_INLINE void print(FILE* f, format_string<T...> fmt, T&&... args) {
-  vargs<T...> va = {{args...}};
-  if (detail::const_check(!detail::use_utf8))
-    return detail::vprint_mojibake(f, fmt.str, va, false);
-  return detail::is_locking<T...>() ? vprint_buffered(f, fmt.str, va)
-                                    : vprint(f, fmt.str, va);
-}
-
-/// Formats `args` according to specifications in `fmt` and writes the output
-/// to the file `f` followed by a newline.
-template <typename... T>
-FMT_INLINE void println(FILE* f, format_string<T...> fmt, T&&... args) {
-  vargs<T...> va = {{args...}};
-  return detail::const_check(detail::use_utf8)
-             ? vprintln(f, fmt.str, va)
-             : detail::vprint_mojibake(f, fmt.str, va, true);
-}
-
-/// Formats `args` according to specifications in `fmt` and writes the output
-/// to `stdout` followed by a newline.
-template <typename... T>
-FMT_INLINE void println(format_string<T...> fmt, T&&... args) {
-  return fmt::println(stdout, fmt, static_cast<T&&>(args)...);
-}
-
-FMT_END_EXPORT
-FMT_PRAGMA_CLANG(diagnostic pop)
-FMT_PRAGMA_GCC(pop_options)
-FMT_END_NAMESPACE
-
-#ifdef FMT_HEADER_ONLY
-#  include "format.h"
-#endif
-#endif  // FMT_BASE_H_
diff --git a/src/fmt/chrono.h b/src/fmt/chrono.h
index 50c777c841..9d54574e16 100644
--- a/src/fmt/chrono.h
+++ b/src/fmt/chrono.h
@@ -8,37 +8,52 @@
 #ifndef FMT_CHRONO_H_
 #define FMT_CHRONO_H_
 
-#ifndef FMT_MODULE
-#  include <algorithm>
-#  include <chrono>
-#  include <cmath>    // std::isfinite
-#  include <cstring>  // std::memcpy
-#  include <ctime>
-#  include <iterator>
-#  include <locale>
-#  include <ostream>
-#  include <type_traits>
-#endif
+#include <algorithm>
+#include <chrono>
+#include <cmath>    // std::isfinite
+#include <cstring>  // std::memcpy
+#include <ctime>
+#include <iterator>
+#include <locale>
+#include <ostream>
+#include <type_traits>
 
-#include "format.h"
-
-namespace fmt_detail {
-struct time_zone {
-  template <typename Duration, typename T>
-  auto to_sys(T)
-      -> std::chrono::time_point<std::chrono::system_clock, Duration> {
-    return {};
-  }
-};
-template <typename... T> inline auto current_zone(T...) -> time_zone* {
-  return nullptr;
-}
-
-template <typename... T> inline void _tzset(T...) {}
-}  // namespace fmt_detail
+#include "ostream.h"  // formatbuf
 
 FMT_BEGIN_NAMESPACE
 
+// Check if std::chrono::local_t is available.
+#ifndef FMT_USE_LOCAL_TIME
+#  ifdef __cpp_lib_chrono
+#    define FMT_USE_LOCAL_TIME (__cpp_lib_chrono >= 201907L)
+#  else
+#    define FMT_USE_LOCAL_TIME 0
+#  endif
+#endif
+
+// Check if std::chrono::utc_timestamp is available.
+#ifndef FMT_USE_UTC_TIME
+#  ifdef __cpp_lib_chrono
+#    define FMT_USE_UTC_TIME (__cpp_lib_chrono >= 201907L)
+#  else
+#    define FMT_USE_UTC_TIME 0
+#  endif
+#endif
+
+// Enable tzset.
+#ifndef FMT_USE_TZSET
+// UWP doesn't provide _tzset.
+#  if FMT_HAS_INCLUDE("winapifamily.h")
+#    include <winapifamily.h>
+#  endif
+#  if defined(_WIN32) && (!defined(WINAPI_FAMILY) || \
+                          (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP))
+#    define FMT_USE_TZSET 1
+#  else
+#    define FMT_USE_TZSET 0
+#  endif
+#endif
+
 // Enable safe chrono durations, unless explicitly disabled.
 #ifndef FMT_SAFE_DURATION_CAST
 #  define FMT_SAFE_DURATION_CAST 1
@@ -79,8 +94,10 @@ FMT_CONSTEXPR auto lossless_integral_conversion(const From from, int& ec)
   return static_cast<To>(from);
 }
 
-/// Converts From to To, without loss. If the dynamic value of from
-/// can't be converted to To without loss, ec is set.
+/**
+ * converts From to To, without loss. If the dynamic value of from
+ * can't be converted to To without loss, ec is set.
+ */
 template <typename To, typename From,
           FMT_ENABLE_IF(!std::is_same<From, To>::value &&
                         std::numeric_limits<From>::is_signed !=
@@ -168,7 +185,61 @@ FMT_CONSTEXPR auto safe_float_conversion(const From from, int& ec) -> To {
   return from;
 }
 
-/// Safe duration_cast between floating point durations
+/**
+ * safe duration cast between integral durations
+ */
+template <typename To, typename FromRep, typename FromPeriod,
+          FMT_ENABLE_IF(std::is_integral<FromRep>::value),
+          FMT_ENABLE_IF(std::is_integral<typename To::rep>::value)>
+auto safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
+                        int& ec) -> To {
+  using From = std::chrono::duration<FromRep, FromPeriod>;
+  ec = 0;
+  // the basic idea is that we need to convert from count() in the from type
+  // to count() in the To type, by multiplying it with this:
+  struct Factor
+      : std::ratio_divide<typename From::period, typename To::period> {};
+
+  static_assert(Factor::num > 0, "num must be positive");
+  static_assert(Factor::den > 0, "den must be positive");
+
+  // the conversion is like this: multiply from.count() with Factor::num
+  // /Factor::den and convert it to To::rep, all this without
+  // overflow/underflow. let's start by finding a suitable type that can hold
+  // both To, From and Factor::num
+  using IntermediateRep =
+      typename std::common_type<typename From::rep, typename To::rep,
+                                decltype(Factor::num)>::type;
+
+  // safe conversion to IntermediateRep
+  IntermediateRep count =
+      lossless_integral_conversion<IntermediateRep>(from.count(), ec);
+  if (ec) return {};
+  // multiply with Factor::num without overflow or underflow
+  if (detail::const_check(Factor::num != 1)) {
+    const auto max1 = detail::max_value<IntermediateRep>() / Factor::num;
+    if (count > max1) {
+      ec = 1;
+      return {};
+    }
+    const auto min1 =
+        (std::numeric_limits<IntermediateRep>::min)() / Factor::num;
+    if (detail::const_check(!std::is_unsigned<IntermediateRep>::value) &&
+        count < min1) {
+      ec = 1;
+      return {};
+    }
+    count *= Factor::num;
+  }
+
+  if (detail::const_check(Factor::den != 1)) count /= Factor::den;
+  auto tocount = lossless_integral_conversion<typename To::rep>(count, ec);
+  return ec ? To() : To(tocount);
+}
+
+/**
+ * safe duration_cast between floating point durations
+ */
 template <typename To, typename FromRep, typename FromPeriod,
           FMT_ENABLE_IF(std::is_floating_point<FromRep>::value),
           FMT_ENABLE_IF(std::is_floating_point<typename To::rep>::value)>
@@ -247,94 +318,17 @@ auto safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
 }  // namespace safe_duration_cast
 #endif
 
-namespace detail {
-
-// Check if std::chrono::utc_time is available.
-#ifdef FMT_USE_UTC_TIME
-// Use the provided definition.
-#elif defined(__cpp_lib_chrono)
-#  define FMT_USE_UTC_TIME (__cpp_lib_chrono >= 201907L)
-#else
-#  define FMT_USE_UTC_TIME 0
-#endif
-#if FMT_USE_UTC_TIME
-using utc_clock = std::chrono::utc_clock;
-#else
-struct utc_clock {
-  template <typename T> void to_sys(T);
-};
-#endif
-
-// Check if std::chrono::local_time is available.
-#ifdef FMT_USE_LOCAL_TIME
-// Use the provided definition.
-#elif defined(__cpp_lib_chrono)
-#  define FMT_USE_LOCAL_TIME (__cpp_lib_chrono >= 201907L)
-#else
-#  define FMT_USE_LOCAL_TIME 0
-#endif
-#if FMT_USE_LOCAL_TIME
-using local_t = std::chrono::local_t;
-#else
-struct local_t {};
-#endif
-
-}  // namespace detail
-
-template <typename Duration>
-using sys_time = std::chrono::time_point<std::chrono::system_clock, Duration>;
-
-template <typename Duration>
-using utc_time = std::chrono::time_point<detail::utc_clock, Duration>;
-
-template <class Duration>
-using local_time = std::chrono::time_point<detail::local_t, Duration>;
-
-namespace detail {
-
 // Prevents expansion of a preceding token as a function-style macro.
 // Usage: f FMT_NOMACRO()
 #define FMT_NOMACRO
 
+namespace detail {
 template <typename T = void> struct null {};
 inline auto localtime_r FMT_NOMACRO(...) -> null<> { return null<>(); }
 inline auto localtime_s(...) -> null<> { return null<>(); }
 inline auto gmtime_r(...) -> null<> { return null<>(); }
 inline auto gmtime_s(...) -> null<> { return null<>(); }
 
-// It is defined here and not in ostream.h because the latter has expensive
-// includes.
-template <typename StreamBuf> class formatbuf : public StreamBuf {
- private:
-  using char_type = typename StreamBuf::char_type;
-  using streamsize = decltype(std::declval<StreamBuf>().sputn(nullptr, 0));
-  using int_type = typename StreamBuf::int_type;
-  using traits_type = typename StreamBuf::traits_type;
-
-  buffer<char_type>& buffer_;
-
- public:
-  explicit formatbuf(buffer<char_type>& buf) : buffer_(buf) {}
-
- protected:
-  // The put area is always empty. This makes the implementation simpler and has
-  // the advantage that the streambuf and the buffer are always in sync and
-  // sputc never writes into uninitialized memory. A disadvantage is that each
-  // call to sputc always results in a (virtual) call to overflow. There is no
-  // disadvantage here for sputn since this always results in a call to xsputn.
-
-  auto overflow(int_type ch) -> int_type override {
-    if (!traits_type::eq_int_type(ch, traits_type::eof()))
-      buffer_.push_back(static_cast<char_type>(ch));
-    return ch;
-  }
-
-  auto xsputn(const char_type* s, streamsize count) -> streamsize override {
-    buffer_.append(s, s + count);
-    return count;
-  }
-};
-
 inline auto get_classic_locale() -> const std::locale& {
   static const auto& locale = std::locale::classic();
   return locale;
@@ -347,16 +341,20 @@ template <typename CodeUnit> struct codecvt_result {
 };
 
 template <typename CodeUnit>
-void write_codecvt(codecvt_result<CodeUnit>& out, string_view in,
+void write_codecvt(codecvt_result<CodeUnit>& out, string_view in_buf,
                    const std::locale& loc) {
-  FMT_PRAGMA_CLANG(diagnostic push)
-  FMT_PRAGMA_CLANG(diagnostic ignored "-Wdeprecated")
+#if FMT_CLANG_VERSION
+#  pragma clang diagnostic push
+#  pragma clang diagnostic ignored "-Wdeprecated"
   auto& f = std::use_facet<std::codecvt<CodeUnit, char, std::mbstate_t>>(loc);
-  FMT_PRAGMA_CLANG(diagnostic pop)
+#  pragma clang diagnostic pop
+#else
+  auto& f = std::use_facet<std::codecvt<CodeUnit, char, std::mbstate_t>>(loc);
+#endif
   auto mb = std::mbstate_t();
   const char* from_next = nullptr;
-  auto result = f.in(mb, in.begin(), in.end(), from_next, std::begin(out.buf),
-                     std::end(out.buf), out.end);
+  auto result = f.in(mb, in_buf.begin(), in_buf.end(), from_next,
+                     std::begin(out.buf), std::end(out.buf), out.end);
   if (result != std::codecvt_base::ok)
     FMT_THROW(format_error("failed to format time"));
 }
@@ -364,12 +362,11 @@ void write_codecvt(codecvt_result<CodeUnit>& out, string_view in,
 template <typename OutputIt>
 auto write_encoded_tm_str(OutputIt out, string_view in, const std::locale& loc)
     -> OutputIt {
-  if (const_check(detail::use_utf8) && loc != get_classic_locale()) {
+  if (detail::is_utf8() && loc != get_classic_locale()) {
     // char16_t and char32_t codecvts are broken in MSVC (linkage errors) and
     // gcc-4.
-#if FMT_MSC_VERSION != 0 ||  \
-    (defined(__GLIBCXX__) && \
-     (!defined(_GLIBCXX_USE_DUAL_ABI) || _GLIBCXX_USE_DUAL_ABI == 0))
+#if FMT_MSC_VERSION != 0 || \
+    (defined(__GLIBCXX__) && !defined(_GLIBCXX_USE_DUAL_ABI))
     // The _GLIBCXX_USE_DUAL_ABI macro is always defined in libstdc++ from gcc-5
     // and newer.
     using code_unit = wchar_t;
@@ -385,9 +382,9 @@ auto write_encoded_tm_str(OutputIt out, string_view in, const std::locale& loc)
         to_utf8<code_unit, basic_memory_buffer<char, unit_t::max_size * 4>>();
     if (!u.convert({unit.buf, to_unsigned(unit.end - unit.buf)}))
       FMT_THROW(format_error("failed to format time"));
-    return copy<char>(u.c_str(), u.c_str() + u.size(), out);
+    return copy_str<char>(u.c_str(), u.c_str() + u.size(), out);
   }
-  return copy<char>(in.data(), in.data() + in.size(), out);
+  return copy_str<char>(in.data(), in.data() + in.size(), out);
 }
 
 template <typename Char, typename OutputIt,
@@ -396,7 +393,7 @@ auto write_tm_str(OutputIt out, string_view sv, const std::locale& loc)
     -> OutputIt {
   codecvt_result<Char> unit;
   write_codecvt(unit, sv, loc);
-  return copy<Char>(unit.buf, unit.end, out);
+  return copy_str<Char>(unit.buf, unit.end, out);
 }
 
 template <typename Char, typename OutputIt,
@@ -444,56 +441,16 @@ struct is_same_arithmetic_type
                                          std::is_floating_point<Rep2>::value)> {
 };
 
-FMT_NORETURN inline void throw_duration_error() {
-  FMT_THROW(format_error("cannot format duration"));
-}
-
-// Cast one integral duration to another with an overflow check.
-template <typename To, typename FromRep, typename FromPeriod,
-          FMT_ENABLE_IF(std::is_integral<FromRep>::value&&
-                            std::is_integral<typename To::rep>::value)>
-auto duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
-#if !FMT_SAFE_DURATION_CAST
-  return std::chrono::duration_cast<To>(from);
-#else
-  // The conversion factor: to.count() == factor * from.count().
-  using factor = std::ratio_divide<FromPeriod, typename To::period>;
-
-  using common_rep = typename std::common_type<FromRep, typename To::rep,
-                                               decltype(factor::num)>::type;
-
-  int ec = 0;
-  auto count = safe_duration_cast::lossless_integral_conversion<common_rep>(
-      from.count(), ec);
-  if (ec) throw_duration_error();
-
-  // Multiply from.count() by factor and check for overflow.
-  if (const_check(factor::num != 1)) {
-    if (count > max_value<common_rep>() / factor::num) throw_duration_error();
-    const auto min = (std::numeric_limits<common_rep>::min)() / factor::num;
-    if (const_check(!std::is_unsigned<common_rep>::value) && count < min)
-      throw_duration_error();
-    count *= factor::num;
-  }
-  if (const_check(factor::den != 1)) count /= factor::den;
-  auto to =
-      To(safe_duration_cast::lossless_integral_conversion<typename To::rep>(
-          count, ec));
-  if (ec) throw_duration_error();
-  return to;
-#endif
-}
-
-template <typename To, typename FromRep, typename FromPeriod,
-          FMT_ENABLE_IF(std::is_floating_point<FromRep>::value&&
-                            std::is_floating_point<typename To::rep>::value)>
-auto duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
+template <
+    typename To, typename FromRep, typename FromPeriod,
+    FMT_ENABLE_IF(is_same_arithmetic_type<FromRep, typename To::rep>::value)>
+auto fmt_duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
 #if FMT_SAFE_DURATION_CAST
   // Throwing version of safe_duration_cast is only available for
   // integer to integer or float to float casts.
   int ec;
   To to = safe_duration_cast::safe_duration_cast<To>(from, ec);
-  if (ec) throw_duration_error();
+  if (ec) FMT_THROW(format_error("cannot format duration"));
   return to;
 #else
   // Standard duration cast, may overflow.
@@ -504,60 +461,54 @@ auto duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
 template <
     typename To, typename FromRep, typename FromPeriod,
     FMT_ENABLE_IF(!is_same_arithmetic_type<FromRep, typename To::rep>::value)>
-auto duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
+auto fmt_duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
   // Mixed integer <-> float cast is not supported by safe_duration_cast.
   return std::chrono::duration_cast<To>(from);
 }
 
 template <typename Duration>
-auto to_time_t(sys_time<Duration> time_point) -> std::time_t {
+auto to_time_t(
+    std::chrono::time_point<std::chrono::system_clock, Duration> time_point)
+    -> std::time_t {
   // Cannot use std::chrono::system_clock::to_time_t since this would first
   // require a cast to std::chrono::system_clock::time_point, which could
   // overflow.
-  return detail::duration_cast<std::chrono::duration<std::time_t>>(
+  return fmt_duration_cast<std::chrono::duration<std::time_t>>(
              time_point.time_since_epoch())
       .count();
 }
-
-// Workaround a bug in libstdc++ which sets __cpp_lib_chrono to 201907 without
-// providing current_zone(): https://github.com/fmtlib/fmt/issues/4160.
-template <typename T> FMT_CONSTEXPR auto has_current_zone() -> bool {
-  using namespace std::chrono;
-  using namespace fmt_detail;
-  return !std::is_same<decltype(current_zone()), fmt_detail::time_zone*>::value;
-}
 }  // namespace detail
 
 FMT_BEGIN_EXPORT
 
 /**
- * Converts given time since epoch as `std::time_t` value into calendar time,
- * expressed in local time. Unlike `std::localtime`, this function is
- * thread-safe on most platforms.
+  Converts given time since epoch as ``std::time_t`` value into calendar time,
+  expressed in local time. Unlike ``std::localtime``, this function is
+  thread-safe on most platforms.
  */
 inline auto localtime(std::time_t time) -> std::tm {
   struct dispatcher {
     std::time_t time_;
     std::tm tm_;
 
-    inline dispatcher(std::time_t t) : time_(t) {}
+    dispatcher(std::time_t t) : time_(t) {}
 
-    inline auto run() -> bool {
+    auto run() -> bool {
       using namespace fmt::detail;
       return handle(localtime_r(&time_, &tm_));
     }
 
-    inline auto handle(std::tm* tm) -> bool { return tm != nullptr; }
+    auto handle(std::tm* tm) -> bool { return tm != nullptr; }
 
-    inline auto handle(detail::null<>) -> bool {
+    auto handle(detail::null<>) -> bool {
       using namespace fmt::detail;
       return fallback(localtime_s(&tm_, &time_));
     }
 
-    inline auto fallback(int res) -> bool { return res == 0; }
+    auto fallback(int res) -> bool { return res == 0; }
 
 #if !FMT_MSC_VERSION
-    inline auto fallback(detail::null<>) -> bool {
+    auto fallback(detail::null<>) -> bool {
       using namespace fmt::detail;
       std::tm* tm = std::localtime(&time_);
       if (tm) tm_ = *tm;
@@ -572,43 +523,41 @@ inline auto localtime(std::time_t time) -> std::tm {
 }
 
 #if FMT_USE_LOCAL_TIME
-template <typename Duration,
-          FMT_ENABLE_IF(detail::has_current_zone<Duration>())>
+template <typename Duration>
 inline auto localtime(std::chrono::local_time<Duration> time) -> std::tm {
-  using namespace std::chrono;
-  using namespace fmt_detail;
-  return localtime(detail::to_time_t(current_zone()->to_sys<Duration>(time)));
+  return localtime(
+      detail::to_time_t(std::chrono::current_zone()->to_sys(time)));
 }
 #endif
 
 /**
- * Converts given time since epoch as `std::time_t` value into calendar time,
- * expressed in Coordinated Universal Time (UTC). Unlike `std::gmtime`, this
- * function is thread-safe on most platforms.
+  Converts given time since epoch as ``std::time_t`` value into calendar time,
+  expressed in Coordinated Universal Time (UTC). Unlike ``std::gmtime``, this
+  function is thread-safe on most platforms.
  */
 inline auto gmtime(std::time_t time) -> std::tm {
   struct dispatcher {
     std::time_t time_;
     std::tm tm_;
 
-    inline dispatcher(std::time_t t) : time_(t) {}
+    dispatcher(std::time_t t) : time_(t) {}
 
-    inline auto run() -> bool {
+    auto run() -> bool {
       using namespace fmt::detail;
       return handle(gmtime_r(&time_, &tm_));
     }
 
-    inline auto handle(std::tm* tm) -> bool { return tm != nullptr; }
+    auto handle(std::tm* tm) -> bool { return tm != nullptr; }
 
-    inline auto handle(detail::null<>) -> bool {
+    auto handle(detail::null<>) -> bool {
       using namespace fmt::detail;
       return fallback(gmtime_s(&tm_, &time_));
     }
 
-    inline auto fallback(int res) -> bool { return res == 0; }
+    auto fallback(int res) -> bool { return res == 0; }
 
 #if !FMT_MSC_VERSION
-    inline auto fallback(detail::null<>) -> bool {
+    auto fallback(detail::null<>) -> bool {
       std::tm* tm = std::gmtime(&time_);
       if (tm) tm_ = *tm;
       return tm != nullptr;
@@ -622,7 +571,9 @@ inline auto gmtime(std::time_t time) -> std::tm {
 }
 
 template <typename Duration>
-inline auto gmtime(sys_time<Duration> time_point) -> std::tm {
+inline auto gmtime(
+    std::chrono::time_point<std::chrono::system_clock, Duration> time_point)
+    -> std::tm {
   return gmtime(detail::to_time_t(time_point));
 }
 
@@ -668,8 +619,7 @@ FMT_CONSTEXPR inline auto get_units() -> const char* {
   if (std::is_same<Period, std::femto>::value) return "fs";
   if (std::is_same<Period, std::pico>::value) return "ps";
   if (std::is_same<Period, std::nano>::value) return "ns";
-  if (std::is_same<Period, std::micro>::value)
-    return detail::use_utf8 ? "µs" : "us";
+  if (std::is_same<Period, std::micro>::value) return "µs";
   if (std::is_same<Period, std::milli>::value) return "ms";
   if (std::is_same<Period, std::centi>::value) return "cs";
   if (std::is_same<Period, std::deci>::value) return "ds";
@@ -696,10 +646,12 @@ enum class numeric_system {
 
 // Glibc extensions for formatting numeric values.
 enum class pad_type {
-  // Pad a numeric result string with zeros (the default).
-  zero,
+  unspecified,
   // Do not pad a numeric result string.
   none,
+  // Pad a numeric result string with zeros even if the conversion specifier
+  // character uses space-padding by default.
+  zero,
   // Pad a numeric result string with spaces.
   space,
 };
@@ -707,7 +659,7 @@ enum class pad_type {
 template <typename OutputIt>
 auto write_padding(OutputIt out, pad_type pad, int width) -> OutputIt {
   if (pad == pad_type::none) return out;
-  return detail::fill_n(out, width, pad == pad_type::space ? ' ' : '0');
+  return std::fill_n(out, width, pad == pad_type::space ? ' ' : '0');
 }
 
 template <typename OutputIt>
@@ -723,8 +675,8 @@ FMT_CONSTEXPR auto parse_chrono_format(const Char* begin, const Char* end,
   if (begin == end || *begin == '}') return begin;
   if (*begin != '%') FMT_THROW(format_error("invalid format"));
   auto ptr = begin;
+  pad_type pad = pad_type::unspecified;
   while (ptr != end) {
-    pad_type pad = pad_type::zero;
     auto c = *ptr;
     if (c == '}') break;
     if (c != '%') {
@@ -744,11 +696,17 @@ FMT_CONSTEXPR auto parse_chrono_format(const Char* begin, const Char* end,
       pad = pad_type::none;
       ++ptr;
       break;
+    case '0':
+      pad = pad_type::zero;
+      ++ptr;
+      break;
     }
     if (ptr == end) FMT_THROW(format_error("invalid format"));
     c = *ptr++;
     switch (c) {
-    case '%': handler.on_text(ptr - 1, ptr); break;
+    case '%':
+      handler.on_text(ptr - 1, ptr);
+      break;
     case 'n': {
       const Char newline[] = {'\n'};
       handler.on_text(newline, newline + 1);
@@ -760,66 +718,145 @@ FMT_CONSTEXPR auto parse_chrono_format(const Char* begin, const Char* end,
       break;
     }
     // Year:
-    case 'Y': handler.on_year(numeric_system::standard, pad); break;
-    case 'y': handler.on_short_year(numeric_system::standard); break;
-    case 'C': handler.on_century(numeric_system::standard); break;
-    case 'G': handler.on_iso_week_based_year(); break;
-    case 'g': handler.on_iso_week_based_short_year(); break;
+    case 'Y':
+      handler.on_year(numeric_system::standard);
+      break;
+    case 'y':
+      handler.on_short_year(numeric_system::standard);
+      break;
+    case 'C':
+      handler.on_century(numeric_system::standard);
+      break;
+    case 'G':
+      handler.on_iso_week_based_year();
+      break;
+    case 'g':
+      handler.on_iso_week_based_short_year();
+      break;
     // Day of the week:
-    case 'a': handler.on_abbr_weekday(); break;
-    case 'A': handler.on_full_weekday(); break;
-    case 'w': handler.on_dec0_weekday(numeric_system::standard); break;
-    case 'u': handler.on_dec1_weekday(numeric_system::standard); break;
+    case 'a':
+      handler.on_abbr_weekday();
+      break;
+    case 'A':
+      handler.on_full_weekday();
+      break;
+    case 'w':
+      handler.on_dec0_weekday(numeric_system::standard);
+      break;
+    case 'u':
+      handler.on_dec1_weekday(numeric_system::standard);
+      break;
     // Month:
     case 'b':
-    case 'h': handler.on_abbr_month(); break;
-    case 'B': handler.on_full_month(); break;
-    case 'm': handler.on_dec_month(numeric_system::standard, pad); break;
+    case 'h':
+      handler.on_abbr_month();
+      break;
+    case 'B':
+      handler.on_full_month();
+      break;
+    case 'm':
+      handler.on_dec_month(numeric_system::standard);
+      break;
     // Day of the year/month:
     case 'U':
-      handler.on_dec0_week_of_year(numeric_system::standard, pad);
+      handler.on_dec0_week_of_year(numeric_system::standard);
       break;
     case 'W':
-      handler.on_dec1_week_of_year(numeric_system::standard, pad);
+      handler.on_dec1_week_of_year(numeric_system::standard);
+      break;
+    case 'V':
+      handler.on_iso_week_of_year(numeric_system::standard);
+      break;
+    case 'j':
+      handler.on_day_of_year();
+      break;
+    case 'd':
+      handler.on_day_of_month(numeric_system::standard);
       break;
-    case 'V': handler.on_iso_week_of_year(numeric_system::standard, pad); break;
-    case 'j': handler.on_day_of_year(pad); break;
-    case 'd': handler.on_day_of_month(numeric_system::standard, pad); break;
     case 'e':
-      handler.on_day_of_month(numeric_system::standard, pad_type::space);
+      handler.on_day_of_month_space(numeric_system::standard);
       break;
     // Hour, minute, second:
-    case 'H': handler.on_24_hour(numeric_system::standard, pad); break;
-    case 'I': handler.on_12_hour(numeric_system::standard, pad); break;
-    case 'M': handler.on_minute(numeric_system::standard, pad); break;
-    case 'S': handler.on_second(numeric_system::standard, pad); break;
+    case 'H':
+      handler.on_24_hour(numeric_system::standard, pad);
+      break;
+    case 'I':
+      handler.on_12_hour(numeric_system::standard, pad);
+      break;
+    case 'M':
+      handler.on_minute(numeric_system::standard, pad);
+      break;
+    case 'S':
+      handler.on_second(numeric_system::standard, pad);
+      break;
     // Other:
-    case 'c': handler.on_datetime(numeric_system::standard); break;
-    case 'x': handler.on_loc_date(numeric_system::standard); break;
-    case 'X': handler.on_loc_time(numeric_system::standard); break;
-    case 'D': handler.on_us_date(); break;
-    case 'F': handler.on_iso_date(); break;
-    case 'r': handler.on_12_hour_time(); break;
-    case 'R': handler.on_24_hour_time(); break;
-    case 'T': handler.on_iso_time(); break;
-    case 'p': handler.on_am_pm(); break;
-    case 'Q': handler.on_duration_value(); break;
-    case 'q': handler.on_duration_unit(); break;
-    case 'z': handler.on_utc_offset(numeric_system::standard); break;
-    case 'Z': handler.on_tz_name(); break;
+    case 'c':
+      handler.on_datetime(numeric_system::standard);
+      break;
+    case 'x':
+      handler.on_loc_date(numeric_system::standard);
+      break;
+    case 'X':
+      handler.on_loc_time(numeric_system::standard);
+      break;
+    case 'D':
+      handler.on_us_date();
+      break;
+    case 'F':
+      handler.on_iso_date();
+      break;
+    case 'r':
+      handler.on_12_hour_time();
+      break;
+    case 'R':
+      handler.on_24_hour_time();
+      break;
+    case 'T':
+      handler.on_iso_time();
+      break;
+    case 'p':
+      handler.on_am_pm();
+      break;
+    case 'Q':
+      handler.on_duration_value();
+      break;
+    case 'q':
+      handler.on_duration_unit();
+      break;
+    case 'z':
+      handler.on_utc_offset(numeric_system::standard);
+      break;
+    case 'Z':
+      handler.on_tz_name();
+      break;
     // Alternative representation:
     case 'E': {
       if (ptr == end) FMT_THROW(format_error("invalid format"));
       c = *ptr++;
       switch (c) {
-      case 'Y': handler.on_year(numeric_system::alternative, pad); break;
-      case 'y': handler.on_offset_year(); break;
-      case 'C': handler.on_century(numeric_system::alternative); break;
-      case 'c': handler.on_datetime(numeric_system::alternative); break;
-      case 'x': handler.on_loc_date(numeric_system::alternative); break;
-      case 'X': handler.on_loc_time(numeric_system::alternative); break;
-      case 'z': handler.on_utc_offset(numeric_system::alternative); break;
-      default:  FMT_THROW(format_error("invalid format"));
+      case 'Y':
+        handler.on_year(numeric_system::alternative);
+        break;
+      case 'y':
+        handler.on_offset_year();
+        break;
+      case 'C':
+        handler.on_century(numeric_system::alternative);
+        break;
+      case 'c':
+        handler.on_datetime(numeric_system::alternative);
+        break;
+      case 'x':
+        handler.on_loc_date(numeric_system::alternative);
+        break;
+      case 'X':
+        handler.on_loc_time(numeric_system::alternative);
+        break;
+      case 'z':
+        handler.on_utc_offset(numeric_system::alternative);
+        break;
+      default:
+        FMT_THROW(format_error("invalid format"));
       }
       break;
     }
@@ -827,34 +864,54 @@ FMT_CONSTEXPR auto parse_chrono_format(const Char* begin, const Char* end,
       if (ptr == end) FMT_THROW(format_error("invalid format"));
       c = *ptr++;
       switch (c) {
-      case 'y': handler.on_short_year(numeric_system::alternative); break;
-      case 'm': handler.on_dec_month(numeric_system::alternative, pad); break;
+      case 'y':
+        handler.on_short_year(numeric_system::alternative);
+        break;
+      case 'm':
+        handler.on_dec_month(numeric_system::alternative);
+        break;
       case 'U':
-        handler.on_dec0_week_of_year(numeric_system::alternative, pad);
+        handler.on_dec0_week_of_year(numeric_system::alternative);
         break;
       case 'W':
-        handler.on_dec1_week_of_year(numeric_system::alternative, pad);
+        handler.on_dec1_week_of_year(numeric_system::alternative);
         break;
       case 'V':
-        handler.on_iso_week_of_year(numeric_system::alternative, pad);
+        handler.on_iso_week_of_year(numeric_system::alternative);
         break;
       case 'd':
-        handler.on_day_of_month(numeric_system::alternative, pad);
+        handler.on_day_of_month(numeric_system::alternative);
         break;
       case 'e':
-        handler.on_day_of_month(numeric_system::alternative, pad_type::space);
+        handler.on_day_of_month_space(numeric_system::alternative);
         break;
-      case 'w': handler.on_dec0_weekday(numeric_system::alternative); break;
-      case 'u': handler.on_dec1_weekday(numeric_system::alternative); break;
-      case 'H': handler.on_24_hour(numeric_system::alternative, pad); break;
-      case 'I': handler.on_12_hour(numeric_system::alternative, pad); break;
-      case 'M': handler.on_minute(numeric_system::alternative, pad); break;
-      case 'S': handler.on_second(numeric_system::alternative, pad); break;
-      case 'z': handler.on_utc_offset(numeric_system::alternative); break;
-      default:  FMT_THROW(format_error("invalid format"));
+      case 'w':
+        handler.on_dec0_weekday(numeric_system::alternative);
+        break;
+      case 'u':
+        handler.on_dec1_weekday(numeric_system::alternative);
+        break;
+      case 'H':
+        handler.on_24_hour(numeric_system::alternative, pad);
+        break;
+      case 'I':
+        handler.on_12_hour(numeric_system::alternative, pad);
+        break;
+      case 'M':
+        handler.on_minute(numeric_system::alternative, pad);
+        break;
+      case 'S':
+        handler.on_second(numeric_system::alternative, pad);
+        break;
+      case 'z':
+        handler.on_utc_offset(numeric_system::alternative);
+        break;
+      default:
+        FMT_THROW(format_error("invalid format"));
       }
       break;
-    default: FMT_THROW(format_error("invalid format"));
+    default:
+      FMT_THROW(format_error("invalid format"));
     }
     begin = ptr;
   }
@@ -866,7 +923,7 @@ template <typename Derived> struct null_chrono_spec_handler {
   FMT_CONSTEXPR void unsupported() {
     static_cast<Derived*>(this)->unsupported();
   }
-  FMT_CONSTEXPR void on_year(numeric_system, pad_type) { unsupported(); }
+  FMT_CONSTEXPR void on_year(numeric_system) { unsupported(); }
   FMT_CONSTEXPR void on_short_year(numeric_system) { unsupported(); }
   FMT_CONSTEXPR void on_offset_year() { unsupported(); }
   FMT_CONSTEXPR void on_century(numeric_system) { unsupported(); }
@@ -878,20 +935,13 @@ template <typename Derived> struct null_chrono_spec_handler {
   FMT_CONSTEXPR void on_dec1_weekday(numeric_system) { unsupported(); }
   FMT_CONSTEXPR void on_abbr_month() { unsupported(); }
   FMT_CONSTEXPR void on_full_month() { unsupported(); }
-  FMT_CONSTEXPR void on_dec_month(numeric_system, pad_type) { unsupported(); }
-  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system, pad_type) {
-    unsupported();
-  }
-  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system, pad_type) {
-    unsupported();
-  }
-  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system, pad_type) {
-    unsupported();
-  }
-  FMT_CONSTEXPR void on_day_of_year(pad_type) { unsupported(); }
-  FMT_CONSTEXPR void on_day_of_month(numeric_system, pad_type) {
-    unsupported();
-  }
+  FMT_CONSTEXPR void on_dec_month(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_day_of_year() { unsupported(); }
+  FMT_CONSTEXPR void on_day_of_month(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_day_of_month_space(numeric_system) { unsupported(); }
   FMT_CONSTEXPR void on_24_hour(numeric_system) { unsupported(); }
   FMT_CONSTEXPR void on_12_hour(numeric_system) { unsupported(); }
   FMT_CONSTEXPR void on_minute(numeric_system) { unsupported(); }
@@ -912,13 +962,11 @@ template <typename Derived> struct null_chrono_spec_handler {
 };
 
 struct tm_format_checker : null_chrono_spec_handler<tm_format_checker> {
-  FMT_NORETURN inline void unsupported() {
-    FMT_THROW(format_error("no format"));
-  }
+  FMT_NORETURN void unsupported() { FMT_THROW(format_error("no format")); }
 
   template <typename Char>
   FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
-  FMT_CONSTEXPR void on_year(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_year(numeric_system) {}
   FMT_CONSTEXPR void on_short_year(numeric_system) {}
   FMT_CONSTEXPR void on_offset_year() {}
   FMT_CONSTEXPR void on_century(numeric_system) {}
@@ -930,12 +978,13 @@ struct tm_format_checker : null_chrono_spec_handler<tm_format_checker> {
   FMT_CONSTEXPR void on_dec1_weekday(numeric_system) {}
   FMT_CONSTEXPR void on_abbr_month() {}
   FMT_CONSTEXPR void on_full_month() {}
-  FMT_CONSTEXPR void on_dec_month(numeric_system, pad_type) {}
-  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system, pad_type) {}
-  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system, pad_type) {}
-  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system, pad_type) {}
-  FMT_CONSTEXPR void on_day_of_year(pad_type) {}
-  FMT_CONSTEXPR void on_day_of_month(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_dec_month(numeric_system) {}
+  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system) {}
+  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system) {}
+  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system) {}
+  FMT_CONSTEXPR void on_day_of_year() {}
+  FMT_CONSTEXPR void on_day_of_month(numeric_system) {}
+  FMT_CONSTEXPR void on_day_of_month_space(numeric_system) {}
   FMT_CONSTEXPR void on_24_hour(numeric_system, pad_type) {}
   FMT_CONSTEXPR void on_12_hour(numeric_system, pad_type) {}
   FMT_CONSTEXPR void on_minute(numeric_system, pad_type) {}
@@ -991,14 +1040,15 @@ template <typename T>
 struct has_member_data_tm_zone<T, void_t<decltype(T::tm_zone)>>
     : std::true_type {};
 
+#if FMT_USE_TZSET
 inline void tzset_once() {
-  static bool init = []() {
-    using namespace fmt_detail;
+  static bool init = []() -> bool {
     _tzset();
-    return false;
+    return true;
   }();
   ignore_unused(init);
 }
+#endif
 
 // Converts value to Int and checks that it's in the range [0, upper).
 template <typename T, typename Int, FMT_ENABLE_IF(std::is_integral<T>::value)>
@@ -1011,10 +1061,9 @@ inline auto to_nonnegative_int(T value, Int upper) -> Int {
 }
 template <typename T, typename Int, FMT_ENABLE_IF(!std::is_integral<T>::value)>
 inline auto to_nonnegative_int(T value, Int upper) -> Int {
-  auto int_value = static_cast<Int>(value);
-  if (int_value < 0 || value > static_cast<T>(upper))
+  if (value < 0 || value > static_cast<T>(upper))
     FMT_THROW(format_error("invalid value"));
-  return int_value;
+  return static_cast<Int>(value);
 }
 
 constexpr auto pow10(std::uint32_t n) -> long long {
@@ -1049,16 +1098,16 @@ void write_fractional_seconds(OutputIt& out, Duration d, int precision = -1) {
   using subsecond_precision = std::chrono::duration<
       typename std::common_type<typename Duration::rep,
                                 std::chrono::seconds::rep>::type,
-      std::ratio<1, pow10(num_fractional_digits)>>;
+      std::ratio<1, detail::pow10(num_fractional_digits)>>;
 
-  const auto fractional = d - detail::duration_cast<std::chrono::seconds>(d);
+  const auto fractional = d - fmt_duration_cast<std::chrono::seconds>(d);
   const auto subseconds =
       std::chrono::treat_as_floating_point<
           typename subsecond_precision::rep>::value
           ? fractional.count()
-          : detail::duration_cast<subsecond_precision>(fractional).count();
+          : fmt_duration_cast<subsecond_precision>(fractional).count();
   auto n = static_cast<uint32_or_64_or_128_t<long long>>(subseconds);
-  const int num_digits = count_digits(n);
+  const int num_digits = detail::count_digits(n);
 
   int leading_zeroes = (std::max)(0, num_fractional_digits - num_digits);
   if (precision < 0) {
@@ -1066,25 +1115,22 @@ void write_fractional_seconds(OutputIt& out, Duration d, int precision = -1) {
     if (std::ratio_less<typename subsecond_precision::period,
                         std::chrono::seconds::period>::value) {
       *out++ = '.';
-      out = detail::fill_n(out, leading_zeroes, '0');
-      out = format_decimal<Char>(out, n, num_digits);
+      out = std::fill_n(out, leading_zeroes, '0');
+      out = format_decimal<Char>(out, n, num_digits).end;
     }
-  } else if (precision > 0) {
+  } else {
     *out++ = '.';
-    leading_zeroes = min_of(leading_zeroes, precision);
+    leading_zeroes = (std::min)(leading_zeroes, precision);
+    out = std::fill_n(out, leading_zeroes, '0');
     int remaining = precision - leading_zeroes;
-    out = detail::fill_n(out, leading_zeroes, '0');
-    if (remaining < num_digits) {
-      int num_truncated_digits = num_digits - remaining;
-      n /= to_unsigned(pow10(to_unsigned(num_truncated_digits)));
-      if (n != 0) out = format_decimal<Char>(out, n, remaining);
+    if (remaining != 0 && remaining < num_digits) {
+      n /= to_unsigned(detail::pow10(to_unsigned(num_digits - remaining)));
+      out = format_decimal<Char>(out, n, remaining).end;
       return;
     }
-    if (n != 0) {
-      out = format_decimal<Char>(out, n, num_digits);
-      remaining -= num_digits;
-    }
-    out = detail::fill_n(out, remaining, '0');
+    out = format_decimal<Char>(out, n, num_digits).end;
+    remaining -= num_digits;
+    out = std::fill_n(out, remaining, '0');
   }
 }
 
@@ -1225,28 +1271,29 @@ class tm_writer {
     }
   }
 
-  void write_year_extended(long long year, pad_type pad) {
+  void write_year_extended(long long year) {
     // At least 4 characters.
     int width = 4;
-    bool negative = year < 0;
-    if (negative) {
+    if (year < 0) {
+      *out_++ = '-';
       year = 0 - year;
       --width;
     }
     uint32_or_64_or_128_t<long long> n = to_unsigned(year);
     const int num_digits = count_digits(n);
-    if (negative && pad == pad_type::zero) *out_++ = '-';
-    if (width > num_digits) {
-      out_ = detail::write_padding(out_, pad, width - num_digits);
-    }
-    if (negative && pad != pad_type::zero) *out_++ = '-';
-    out_ = format_decimal<Char>(out_, n, num_digits);
+    if (width > num_digits) out_ = std::fill_n(out_, width - num_digits, '0');
+    out_ = format_decimal<Char>(out_, n, num_digits).end;
   }
-  void write_year(long long year, pad_type pad) {
-    write_year_extended(year, pad);
+  void write_year(long long year) {
+    if (year >= 0 && year < 10000) {
+      write2(static_cast<int>(year / 100));
+      write2(static_cast<int>(year % 100));
+    } else {
+      write_year_extended(year);
+    }
   }
 
-  void write_utc_offset(long long offset, numeric_system ns) {
+  void write_utc_offset(long offset, numeric_system ns) {
     if (offset < 0) {
       *out_++ = '-';
       offset = -offset;
@@ -1258,7 +1305,6 @@ class tm_writer {
     if (ns != numeric_system::standard) *out_++ = ':';
     write2(static_cast<int>(offset % 60));
   }
-
   template <typename T, FMT_ENABLE_IF(has_member_data_tm_gmtoff<T>::value)>
   void format_utc_offset_impl(const T& tm, numeric_system ns) {
     write_utc_offset(tm.tm_gmtoff, ns);
@@ -1266,7 +1312,9 @@ class tm_writer {
   template <typename T, FMT_ENABLE_IF(!has_member_data_tm_gmtoff<T>::value)>
   void format_utc_offset_impl(const T& tm, numeric_system ns) {
 #if defined(_WIN32) && defined(_UCRT)
+#  if FMT_USE_TZSET
     tzset_once();
+#  endif
     long offset = 0;
     _get_timezone(&offset);
     if (tm.tm_isdst) {
@@ -1283,7 +1331,7 @@ class tm_writer {
     std::time_t gt = std::mktime(&gtm);
     std::tm ltm = gmtime(gt);
     std::time_t lt = std::mktime(&ltm);
-    long long offset = gt - lt;
+    long offset = gt - lt;
     write_utc_offset(offset, ns);
 #endif
   }
@@ -1316,7 +1364,7 @@ class tm_writer {
   auto out() const -> OutputIt { return out_; }
 
   FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
-    out_ = copy<Char>(begin, end, out_);
+    out_ = copy_str<Char>(begin, end, out_);
   }
 
   void on_abbr_weekday() {
@@ -1363,11 +1411,11 @@ class tm_writer {
       *out_++ = ' ';
       on_abbr_month();
       *out_++ = ' ';
-      on_day_of_month(numeric_system::standard, pad_type::space);
+      on_day_of_month_space(numeric_system::standard);
       *out_++ = ' ';
       on_iso_time();
       *out_++ = ' ';
-      on_year(numeric_system::standard, pad_type::space);
+      on_year(numeric_system::standard);
     } else {
       format_localized('c', ns == numeric_system::standard ? '\0' : 'E');
     }
@@ -1389,31 +1437,31 @@ class tm_writer {
     write_digit2_separated(buf, to_unsigned(tm_mon() + 1),
                            to_unsigned(tm_mday()),
                            to_unsigned(split_year_lower(tm_year())), '/');
-    out_ = copy<Char>(std::begin(buf), std::end(buf), out_);
+    out_ = copy_str<Char>(std::begin(buf), std::end(buf), out_);
   }
   void on_iso_date() {
     auto year = tm_year();
     char buf[10];
     size_t offset = 0;
     if (year >= 0 && year < 10000) {
-      write2digits(buf, static_cast<size_t>(year / 100));
+      copy2(buf, digits2(static_cast<size_t>(year / 100)));
     } else {
       offset = 4;
-      write_year_extended(year, pad_type::zero);
+      write_year_extended(year);
       year = 0;
     }
     write_digit2_separated(buf + 2, static_cast<unsigned>(year % 100),
                            to_unsigned(tm_mon() + 1), to_unsigned(tm_mday()),
                            '-');
-    out_ = copy<Char>(std::begin(buf) + offset, std::end(buf), out_);
+    out_ = copy_str<Char>(std::begin(buf) + offset, std::end(buf), out_);
   }
 
   void on_utc_offset(numeric_system ns) { format_utc_offset_impl(tm_, ns); }
   void on_tz_name() { format_tz_name_impl(tm_); }
 
-  void on_year(numeric_system ns, pad_type pad) {
+  void on_year(numeric_system ns) {
     if (is_classic_ || ns == numeric_system::standard)
-      return write_year(tm_year(), pad);
+      return write_year(tm_year());
     format_localized('Y', 'E');
   }
   void on_short_year(numeric_system ns) {
@@ -1444,58 +1492,57 @@ class tm_writer {
     }
   }
 
-  void on_dec_month(numeric_system ns, pad_type pad) {
+  void on_dec_month(numeric_system ns) {
     if (is_classic_ || ns == numeric_system::standard)
-      return write2(tm_mon() + 1, pad);
+      return write2(tm_mon() + 1);
     format_localized('m', 'O');
   }
 
-  void on_dec0_week_of_year(numeric_system ns, pad_type pad) {
+  void on_dec0_week_of_year(numeric_system ns) {
     if (is_classic_ || ns == numeric_system::standard)
-      return write2((tm_yday() + days_per_week - tm_wday()) / days_per_week,
-                    pad);
+      return write2((tm_yday() + days_per_week - tm_wday()) / days_per_week);
     format_localized('U', 'O');
   }
-  void on_dec1_week_of_year(numeric_system ns, pad_type pad) {
+  void on_dec1_week_of_year(numeric_system ns) {
     if (is_classic_ || ns == numeric_system::standard) {
       auto wday = tm_wday();
       write2((tm_yday() + days_per_week -
               (wday == 0 ? (days_per_week - 1) : (wday - 1))) /
-                 days_per_week,
-             pad);
+             days_per_week);
     } else {
       format_localized('W', 'O');
     }
   }
-  void on_iso_week_of_year(numeric_system ns, pad_type pad) {
+  void on_iso_week_of_year(numeric_system ns) {
     if (is_classic_ || ns == numeric_system::standard)
-      return write2(tm_iso_week_of_year(), pad);
+      return write2(tm_iso_week_of_year());
     format_localized('V', 'O');
   }
 
-  void on_iso_week_based_year() {
-    write_year(tm_iso_week_year(), pad_type::zero);
-  }
+  void on_iso_week_based_year() { write_year(tm_iso_week_year()); }
   void on_iso_week_based_short_year() {
     write2(split_year_lower(tm_iso_week_year()));
   }
 
-  void on_day_of_year(pad_type pad) {
+  void on_day_of_year() {
     auto yday = tm_yday() + 1;
-    auto digit1 = yday / 100;
-    if (digit1 != 0) {
-      write1(digit1);
-    } else {
-      out_ = detail::write_padding(out_, pad);
-    }
-    write2(yday % 100, pad);
+    write1(yday / 100);
+    write2(yday % 100);
   }
-
-  void on_day_of_month(numeric_system ns, pad_type pad) {
-    if (is_classic_ || ns == numeric_system::standard)
-      return write2(tm_mday(), pad);
+  void on_day_of_month(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) return write2(tm_mday());
     format_localized('d', 'O');
   }
+  void on_day_of_month_space(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) {
+      auto mday = to_unsigned(tm_mday()) % 100;
+      const char* d2 = digits2(mday);
+      *out_++ = mday < 10 ? ' ' : d2[0];
+      *out_++ = d2[1];
+    } else {
+      format_localized('e', 'O');
+    }
+  }
 
   void on_24_hour(numeric_system ns, pad_type pad) {
     if (is_classic_ || ns == numeric_system::standard)
@@ -1522,7 +1569,7 @@ class tm_writer {
           write_floating_seconds(buf, *subsecs_);
           if (buf.size() > 1) {
             // Remove the leading "0", write something like ".123".
-            out_ = copy<Char>(buf.begin() + 1, buf.end(), out_);
+            out_ = std::copy(buf.begin() + 1, buf.end(), out_);
           }
         } else {
           write_fractional_seconds<Char>(out_, *subsecs_);
@@ -1539,7 +1586,7 @@ class tm_writer {
       char buf[8];
       write_digit2_separated(buf, to_unsigned(tm_hour12()),
                              to_unsigned(tm_min()), to_unsigned(tm_sec()), ':');
-      out_ = copy<Char>(std::begin(buf), std::end(buf), out_);
+      out_ = copy_str<Char>(std::begin(buf), std::end(buf), out_);
       *out_++ = ' ';
       on_am_pm();
     } else {
@@ -1554,7 +1601,7 @@ class tm_writer {
   void on_iso_time() {
     on_24_hour_time();
     *out_++ = ':';
-    on_second(numeric_system::standard, pad_type::zero);
+    on_second(numeric_system::standard, pad_type::unspecified);
   }
 
   void on_am_pm() {
@@ -1574,11 +1621,11 @@ class tm_writer {
 struct chrono_format_checker : null_chrono_spec_handler<chrono_format_checker> {
   bool has_precision_integral = false;
 
-  FMT_NORETURN inline void unsupported() { FMT_THROW(format_error("no date")); }
+  FMT_NORETURN void unsupported() { FMT_THROW(format_error("no date")); }
 
   template <typename Char>
   FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
-  FMT_CONSTEXPR void on_day_of_year(pad_type) {}
+  FMT_CONSTEXPR void on_day_of_year() {}
   FMT_CONSTEXPR void on_24_hour(numeric_system, pad_type) {}
   FMT_CONSTEXPR void on_12_hour(numeric_system, pad_type) {}
   FMT_CONSTEXPR void on_minute(numeric_system, pad_type) {}
@@ -1588,8 +1635,9 @@ struct chrono_format_checker : null_chrono_spec_handler<chrono_format_checker> {
   FMT_CONSTEXPR void on_iso_time() {}
   FMT_CONSTEXPR void on_am_pm() {}
   FMT_CONSTEXPR void on_duration_value() const {
-    if (has_precision_integral)
+    if (has_precision_integral) {
       FMT_THROW(format_error("precision not allowed for this argument type"));
+    }
   }
   FMT_CONSTEXPR void on_duration_unit() {}
 };
@@ -1629,17 +1677,17 @@ inline auto get_milliseconds(std::chrono::duration<Rep, Period> d)
 #if FMT_SAFE_DURATION_CAST
   using CommonSecondsType =
       typename std::common_type<decltype(d), std::chrono::seconds>::type;
-  const auto d_as_common = detail::duration_cast<CommonSecondsType>(d);
+  const auto d_as_common = fmt_duration_cast<CommonSecondsType>(d);
   const auto d_as_whole_seconds =
-      detail::duration_cast<std::chrono::seconds>(d_as_common);
+      fmt_duration_cast<std::chrono::seconds>(d_as_common);
   // this conversion should be nonproblematic
   const auto diff = d_as_common - d_as_whole_seconds;
   const auto ms =
-      detail::duration_cast<std::chrono::duration<Rep, std::milli>>(diff);
+      fmt_duration_cast<std::chrono::duration<Rep, std::milli>>(diff);
   return ms;
 #else
-  auto s = detail::duration_cast<std::chrono::seconds>(d);
-  return detail::duration_cast<std::chrono::milliseconds>(d - s);
+  auto s = fmt_duration_cast<std::chrono::seconds>(d);
+  return fmt_duration_cast<std::chrono::milliseconds>(d - s);
 #endif
 }
 
@@ -1652,16 +1700,16 @@ auto format_duration_value(OutputIt out, Rep val, int) -> OutputIt {
 template <typename Char, typename Rep, typename OutputIt,
           FMT_ENABLE_IF(std::is_floating_point<Rep>::value)>
 auto format_duration_value(OutputIt out, Rep val, int precision) -> OutputIt {
-  auto specs = format_specs();
+  auto specs = format_specs<Char>();
   specs.precision = precision;
-  specs.set_type(precision >= 0 ? presentation_type::fixed
-                                : presentation_type::general);
+  specs.type = precision >= 0 ? presentation_type::fixed_lower
+                              : presentation_type::general_lower;
   return write<Char>(out, val, specs);
 }
 
 template <typename Char, typename OutputIt>
 auto copy_unit(string_view unit, OutputIt out, Char) -> OutputIt {
-  return copy<Char>(unit.begin(), unit.end(), out);
+  return std::copy(unit.begin(), unit.end(), out);
 }
 
 template <typename OutputIt>
@@ -1669,7 +1717,7 @@ auto copy_unit(string_view unit, OutputIt out, wchar_t) -> OutputIt {
   // This works when wchar_t is UTF-32 because units only contain characters
   // that have the same representation in UTF-16 and UTF-32.
   utf8_to_utf16 u(unit);
-  return copy<wchar_t>(u.c_str(), u.c_str() + u.size(), out);
+  return std::copy(u.c_str(), u.c_str() + u.size(), out);
 }
 
 template <typename Char, typename Period, typename OutputIt>
@@ -1695,14 +1743,14 @@ class get_locale {
   bool has_locale_ = false;
 
  public:
-  inline get_locale(bool localized, locale_ref loc) : has_locale_(localized) {
+  get_locale(bool localized, locale_ref loc) : has_locale_(localized) {
     if (localized)
       ::new (&locale_) std::locale(loc.template get<std::locale>());
   }
-  inline ~get_locale() {
+  ~get_locale() {
     if (has_locale_) locale_.~locale();
   }
-  inline operator const std::locale&() const {
+  operator const std::locale&() const {
     return has_locale_ ? locale_ : get_classic_locale();
   }
 };
@@ -1741,7 +1789,7 @@ struct chrono_formatter {
     // this may overflow and/or the result may not fit in the
     // target type.
     // might need checked conversion (rep!=Rep)
-    s = detail::duration_cast<seconds>(std::chrono::duration<rep, Period>(val));
+    s = fmt_duration_cast<seconds>(std::chrono::duration<rep, Period>(val));
   }
 
   // returns true if nan or inf, writes to out.
@@ -1792,7 +1840,7 @@ struct chrono_formatter {
     }
   }
 
-  void write(Rep value, int width, pad_type pad = pad_type::zero) {
+  void write(Rep value, int width, pad_type pad = pad_type::unspecified) {
     write_sign();
     if (isnan(value)) return write_nan();
     uint32_or_64_or_128_t<int> n =
@@ -1801,7 +1849,7 @@ struct chrono_formatter {
     if (width > num_digits) {
       out = detail::write_padding(out, pad, width - num_digits);
     }
-    out = format_decimal<char_type>(out, n, num_digits);
+    out = format_decimal<char_type>(out, n, num_digits).end;
   }
 
   void write_nan() { std::copy_n("nan", 3, out); }
@@ -1818,7 +1866,7 @@ struct chrono_formatter {
   }
 
   void on_text(const char_type* begin, const char_type* end) {
-    copy<char_type>(begin, end, out);
+    std::copy(begin, end, out);
   }
 
   // These are not implemented because durations don't have date information.
@@ -1835,19 +1883,20 @@ struct chrono_formatter {
   void on_iso_date() {}
   void on_utc_offset(numeric_system) {}
   void on_tz_name() {}
-  void on_year(numeric_system, pad_type) {}
+  void on_year(numeric_system) {}
   void on_short_year(numeric_system) {}
   void on_offset_year() {}
   void on_century(numeric_system) {}
   void on_iso_week_based_year() {}
   void on_iso_week_based_short_year() {}
-  void on_dec_month(numeric_system, pad_type) {}
-  void on_dec0_week_of_year(numeric_system, pad_type) {}
-  void on_dec1_week_of_year(numeric_system, pad_type) {}
-  void on_iso_week_of_year(numeric_system, pad_type) {}
-  void on_day_of_month(numeric_system, pad_type) {}
+  void on_dec_month(numeric_system) {}
+  void on_dec0_week_of_year(numeric_system) {}
+  void on_dec1_week_of_year(numeric_system) {}
+  void on_iso_week_of_year(numeric_system) {}
+  void on_day_of_month(numeric_system) {}
+  void on_day_of_month_space(numeric_system) {}
 
-  void on_day_of_year(pad_type) {
+  void on_day_of_year() {
     if (handle_nan_inf()) return;
     write(days(), 0);
   }
@@ -1891,7 +1940,7 @@ struct chrono_formatter {
         if (buf.size() < 2 || buf[1] == '.') {
           out = detail::write_padding(out, pad);
         }
-        out = copy<char_type>(buf.begin(), buf.end(), out);
+        out = std::copy(buf.begin(), buf.end(), out);
       } else {
         write(second(), 2, pad);
         write_fractional_seconds<char_type>(
@@ -1925,7 +1974,7 @@ struct chrono_formatter {
     on_24_hour_time();
     *out++ = ':';
     if (handle_nan_inf()) return;
-    on_second(numeric_system::standard, pad_type::zero);
+    on_second(numeric_system::standard, pad_type::unspecified);
   }
 
   void on_am_pm() {
@@ -1948,240 +1997,82 @@ struct chrono_formatter {
 
 #if defined(__cpp_lib_chrono) && __cpp_lib_chrono >= 201907
 using weekday = std::chrono::weekday;
-using day = std::chrono::day;
-using month = std::chrono::month;
-using year = std::chrono::year;
-using year_month_day = std::chrono::year_month_day;
 #else
 // A fallback version of weekday.
 class weekday {
  private:
-  unsigned char value_;
+  unsigned char value;
 
  public:
   weekday() = default;
-  constexpr explicit weekday(unsigned wd) noexcept
-      : value_(static_cast<unsigned char>(wd != 7 ? wd : 0)) {}
-  constexpr auto c_encoding() const noexcept -> unsigned { return value_; }
+  explicit constexpr weekday(unsigned wd) noexcept
+      : value(static_cast<unsigned char>(wd != 7 ? wd : 0)) {}
+  constexpr auto c_encoding() const noexcept -> unsigned { return value; }
 };
 
-class day {
- private:
-  unsigned char value_;
-
- public:
-  day() = default;
-  constexpr explicit day(unsigned d) noexcept
-      : value_(static_cast<unsigned char>(d)) {}
-  constexpr explicit operator unsigned() const noexcept { return value_; }
-};
-
-class month {
- private:
-  unsigned char value_;
-
- public:
-  month() = default;
-  constexpr explicit month(unsigned m) noexcept
-      : value_(static_cast<unsigned char>(m)) {}
-  constexpr explicit operator unsigned() const noexcept { return value_; }
-};
-
-class year {
- private:
-  int value_;
-
- public:
-  year() = default;
-  constexpr explicit year(int y) noexcept : value_(y) {}
-  constexpr explicit operator int() const noexcept { return value_; }
-};
-
-class year_month_day {
- private:
-  fmt::year year_;
-  fmt::month month_;
-  fmt::day day_;
-
- public:
-  year_month_day() = default;
-  constexpr year_month_day(const year& y, const month& m, const day& d) noexcept
-      : year_(y), month_(m), day_(d) {}
-  constexpr auto year() const noexcept -> fmt::year { return year_; }
-  constexpr auto month() const noexcept -> fmt::month { return month_; }
-  constexpr auto day() const noexcept -> fmt::day { return day_; }
-};
+class year_month_day {};
 #endif
 
-template <typename Char>
-struct formatter<weekday, Char> : private formatter<std::tm, Char> {
+// A rudimentary weekday formatter.
+template <typename Char> struct formatter<weekday, Char> {
  private:
-  bool localized_ = false;
-  bool use_tm_formatter_ = false;
+  bool localized = false;
 
  public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    auto it = ctx.begin(), end = ctx.end();
-    if (it != end && *it == 'L') {
-      ++it;
-      localized_ = true;
-      return it;
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto begin = ctx.begin(), end = ctx.end();
+    if (begin != end && *begin == 'L') {
+      ++begin;
+      localized = true;
     }
-    use_tm_formatter_ = it != end && *it != '}';
-    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
+    return begin;
   }
 
   template <typename FormatContext>
   auto format(weekday wd, FormatContext& ctx) const -> decltype(ctx.out()) {
     auto time = std::tm();
     time.tm_wday = static_cast<int>(wd.c_encoding());
-    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
-    detail::get_locale loc(localized_, ctx.locale());
+    detail::get_locale loc(localized, ctx.locale());
     auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
     w.on_abbr_weekday();
     return w.out();
   }
 };
 
-template <typename Char>
-struct formatter<day, Char> : private formatter<std::tm, Char> {
- private:
-  bool use_tm_formatter_ = false;
-
- public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    auto it = ctx.begin(), end = ctx.end();
-    use_tm_formatter_ = it != end && *it != '}';
-    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
-  }
-
-  template <typename FormatContext>
-  auto format(day d, FormatContext& ctx) const -> decltype(ctx.out()) {
-    auto time = std::tm();
-    time.tm_mday = static_cast<int>(static_cast<unsigned>(d));
-    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
-    detail::get_locale loc(false, ctx.locale());
-    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
-    w.on_day_of_month(detail::numeric_system::standard, detail::pad_type::zero);
-    return w.out();
-  }
-};
-
-template <typename Char>
-struct formatter<month, Char> : private formatter<std::tm, Char> {
- private:
-  bool localized_ = false;
-  bool use_tm_formatter_ = false;
-
- public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    auto it = ctx.begin(), end = ctx.end();
-    if (it != end && *it == 'L') {
-      ++it;
-      localized_ = true;
-      return it;
-    }
-    use_tm_formatter_ = it != end && *it != '}';
-    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
-  }
-
-  template <typename FormatContext>
-  auto format(month m, FormatContext& ctx) const -> decltype(ctx.out()) {
-    auto time = std::tm();
-    time.tm_mon = static_cast<int>(static_cast<unsigned>(m)) - 1;
-    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
-    detail::get_locale loc(localized_, ctx.locale());
-    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
-    w.on_abbr_month();
-    return w.out();
-  }
-};
-
-template <typename Char>
-struct formatter<year, Char> : private formatter<std::tm, Char> {
- private:
-  bool use_tm_formatter_ = false;
-
- public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    auto it = ctx.begin(), end = ctx.end();
-    use_tm_formatter_ = it != end && *it != '}';
-    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
-  }
-
-  template <typename FormatContext>
-  auto format(year y, FormatContext& ctx) const -> decltype(ctx.out()) {
-    auto time = std::tm();
-    time.tm_year = static_cast<int>(y) - 1900;
-    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
-    detail::get_locale loc(false, ctx.locale());
-    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
-    w.on_year(detail::numeric_system::standard, detail::pad_type::zero);
-    return w.out();
-  }
-};
-
-template <typename Char>
-struct formatter<year_month_day, Char> : private formatter<std::tm, Char> {
- private:
-  bool use_tm_formatter_ = false;
-
- public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    auto it = ctx.begin(), end = ctx.end();
-    use_tm_formatter_ = it != end && *it != '}';
-    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
-  }
-
-  template <typename FormatContext>
-  auto format(year_month_day val, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    auto time = std::tm();
-    time.tm_year = static_cast<int>(val.year()) - 1900;
-    time.tm_mon = static_cast<int>(static_cast<unsigned>(val.month())) - 1;
-    time.tm_mday = static_cast<int>(static_cast<unsigned>(val.day()));
-    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
-    detail::get_locale loc(true, ctx.locale());
-    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
-    w.on_iso_date();
-    return w.out();
-  }
-};
-
 template <typename Rep, typename Period, typename Char>
 struct formatter<std::chrono::duration<Rep, Period>, Char> {
  private:
-  format_specs specs_;
+  format_specs<Char> specs_;
   detail::arg_ref<Char> width_ref_;
   detail::arg_ref<Char> precision_ref_;
   bool localized_ = false;
-  basic_string_view<Char> fmt_;
+  basic_string_view<Char> format_str_;
 
  public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
     auto it = ctx.begin(), end = ctx.end();
     if (it == end || *it == '}') return it;
 
     it = detail::parse_align(it, end, specs_);
     if (it == end) return it;
 
-    Char c = *it;
-    if ((c >= '0' && c <= '9') || c == '{') {
-      it = detail::parse_width(it, end, specs_, width_ref_, ctx);
-      if (it == end) return it;
-    }
+    it = detail::parse_dynamic_spec(it, end, specs_.width, width_ref_, ctx);
+    if (it == end) return it;
 
     auto checker = detail::chrono_format_checker();
     if (*it == '.') {
       checker.has_precision_integral = !std::is_floating_point<Rep>::value;
-      it = detail::parse_precision(it, end, specs_, precision_ref_, ctx);
+      it = detail::parse_precision(it, end, specs_.precision, precision_ref_,
+                                   ctx);
     }
     if (it != end && *it == 'L') {
       localized_ = true;
       ++it;
     }
     end = detail::parse_chrono_format(it, end, checker);
-    fmt_ = {it, detail::to_unsigned(end - it)};
+    format_str_ = {it, detail::to_unsigned(end - it)};
     return end;
   }
 
@@ -2191,15 +2082,15 @@ struct formatter<std::chrono::duration<Rep, Period>, Char> {
     auto specs = specs_;
     auto precision = specs.precision;
     specs.precision = -1;
-    auto begin = fmt_.begin(), end = fmt_.end();
+    auto begin = format_str_.begin(), end = format_str_.end();
     // As a possible future optimization, we could avoid extra copying if width
     // is not specified.
     auto buf = basic_memory_buffer<Char>();
-    auto out = basic_appender<Char>(buf);
-    detail::handle_dynamic_spec(specs.dynamic_width(), specs.width, width_ref_,
-                                ctx);
-    detail::handle_dynamic_spec(specs.dynamic_precision(), precision,
-                                precision_ref_, ctx);
+    auto out = std::back_inserter(buf);
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref_,
+                                                       ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(precision,
+                                                           precision_ref_, ctx);
     if (begin == end || *begin == '}') {
       out = detail::format_duration_value<Char>(out, d.count(), precision);
       detail::format_duration_unit<Char, Period>(out);
@@ -2216,119 +2107,130 @@ struct formatter<std::chrono::duration<Rep, Period>, Char> {
   }
 };
 
+template <typename Char, typename Duration>
+struct formatter<std::chrono::time_point<std::chrono::system_clock, Duration>,
+                 Char> : formatter<std::tm, Char> {
+  FMT_CONSTEXPR formatter() {
+    this->format_str_ = detail::string_literal<Char, '%', 'F', ' ', '%', 'T'>{};
+  }
+
+  template <typename FormatContext>
+  auto format(std::chrono::time_point<std::chrono::system_clock, Duration> val,
+              FormatContext& ctx) const -> decltype(ctx.out()) {
+    using period = typename Duration::period;
+    if (detail::const_check(
+            period::num != 1 || period::den != 1 ||
+            std::is_floating_point<typename Duration::rep>::value)) {
+      const auto epoch = val.time_since_epoch();
+      auto subsecs = detail::fmt_duration_cast<Duration>(
+          epoch - detail::fmt_duration_cast<std::chrono::seconds>(epoch));
+
+      if (subsecs.count() < 0) {
+        auto second =
+            detail::fmt_duration_cast<Duration>(std::chrono::seconds(1));
+        if (epoch.count() < ((Duration::min)() + second).count())
+          FMT_THROW(format_error("duration is too small"));
+        subsecs += second;
+        val -= second;
+      }
+
+      return formatter<std::tm, Char>::do_format(gmtime(val), ctx, &subsecs);
+    }
+
+    return formatter<std::tm, Char>::format(gmtime(val), ctx);
+  }
+};
+
+#if FMT_USE_LOCAL_TIME
+template <typename Char, typename Duration>
+struct formatter<std::chrono::local_time<Duration>, Char>
+    : formatter<std::tm, Char> {
+  FMT_CONSTEXPR formatter() {
+    this->format_str_ = detail::string_literal<Char, '%', 'F', ' ', '%', 'T'>{};
+  }
+
+  template <typename FormatContext>
+  auto format(std::chrono::local_time<Duration> val, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    using period = typename Duration::period;
+    if (period::num != 1 || period::den != 1 ||
+        std::is_floating_point<typename Duration::rep>::value) {
+      const auto epoch = val.time_since_epoch();
+      const auto subsecs = detail::fmt_duration_cast<Duration>(
+          epoch - detail::fmt_duration_cast<std::chrono::seconds>(epoch));
+
+      return formatter<std::tm, Char>::do_format(localtime(val), ctx, &subsecs);
+    }
+
+    return formatter<std::tm, Char>::format(localtime(val), ctx);
+  }
+};
+#endif
+
+#if FMT_USE_UTC_TIME
+template <typename Char, typename Duration>
+struct formatter<std::chrono::time_point<std::chrono::utc_clock, Duration>,
+                 Char>
+    : formatter<std::chrono::time_point<std::chrono::system_clock, Duration>,
+                Char> {
+  template <typename FormatContext>
+  auto format(std::chrono::time_point<std::chrono::utc_clock, Duration> val,
+              FormatContext& ctx) const -> decltype(ctx.out()) {
+    return formatter<
+        std::chrono::time_point<std::chrono::system_clock, Duration>,
+        Char>::format(std::chrono::utc_clock::to_sys(val), ctx);
+  }
+};
+#endif
+
 template <typename Char> struct formatter<std::tm, Char> {
  private:
-  format_specs specs_;
+  format_specs<Char> specs_;
   detail::arg_ref<Char> width_ref_;
 
  protected:
-  basic_string_view<Char> fmt_;
+  basic_string_view<Char> format_str_;
 
-  template <typename Duration, typename FormatContext>
+  template <typename FormatContext, typename Duration>
   auto do_format(const std::tm& tm, FormatContext& ctx,
                  const Duration* subsecs) const -> decltype(ctx.out()) {
     auto specs = specs_;
     auto buf = basic_memory_buffer<Char>();
-    auto out = basic_appender<Char>(buf);
-    detail::handle_dynamic_spec(specs.dynamic_width(), specs.width, width_ref_,
-                                ctx);
+    auto out = std::back_inserter(buf);
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref_,
+                                                       ctx);
 
     auto loc_ref = ctx.locale();
     detail::get_locale loc(static_cast<bool>(loc_ref), loc_ref);
     auto w =
         detail::tm_writer<decltype(out), Char, Duration>(loc, out, tm, subsecs);
-    detail::parse_chrono_format(fmt_.begin(), fmt_.end(), w);
+    detail::parse_chrono_format(format_str_.begin(), format_str_.end(), w);
     return detail::write(
         ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs);
   }
 
  public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
     auto it = ctx.begin(), end = ctx.end();
     if (it == end || *it == '}') return it;
 
     it = detail::parse_align(it, end, specs_);
     if (it == end) return it;
 
-    Char c = *it;
-    if ((c >= '0' && c <= '9') || c == '{') {
-      it = detail::parse_width(it, end, specs_, width_ref_, ctx);
-      if (it == end) return it;
-    }
+    it = detail::parse_dynamic_spec(it, end, specs_.width, width_ref_, ctx);
+    if (it == end) return it;
 
     end = detail::parse_chrono_format(it, end, detail::tm_format_checker());
-    // Replace the default format string only if the new spec is not empty.
-    if (end != it) fmt_ = {it, detail::to_unsigned(end - it)};
+    // Replace the default format_str only if the new spec is not empty.
+    if (end != it) format_str_ = {it, detail::to_unsigned(end - it)};
     return end;
   }
 
   template <typename FormatContext>
   auto format(const std::tm& tm, FormatContext& ctx) const
       -> decltype(ctx.out()) {
-    return do_format<std::chrono::seconds>(tm, ctx, nullptr);
-  }
-};
-
-template <typename Char, typename Duration>
-struct formatter<sys_time<Duration>, Char> : formatter<std::tm, Char> {
-  FMT_CONSTEXPR formatter() {
-    this->fmt_ = detail::string_literal<Char, '%', 'F', ' ', '%', 'T'>();
-  }
-
-  template <typename FormatContext>
-  auto format(sys_time<Duration> val, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    std::tm tm = gmtime(val);
-    using period = typename Duration::period;
-    if (detail::const_check(
-            period::num == 1 && period::den == 1 &&
-            !std::is_floating_point<typename Duration::rep>::value)) {
-      return formatter<std::tm, Char>::format(tm, ctx);
-    }
-    Duration epoch = val.time_since_epoch();
-    Duration subsecs = detail::duration_cast<Duration>(
-        epoch - detail::duration_cast<std::chrono::seconds>(epoch));
-    if (subsecs.count() < 0) {
-      auto second = detail::duration_cast<Duration>(std::chrono::seconds(1));
-      if (tm.tm_sec != 0)
-        --tm.tm_sec;
-      else
-        tm = gmtime(val - second);
-      subsecs += detail::duration_cast<Duration>(std::chrono::seconds(1));
-    }
-    return formatter<std::tm, Char>::do_format(tm, ctx, &subsecs);
-  }
-};
-
-template <typename Duration, typename Char>
-struct formatter<utc_time<Duration>, Char>
-    : formatter<sys_time<Duration>, Char> {
-  template <typename FormatContext>
-  auto format(utc_time<Duration> val, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    return formatter<sys_time<Duration>, Char>::format(
-        detail::utc_clock::to_sys(val), ctx);
-  }
-};
-
-template <typename Duration, typename Char>
-struct formatter<local_time<Duration>, Char> : formatter<std::tm, Char> {
-  FMT_CONSTEXPR formatter() {
-    this->fmt_ = detail::string_literal<Char, '%', 'F', ' ', '%', 'T'>();
-  }
-
-  template <typename FormatContext>
-  auto format(local_time<Duration> val, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    using period = typename Duration::period;
-    if (period::num == 1 && period::den == 1 &&
-        !std::is_floating_point<typename Duration::rep>::value) {
-      return formatter<std::tm, Char>::format(localtime(val), ctx);
-    }
-    auto epoch = val.time_since_epoch();
-    auto subsecs = detail::duration_cast<Duration>(
-        epoch - detail::duration_cast<std::chrono::seconds>(epoch));
-    return formatter<std::tm, Char>::do_format(localtime(val), ctx, &subsecs);
+    return do_format<FormatContext, std::chrono::seconds>(tm, ctx, nullptr);
   }
 };
 
diff --git a/src/fmt/color.h b/src/fmt/color.h
index 2faaf3a067..464519e582 100644
--- a/src/fmt/color.h
+++ b/src/fmt/color.h
@@ -227,7 +227,7 @@ struct color_type {
 };
 }  // namespace detail
 
-/// A text style consisting of foreground and background colors and emphasis.
+/** A text style consisting of foreground and background colors and emphasis. */
 class text_style {
  public:
   FMT_CONSTEXPR text_style(emphasis em = emphasis()) noexcept
@@ -239,7 +239,7 @@ class text_style {
       foreground_color = rhs.foreground_color;
     } else if (rhs.set_foreground_color) {
       if (!foreground_color.is_rgb || !rhs.foreground_color.is_rgb)
-        report_error("can't OR a terminal color");
+        FMT_THROW(format_error("can't OR a terminal color"));
       foreground_color.value.rgb_color |= rhs.foreground_color.value.rgb_color;
     }
 
@@ -248,7 +248,7 @@ class text_style {
       background_color = rhs.background_color;
     } else if (rhs.set_background_color) {
       if (!background_color.is_rgb || !rhs.background_color.is_rgb)
-        report_error("can't OR a terminal color");
+        FMT_THROW(format_error("can't OR a terminal color"));
       background_color.value.rgb_color |= rhs.background_color.value.rgb_color;
     }
 
@@ -310,13 +310,13 @@ class text_style {
   emphasis ems;
 };
 
-/// Creates a text style from the foreground (text) color.
+/** Creates a text style from the foreground (text) color. */
 FMT_CONSTEXPR inline auto fg(detail::color_type foreground) noexcept
     -> text_style {
   return text_style(true, foreground);
 }
 
-/// Creates a text style from the background color.
+/** Creates a text style from the background color. */
 FMT_CONSTEXPR inline auto bg(detail::color_type background) noexcept
     -> text_style {
   return text_style(false, background);
@@ -330,7 +330,7 @@ FMT_CONSTEXPR inline auto operator|(emphasis lhs, emphasis rhs) noexcept
 namespace detail {
 
 template <typename Char> struct ansi_color_escape {
-  FMT_CONSTEXPR ansi_color_escape(color_type text_color,
+  FMT_CONSTEXPR ansi_color_escape(detail::color_type text_color,
                                   const char* esc) noexcept {
     // If we have a terminal color, we need to output another escape code
     // sequence.
@@ -391,7 +391,7 @@ template <typename Char> struct ansi_color_escape {
 
   FMT_CONSTEXPR auto begin() const noexcept -> const Char* { return buffer; }
   FMT_CONSTEXPR20 auto end() const noexcept -> const Char* {
-    return buffer + basic_string_view<Char>(buffer).size();
+    return buffer + std::char_traits<Char>::length(buffer);
   }
 
  private:
@@ -412,13 +412,13 @@ template <typename Char> struct ansi_color_escape {
 };
 
 template <typename Char>
-FMT_CONSTEXPR auto make_foreground_color(color_type foreground) noexcept
+FMT_CONSTEXPR auto make_foreground_color(detail::color_type foreground) noexcept
     -> ansi_color_escape<Char> {
   return ansi_color_escape<Char>(foreground, "\x1b[38;2;");
 }
 
 template <typename Char>
-FMT_CONSTEXPR auto make_background_color(color_type background) noexcept
+FMT_CONSTEXPR auto make_background_color(detail::color_type background) noexcept
     -> ansi_color_escape<Char> {
   return ansi_color_escape<Char>(background, "\x1b[48;2;");
 }
@@ -434,7 +434,7 @@ template <typename Char> inline void reset_color(buffer<Char>& buffer) {
   buffer.append(reset_color.begin(), reset_color.end());
 }
 
-template <typename T> struct styled_arg : view {
+template <typename T> struct styled_arg : detail::view {
   const T& value;
   text_style style;
   styled_arg(const T& v, text_style s) : value(v), style(s) {}
@@ -442,115 +442,145 @@ template <typename T> struct styled_arg : view {
 
 template <typename Char>
 void vformat_to(buffer<Char>& buf, const text_style& ts,
-                basic_string_view<Char> fmt,
-                basic_format_args<buffered_context<Char>> args) {
+                basic_string_view<Char> format_str,
+                basic_format_args<buffer_context<type_identity_t<Char>>> args) {
   bool has_style = false;
   if (ts.has_emphasis()) {
     has_style = true;
-    auto emphasis = make_emphasis<Char>(ts.get_emphasis());
+    auto emphasis = detail::make_emphasis<Char>(ts.get_emphasis());
     buf.append(emphasis.begin(), emphasis.end());
   }
   if (ts.has_foreground()) {
     has_style = true;
-    auto foreground = make_foreground_color<Char>(ts.get_foreground());
+    auto foreground = detail::make_foreground_color<Char>(ts.get_foreground());
     buf.append(foreground.begin(), foreground.end());
   }
   if (ts.has_background()) {
     has_style = true;
-    auto background = make_background_color<Char>(ts.get_background());
+    auto background = detail::make_background_color<Char>(ts.get_background());
     buf.append(background.begin(), background.end());
   }
-  vformat_to(buf, fmt, args);
-  if (has_style) reset_color<Char>(buf);
+  detail::vformat_to(buf, format_str, args, {});
+  if (has_style) detail::reset_color<Char>(buf);
 }
+
 }  // namespace detail
 
-inline void vprint(FILE* f, const text_style& ts, string_view fmt,
+inline void vprint(std::FILE* f, const text_style& ts, string_view fmt,
                    format_args args) {
+  // Legacy wide streams are not supported.
   auto buf = memory_buffer();
   detail::vformat_to(buf, ts, fmt, args);
-  print(f, FMT_STRING("{}"), string_view(buf.begin(), buf.size()));
+  if (detail::is_utf8()) {
+    detail::print(f, string_view(buf.begin(), buf.size()));
+    return;
+  }
+  buf.push_back('\0');
+  int result = std::fputs(buf.data(), f);
+  if (result < 0)
+    FMT_THROW(system_error(errno, FMT_STRING("cannot write to file")));
 }
 
 /**
- * Formats a string and prints it to the specified file stream using ANSI
- * escape sequences to specify text formatting.
- *
- * **Example**:
- *
- *     fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
- *                "Elapsed time: {0:.2f} seconds", 1.23);
+  \rst
+  Formats a string and prints it to the specified file stream using ANSI
+  escape sequences to specify text formatting.
+
+  **Example**::
+
+    fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
+               "Elapsed time: {0:.2f} seconds", 1.23);
+  \endrst
  */
-template <typename... T>
-void print(FILE* f, const text_style& ts, format_string<T...> fmt,
-           T&&... args) {
-  vprint(f, ts, fmt.str, vargs<T...>{{args...}});
+template <typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_string<S>::value)>
+void print(std::FILE* f, const text_style& ts, const S& format_str,
+           const Args&... args) {
+  vprint(f, ts, format_str,
+         fmt::make_format_args<buffer_context<char_t<S>>>(args...));
 }
 
 /**
- * Formats a string and prints it to stdout using ANSI escape sequences to
- * specify text formatting.
- *
- * **Example**:
- *
- *     fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
- *                "Elapsed time: {0:.2f} seconds", 1.23);
+  \rst
+  Formats a string and prints it to stdout using ANSI escape sequences to
+  specify text formatting.
+
+  **Example**::
+
+    fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
+               "Elapsed time: {0:.2f} seconds", 1.23);
+  \endrst
  */
-template <typename... T>
-void print(const text_style& ts, format_string<T...> fmt, T&&... args) {
-  return print(stdout, ts, fmt, std::forward<T>(args)...);
+template <typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_string<S>::value)>
+void print(const text_style& ts, const S& format_str, const Args&... args) {
+  return print(stdout, ts, format_str, args...);
 }
 
-inline auto vformat(const text_style& ts, string_view fmt, format_args args)
-    -> std::string {
-  auto buf = memory_buffer();
-  detail::vformat_to(buf, ts, fmt, args);
+template <typename S, typename Char = char_t<S>>
+inline auto vformat(
+    const text_style& ts, const S& format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> std::basic_string<Char> {
+  basic_memory_buffer<Char> buf;
+  detail::vformat_to(buf, ts, detail::to_string_view(format_str), args);
   return fmt::to_string(buf);
 }
 
 /**
- * Formats arguments and returns the result as a string using ANSI escape
- * sequences to specify text formatting.
- *
- * **Example**:
- *
- * ```
- * #include <fmt/color.h>
- * std::string message = fmt::format(fmt::emphasis::bold | fg(fmt::color::red),
- *                                   "The answer is {}", 42);
- * ```
- */
-template <typename... T>
-inline auto format(const text_style& ts, format_string<T...> fmt, T&&... args)
-    -> std::string {
-  return fmt::vformat(ts, fmt.str, vargs<T...>{{args...}});
+  \rst
+  Formats arguments and returns the result as a string using ANSI
+  escape sequences to specify text formatting.
+
+  **Example**::
+
+    #include <fmt/color.h>
+    std::string message = fmt::format(fmt::emphasis::bold | fg(fmt::color::red),
+                                      "The answer is {}", 42);
+  \endrst
+*/
+template <typename S, typename... Args, typename Char = char_t<S>>
+inline auto format(const text_style& ts, const S& format_str,
+                   const Args&... args) -> std::basic_string<Char> {
+  return fmt::vformat(ts, detail::to_string_view(format_str),
+                      fmt::make_format_args<buffer_context<Char>>(args...));
 }
 
-/// Formats a string with the given text_style and writes the output to `out`.
-template <typename OutputIt,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
-auto vformat_to(OutputIt out, const text_style& ts, string_view fmt,
-                format_args args) -> OutputIt {
-  auto&& buf = detail::get_buffer<char>(out);
-  detail::vformat_to(buf, ts, fmt, args);
+/**
+  Formats a string with the given text_style and writes the output to ``out``.
+ */
+template <typename OutputIt, typename Char,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
+auto vformat_to(OutputIt out, const text_style& ts,
+                basic_string_view<Char> format_str,
+                basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> OutputIt {
+  auto&& buf = detail::get_buffer<Char>(out);
+  detail::vformat_to(buf, ts, format_str, args);
   return detail::get_iterator(buf, out);
 }
 
 /**
- * Formats arguments with the given text style, writes the result to the output
- * iterator `out` and returns the iterator past the end of the output range.
- *
- * **Example**:
- *
- *     std::vector<char> out;
- *     fmt::format_to(std::back_inserter(out),
- *                    fmt::emphasis::bold | fg(fmt::color::red), "{}", 42);
- */
-template <typename OutputIt, typename... T,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
-inline auto format_to(OutputIt out, const text_style& ts,
-                      format_string<T...> fmt, T&&... args) -> OutputIt {
-  return vformat_to(out, ts, fmt.str, vargs<T...>{{args...}});
+  \rst
+  Formats arguments with the given text_style, writes the result to the output
+  iterator ``out`` and returns the iterator past the end of the output range.
+
+  **Example**::
+
+    std::vector<char> out;
+    fmt::format_to(std::back_inserter(out),
+                   fmt::emphasis::bold | fg(fmt::color::red), "{}", 42);
+  \endrst
+*/
+template <
+    typename OutputIt, typename S, typename... Args,
+    bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value &&
+                  detail::is_string<S>::value>
+inline auto format_to(OutputIt out, const text_style& ts, const S& format_str,
+                      Args&&... args) ->
+    typename std::enable_if<enable, OutputIt>::type {
+  return vformat_to(out, ts, detail::to_string_view(format_str),
+                    fmt::make_format_args<buffer_context<char_t<S>>>(args...));
 }
 
 template <typename T, typename Char>
@@ -559,44 +589,47 @@ struct formatter<detail::styled_arg<T>, Char> : formatter<T, Char> {
   auto format(const detail::styled_arg<T>& arg, FormatContext& ctx) const
       -> decltype(ctx.out()) {
     const auto& ts = arg.style;
+    const auto& value = arg.value;
     auto out = ctx.out();
 
     bool has_style = false;
     if (ts.has_emphasis()) {
       has_style = true;
       auto emphasis = detail::make_emphasis<Char>(ts.get_emphasis());
-      out = detail::copy<Char>(emphasis.begin(), emphasis.end(), out);
+      out = std::copy(emphasis.begin(), emphasis.end(), out);
     }
     if (ts.has_foreground()) {
       has_style = true;
       auto foreground =
           detail::make_foreground_color<Char>(ts.get_foreground());
-      out = detail::copy<Char>(foreground.begin(), foreground.end(), out);
+      out = std::copy(foreground.begin(), foreground.end(), out);
     }
     if (ts.has_background()) {
       has_style = true;
       auto background =
           detail::make_background_color<Char>(ts.get_background());
-      out = detail::copy<Char>(background.begin(), background.end(), out);
+      out = std::copy(background.begin(), background.end(), out);
     }
-    out = formatter<T, Char>::format(arg.value, ctx);
+    out = formatter<T, Char>::format(value, ctx);
     if (has_style) {
       auto reset_color = string_view("\x1b[0m");
-      out = detail::copy<Char>(reset_color.begin(), reset_color.end(), out);
+      out = std::copy(reset_color.begin(), reset_color.end(), out);
     }
     return out;
   }
 };
 
 /**
- * Returns an argument that will be formatted using ANSI escape sequences,
- * to be used in a formatting function.
- *
- * **Example**:
- *
- *     fmt::print("Elapsed time: {0:.2f} seconds",
- *                fmt::styled(1.23, fmt::fg(fmt::color::green) |
- *                                  fmt::bg(fmt::color::blue)));
+  \rst
+  Returns an argument that will be formatted using ANSI escape sequences,
+  to be used in a formatting function.
+
+  **Example**::
+
+    fmt::print("Elapsed time: {0:.2f} seconds",
+               fmt::styled(1.23, fmt::fg(fmt::color::green) |
+                                 fmt::bg(fmt::color::blue)));
+  \endrst
  */
 template <typename T>
 FMT_CONSTEXPR auto styled(const T& value, text_style ts)
diff --git a/src/fmt/compile.h b/src/fmt/compile.h
index 68b451c71d..71fa69c67e 100644
--- a/src/fmt/compile.h
+++ b/src/fmt/compile.h
@@ -8,44 +8,49 @@
 #ifndef FMT_COMPILE_H_
 #define FMT_COMPILE_H_
 
-#ifndef FMT_MODULE
-#  include <iterator>  // std::back_inserter
-#endif
-
 #include "format.h"
 
 FMT_BEGIN_NAMESPACE
+namespace detail {
+
+template <typename Char, typename InputIt>
+FMT_CONSTEXPR inline auto copy_str(InputIt begin, InputIt end,
+                                   counting_iterator it) -> counting_iterator {
+  return it + (end - begin);
+}
 
 // A compile-time string which is compiled into fast formatting code.
-FMT_EXPORT class compiled_string {};
-
-namespace detail {
+class compiled_string {};
 
 template <typename S>
 struct is_compiled_string : std::is_base_of<compiled_string, S> {};
 
 /**
- * Converts a string literal `s` into a format string that will be parsed at
- * compile time and converted into efficient formatting code. Requires C++17
- * `constexpr if` compiler support.
- *
- * **Example**:
- *
- *     // Converts 42 into std::string using the most efficient method and no
- *     // runtime format string processing.
- *     std::string s = fmt::format(FMT_COMPILE("{}"), 42);
+  \rst
+  Converts a string literal *s* into a format string that will be parsed at
+  compile time and converted into efficient formatting code. Requires C++17
+  ``constexpr if`` compiler support.
+
+  **Example**::
+
+    // Converts 42 into std::string using the most efficient method and no
+    // runtime format string processing.
+    std::string s = fmt::format(FMT_COMPILE("{}"), 42);
+  \endrst
  */
 #if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
-#  define FMT_COMPILE(s) FMT_STRING_IMPL(s, fmt::compiled_string)
+#  define FMT_COMPILE(s) \
+    FMT_STRING_IMPL(s, fmt::detail::compiled_string, explicit)
 #else
 #  define FMT_COMPILE(s) FMT_STRING(s)
 #endif
 
 #if FMT_USE_NONTYPE_TEMPLATE_ARGS
-template <typename Char, size_t N, fmt::detail::fixed_string<Char, N> Str>
+template <typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
 struct udl_compiled_string : compiled_string {
   using char_type = Char;
-  constexpr explicit operator basic_string_view<char_type>() const {
+  explicit constexpr operator basic_string_view<char_type>() const {
     return {Str.data, N - 1};
   }
 };
@@ -56,7 +61,8 @@ auto first(const T& value, const Tail&...) -> const T& {
   return value;
 }
 
-#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
+// LAMMPS customization: only use 'if constexpr' with C++17
+#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction) && (FMT_CPLUSPLUS >= 201703L)
 template <typename... Args> struct type_list {};
 
 // Returns a reference to the argument at index N from [first, rest...].
@@ -70,29 +76,6 @@ constexpr const auto& get([[maybe_unused]] const T& first,
     return detail::get<N - 1>(rest...);
 }
 
-#  if FMT_USE_NONTYPE_TEMPLATE_ARGS
-template <int N, typename T, typename... Args, typename Char>
-constexpr auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
-  if constexpr (is_static_named_arg<T>()) {
-    if (name == T::name) return N;
-  }
-  if constexpr (sizeof...(Args) > 0)
-    return get_arg_index_by_name<N + 1, Args...>(name);
-  (void)name;  // Workaround an MSVC bug about "unused" parameter.
-  return -1;
-}
-#  endif
-
-template <typename... Args, typename Char>
-FMT_CONSTEXPR auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
-#  if FMT_USE_NONTYPE_TEMPLATE_ARGS
-  if constexpr (sizeof...(Args) > 0)
-    return get_arg_index_by_name<0, Args...>(name);
-#  endif
-  (void)name;
-  return -1;
-}
-
 template <typename Char, typename... Args>
 constexpr int get_arg_index_by_name(basic_string_view<Char> name,
                                     type_list<Args...>) {
@@ -162,12 +145,11 @@ template <typename Char, typename T, int N> struct field {
   template <typename OutputIt, typename... Args>
   constexpr OutputIt format(OutputIt out, const Args&... args) const {
     const T& arg = get_arg_checked<T, N>(args...);
-    if constexpr (std::is_convertible<T, basic_string_view<Char>>::value) {
+    if constexpr (std::is_convertible_v<T, basic_string_view<Char>>) {
       auto s = basic_string_view<Char>(arg);
-      return copy<Char>(s.begin(), s.end(), out);
-    } else {
-      return write<Char>(out, arg);
+      return copy_str<Char>(s.begin(), s.end(), out);
     }
+    return write<Char>(out, arg);
   }
 };
 
@@ -255,12 +237,13 @@ constexpr size_t parse_text(basic_string_view<Char> str, size_t pos) {
 }
 
 template <typename Args, size_t POS, int ID, typename S>
-constexpr auto compile_format_string(S fmt);
+constexpr auto compile_format_string(S format_str);
 
 template <typename Args, size_t POS, int ID, typename T, typename S>
-constexpr auto parse_tail(T head, S fmt) {
-  if constexpr (POS != basic_string_view<typename S::char_type>(fmt).size()) {
-    constexpr auto tail = compile_format_string<Args, POS, ID>(fmt);
+constexpr auto parse_tail(T head, S format_str) {
+  if constexpr (POS !=
+                basic_string_view<typename S::char_type>(format_str).size()) {
+    constexpr auto tail = compile_format_string<Args, POS, ID>(format_str);
     if constexpr (std::is_same<remove_cvref_t<decltype(tail)>,
                                unknown_format>())
       return tail;
@@ -292,7 +275,6 @@ constexpr parse_specs_result<T, Char> parse_specs(basic_string_view<Char> str,
 }
 
 template <typename Char> struct arg_id_handler {
-  arg_id_kind kind;
   arg_ref<Char> arg_id;
 
   constexpr int on_auto() {
@@ -300,28 +282,25 @@ template <typename Char> struct arg_id_handler {
     return 0;
   }
   constexpr int on_index(int id) {
-    kind = arg_id_kind::index;
     arg_id = arg_ref<Char>(id);
     return 0;
   }
   constexpr int on_name(basic_string_view<Char> id) {
-    kind = arg_id_kind::name;
     arg_id = arg_ref<Char>(id);
     return 0;
   }
 };
 
 template <typename Char> struct parse_arg_id_result {
-  arg_id_kind kind;
   arg_ref<Char> arg_id;
   const Char* arg_id_end;
 };
 
 template <int ID, typename Char>
 constexpr auto parse_arg_id(const Char* begin, const Char* end) {
-  auto handler = arg_id_handler<Char>{arg_id_kind::none, arg_ref<Char>{}};
+  auto handler = arg_id_handler<Char>{arg_ref<Char>{}};
   auto arg_id_end = parse_arg_id(begin, end, handler);
-  return parse_arg_id_result<Char>{handler.kind, handler.arg_id, arg_id_end};
+  return parse_arg_id_result<Char>{handler.arg_id, arg_id_end};
 }
 
 template <typename T, typename Enable = void> struct field_type {
@@ -335,13 +314,14 @@ struct field_type<T, enable_if_t<detail::is_named_arg<T>::value>> {
 
 template <typename T, typename Args, size_t END_POS, int ARG_INDEX, int NEXT_ID,
           typename S>
-constexpr auto parse_replacement_field_then_tail(S fmt) {
+constexpr auto parse_replacement_field_then_tail(S format_str) {
   using char_type = typename S::char_type;
-  constexpr auto str = basic_string_view<char_type>(fmt);
+  constexpr auto str = basic_string_view<char_type>(format_str);
   constexpr char_type c = END_POS != str.size() ? str[END_POS] : char_type();
   if constexpr (c == '}') {
     return parse_tail<Args, END_POS + 1, NEXT_ID>(
-        field<char_type, typename field_type<T>::type, ARG_INDEX>(), fmt);
+        field<char_type, typename field_type<T>::type, ARG_INDEX>(),
+        format_str);
   } else if constexpr (c != ':') {
     FMT_THROW(format_error("expected ':'"));
   } else {
@@ -354,7 +334,7 @@ constexpr auto parse_replacement_field_then_tail(S fmt) {
       return parse_tail<Args, result.end + 1, result.next_arg_id>(
           spec_field<char_type, typename field_type<T>::type, ARG_INDEX>{
               result.fmt},
-          fmt);
+          format_str);
     }
   }
 }
@@ -362,21 +342,22 @@ constexpr auto parse_replacement_field_then_tail(S fmt) {
 // Compiles a non-empty format string and returns the compiled representation
 // or unknown_format() on unrecognized input.
 template <typename Args, size_t POS, int ID, typename S>
-constexpr auto compile_format_string(S fmt) {
+constexpr auto compile_format_string(S format_str) {
   using char_type = typename S::char_type;
-  constexpr auto str = basic_string_view<char_type>(fmt);
+  constexpr auto str = basic_string_view<char_type>(format_str);
   if constexpr (str[POS] == '{') {
     if constexpr (POS + 1 == str.size())
       FMT_THROW(format_error("unmatched '{' in format string"));
     if constexpr (str[POS + 1] == '{') {
-      return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), fmt);
+      return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), format_str);
     } else if constexpr (str[POS + 1] == '}' || str[POS + 1] == ':') {
       static_assert(ID != manual_indexing_id,
                     "cannot switch from manual to automatic argument indexing");
       constexpr auto next_id =
           ID != manual_indexing_id ? ID + 1 : manual_indexing_id;
       return parse_replacement_field_then_tail<get_type<ID, Args>, Args,
-                                               POS + 1, ID, next_id>(fmt);
+                                               POS + 1, ID, next_id>(
+          format_str);
     } else {
       constexpr auto arg_id_result =
           parse_arg_id<ID>(str.data() + POS + 1, str.data() + str.size());
@@ -384,27 +365,28 @@ constexpr auto compile_format_string(S fmt) {
       constexpr char_type c =
           arg_id_end_pos != str.size() ? str[arg_id_end_pos] : char_type();
       static_assert(c == '}' || c == ':', "missing '}' in format string");
-      if constexpr (arg_id_result.kind == arg_id_kind::index) {
+      if constexpr (arg_id_result.arg_id.kind == arg_id_kind::index) {
         static_assert(
             ID == manual_indexing_id || ID == 0,
             "cannot switch from automatic to manual argument indexing");
-        constexpr auto arg_index = arg_id_result.arg_id.index;
+        constexpr auto arg_index = arg_id_result.arg_id.val.index;
         return parse_replacement_field_then_tail<get_type<arg_index, Args>,
                                                  Args, arg_id_end_pos,
                                                  arg_index, manual_indexing_id>(
-            fmt);
-      } else if constexpr (arg_id_result.kind == arg_id_kind::name) {
+            format_str);
+      } else if constexpr (arg_id_result.arg_id.kind == arg_id_kind::name) {
         constexpr auto arg_index =
-            get_arg_index_by_name(arg_id_result.arg_id.name, Args{});
+            get_arg_index_by_name(arg_id_result.arg_id.val.name, Args{});
         if constexpr (arg_index >= 0) {
           constexpr auto next_id =
               ID != manual_indexing_id ? ID + 1 : manual_indexing_id;
           return parse_replacement_field_then_tail<
               decltype(get_type<arg_index, Args>::value), Args, arg_id_end_pos,
-              arg_index, next_id>(fmt);
+              arg_index, next_id>(format_str);
         } else if constexpr (c == '}') {
           return parse_tail<Args, arg_id_end_pos + 1, ID>(
-              runtime_named_field<char_type>{arg_id_result.arg_id.name}, fmt);
+              runtime_named_field<char_type>{arg_id_result.arg_id.val.name},
+              format_str);
         } else if constexpr (c == ':') {
           return unknown_format();  // no type info for specs parsing
         }
@@ -413,26 +395,29 @@ constexpr auto compile_format_string(S fmt) {
   } else if constexpr (str[POS] == '}') {
     if constexpr (POS + 1 == str.size())
       FMT_THROW(format_error("unmatched '}' in format string"));
-    return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), fmt);
+    return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), format_str);
   } else {
     constexpr auto end = parse_text(str, POS + 1);
     if constexpr (end - POS > 1) {
-      return parse_tail<Args, end, ID>(make_text(str, POS, end - POS), fmt);
+      return parse_tail<Args, end, ID>(make_text(str, POS, end - POS),
+                                       format_str);
     } else {
-      return parse_tail<Args, end, ID>(code_unit<char_type>{str[POS]}, fmt);
+      return parse_tail<Args, end, ID>(code_unit<char_type>{str[POS]},
+                                       format_str);
     }
   }
 }
 
 template <typename... Args, typename S,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-constexpr auto compile(S fmt) {
-  constexpr auto str = basic_string_view<typename S::char_type>(fmt);
+constexpr auto compile(S format_str) {
+  constexpr auto str = basic_string_view<typename S::char_type>(format_str);
   if constexpr (str.size() == 0) {
     return detail::make_text(str, 0, 0);
   } else {
     constexpr auto result =
-        detail::compile_format_string<detail::type_list<Args...>, 0, 0>(fmt);
+        detail::compile_format_string<detail::type_list<Args...>, 0, 0>(
+            format_str);
     return result;
   }
 }
@@ -504,40 +489,40 @@ FMT_CONSTEXPR OutputIt format_to(OutputIt out, const S&, Args&&... args) {
 
 template <typename OutputIt, typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-auto format_to_n(OutputIt out, size_t n, const S& fmt, Args&&... args)
+auto format_to_n(OutputIt out, size_t n, const S& format_str, Args&&... args)
     -> format_to_n_result<OutputIt> {
   using traits = detail::fixed_buffer_traits;
   auto buf = detail::iterator_buffer<OutputIt, char, traits>(out, n);
-  fmt::format_to(std::back_inserter(buf), fmt, std::forward<Args>(args)...);
+  fmt::format_to(std::back_inserter(buf), format_str,
+                 std::forward<Args>(args)...);
   return {buf.out(), buf.count()};
 }
 
 template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-FMT_CONSTEXPR20 auto formatted_size(const S& fmt, const Args&... args)
+FMT_CONSTEXPR20 auto formatted_size(const S& format_str, const Args&... args)
     -> size_t {
-  auto buf = detail::counting_buffer<>();
-  fmt::format_to(appender(buf), fmt, args...);
-  return buf.count();
+  return fmt::format_to(detail::counting_iterator(), format_str, args...)
+      .count();
 }
 
 template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-void print(std::FILE* f, const S& fmt, const Args&... args) {
-  auto buf = memory_buffer();
-  fmt::format_to(appender(buf), fmt, args...);
-  detail::print(f, {buf.data(), buf.size()});
+void print(std::FILE* f, const S& format_str, const Args&... args) {
+  memory_buffer buffer;
+  fmt::format_to(std::back_inserter(buffer), format_str, args...);
+  detail::print(f, {buffer.data(), buffer.size()});
 }
 
 template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-void print(const S& fmt, const Args&... args) {
-  print(stdout, fmt, args...);
+void print(const S& format_str, const Args&... args) {
+  print(stdout, format_str, args...);
 }
 
 #if FMT_USE_NONTYPE_TEMPLATE_ARGS
 inline namespace literals {
-template <detail::fixed_string Str> constexpr auto operator""_cf() {
+template <detail_exported::fixed_string Str> constexpr auto operator""_cf() {
   using char_t = remove_cvref_t<decltype(Str.data[0])>;
   return detail::udl_compiled_string<char_t, sizeof(Str.data) / sizeof(char_t),
                                      Str>();
diff --git a/src/fmt/core.h b/src/fmt/core.h
index 8ca735f0c0..6a53b8c52c 100644
--- a/src/fmt/core.h
+++ b/src/fmt/core.h
@@ -1,5 +1,2963 @@
-// This file is only provided for compatibility and may be removed in future
-// versions. Use fmt/base.h if you don't need fmt::format and fmt/format.h
-// otherwise.
+// Formatting library for C++ - the core API for char/UTF-8
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
 
-#include "format.h"
+#ifndef FMT_CORE_H_
+#define FMT_CORE_H_
+
+#include <cstddef>      // std::byte
+#include <cstdio>       // std::FILE
+#include <cstring>      // std::strlen
+#include <limits.h>     // CHAR_BIT
+#include <string>       // std::string
+#include <type_traits>  // std::enable_if
+
+// The fmt library version in the form major * 10000 + minor * 100 + patch.
+#define FMT_VERSION 100200
+
+#if defined(__clang__) && !defined(__ibmxl__)
+#  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
+#else
+#  define FMT_CLANG_VERSION 0
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && \
+    !defined(__NVCOMPILER)
+#  define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#  define FMT_GCC_VERSION 0
+#endif
+
+#ifndef FMT_GCC_PRAGMA
+// Workaround _Pragma bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59884.
+#  if FMT_GCC_VERSION >= 504
+#    define FMT_GCC_PRAGMA(arg) _Pragma(arg)
+#  else
+#    define FMT_GCC_PRAGMA(arg)
+#  endif
+#endif
+
+#ifdef __ICL
+#  define FMT_ICC_VERSION __ICL
+#elif defined(__INTEL_COMPILER)
+#  define FMT_ICC_VERSION __INTEL_COMPILER
+#else
+#  define FMT_ICC_VERSION 0
+#endif
+
+#ifdef _MSC_VER
+#  define FMT_MSC_VERSION _MSC_VER
+#  define FMT_MSC_WARNING(...) __pragma(warning(__VA_ARGS__))
+#else
+#  define FMT_MSC_VERSION 0
+#  define FMT_MSC_WARNING(...)
+#endif
+
+#ifdef _GLIBCXX_RELEASE
+#  define FMT_GLIBCXX_RELEASE _GLIBCXX_RELEASE
+#else
+#  define FMT_GLIBCXX_RELEASE 0
+#endif
+
+#ifdef _MSVC_LANG
+#  define FMT_CPLUSPLUS _MSVC_LANG
+#else
+#  define FMT_CPLUSPLUS __cplusplus
+#endif
+
+#ifdef __has_feature
+#  define FMT_HAS_FEATURE(x) __has_feature(x)
+#else
+#  define FMT_HAS_FEATURE(x) 0
+#endif
+
+#if defined(__has_include) || FMT_ICC_VERSION >= 1600 || FMT_MSC_VERSION > 1900
+#  define FMT_HAS_INCLUDE(x) __has_include(x)
+#else
+#  define FMT_HAS_INCLUDE(x) 0
+#endif
+
+#ifdef __has_cpp_attribute
+#  define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+#  define FMT_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+#define FMT_HAS_CPP14_ATTRIBUTE(attribute) \
+  (FMT_CPLUSPLUS >= 201402L && FMT_HAS_CPP_ATTRIBUTE(attribute))
+
+#define FMT_HAS_CPP17_ATTRIBUTE(attribute) \
+  (FMT_CPLUSPLUS >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute))
+
+#ifndef FMT_DEPRECATED
+#  if FMT_HAS_CPP14_ATTRIBUTE(deprecated) || FMT_MSC_VERSION >= 1900
+#    define FMT_DEPRECATED [[deprecated]]
+#  else
+#    if (defined(__GNUC__) && !defined(__LCC__)) || defined(__clang__)
+#      define FMT_DEPRECATED __attribute__((deprecated))
+#    elif FMT_MSC_VERSION
+#      define FMT_DEPRECATED __declspec(deprecated)
+#    else
+#      define FMT_DEPRECATED /* deprecated */
+#    endif
+#  endif
+#endif
+
+// Check if relaxed C++14 constexpr is supported.
+// GCC doesn't allow throw in constexpr until version 6 (bug 67371).
+#ifndef FMT_USE_CONSTEXPR
+#  if (FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VERSION >= 1912 || \
+       (FMT_GCC_VERSION >= 600 && FMT_CPLUSPLUS >= 201402L)) &&             \
+      !FMT_ICC_VERSION && (!defined(__NVCC__) || FMT_CPLUSPLUS >= 202002L)
+#    define FMT_USE_CONSTEXPR 1
+#  else
+#    define FMT_USE_CONSTEXPR 0
+#  endif
+#endif
+#if FMT_USE_CONSTEXPR
+#  define FMT_CONSTEXPR constexpr
+#else
+#  define FMT_CONSTEXPR
+#endif
+
+#if (FMT_CPLUSPLUS >= 202002L ||                                \
+     (FMT_CPLUSPLUS >= 201709L && FMT_GCC_VERSION >= 1002)) &&  \
+    ((!FMT_GLIBCXX_RELEASE || FMT_GLIBCXX_RELEASE >= 10) &&     \
+     (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION >= 10000) && \
+     (!FMT_MSC_VERSION || FMT_MSC_VERSION >= 1928)) &&          \
+    defined(__cpp_lib_is_constant_evaluated)
+#  define FMT_CONSTEXPR20 constexpr
+#else
+#  define FMT_CONSTEXPR20
+#endif
+
+// Check if exceptions are disabled.
+#ifndef FMT_EXCEPTIONS
+#  if (defined(__GNUC__) && !defined(__EXCEPTIONS)) || \
+      (FMT_MSC_VERSION && !_HAS_EXCEPTIONS)
+#    define FMT_EXCEPTIONS 0
+#  else
+#    define FMT_EXCEPTIONS 1
+#  endif
+#endif
+
+// Disable [[noreturn]] on MSVC/NVCC because of bogus unreachable code warnings.
+#if FMT_EXCEPTIONS && FMT_HAS_CPP_ATTRIBUTE(noreturn) && !FMT_MSC_VERSION && \
+    !defined(__NVCC__)
+#  define FMT_NORETURN [[noreturn]]
+#else
+#  define FMT_NORETURN
+#endif
+
+#ifndef FMT_NODISCARD
+#  if FMT_HAS_CPP17_ATTRIBUTE(nodiscard)
+#    define FMT_NODISCARD [[nodiscard]]
+#  else
+#    define FMT_NODISCARD
+#  endif
+#endif
+
+#ifndef FMT_INLINE
+#  if FMT_GCC_VERSION || FMT_CLANG_VERSION
+#    define FMT_INLINE inline __attribute__((always_inline))
+#  else
+#    define FMT_INLINE inline
+#  endif
+#endif
+
+#ifdef _MSC_VER
+#  define FMT_UNCHECKED_ITERATOR(It) \
+    using _Unchecked_type = It  // Mark iterator as checked.
+#else
+#  define FMT_UNCHECKED_ITERATOR(It) using unchecked_type = It
+#endif
+
+// LAMMPS customization
+// use 'v10_lmp' namespace instead of 'v10' so that our
+// bundled copy does not collide with linking other code
+// using system wide installations which may be using
+// a different version.
+
+#ifndef FMT_BEGIN_NAMESPACE
+#  define FMT_BEGIN_NAMESPACE \
+    namespace fmt {           \
+    inline namespace v10_lmp {
+#  define FMT_END_NAMESPACE \
+    }                       \
+    }
+#endif
+
+#ifndef FMT_EXPORT
+#  define FMT_EXPORT
+#  define FMT_BEGIN_EXPORT
+#  define FMT_END_EXPORT
+#endif
+
+#if FMT_GCC_VERSION || FMT_CLANG_VERSION
+#  define FMT_VISIBILITY(value) __attribute__((visibility(value)))
+#else
+#  define FMT_VISIBILITY(value)
+#endif
+
+#if !defined(FMT_HEADER_ONLY) && defined(_WIN32)
+#  if defined(FMT_LIB_EXPORT)
+#    define FMT_API __declspec(dllexport)
+#  elif defined(FMT_SHARED)
+#    define FMT_API __declspec(dllimport)
+#  endif
+#elif defined(FMT_LIB_EXPORT) || defined(FMT_SHARED)
+#  define FMT_API FMT_VISIBILITY("default")
+#endif
+#ifndef FMT_API
+#  define FMT_API
+#endif
+
+#ifndef FMT_UNICODE
+#  define FMT_UNICODE !FMT_MSC_VERSION
+#endif
+
+#ifndef FMT_CONSTEVAL
+#  if ((FMT_GCC_VERSION >= 1000 || FMT_CLANG_VERSION >= 1101) && \
+       (!defined(__apple_build_version__) ||                     \
+        __apple_build_version__ >= 14000029L) &&                 \
+       FMT_CPLUSPLUS >= 202002L) ||                              \
+      (defined(__cpp_consteval) &&                               \
+       (!FMT_MSC_VERSION || FMT_MSC_VERSION >= 1929))
+// consteval is broken in MSVC before VS2019 version 16.10 and Apple clang
+// before 14.
+#    define FMT_CONSTEVAL consteval
+#    define FMT_HAS_CONSTEVAL
+#  else
+#    define FMT_CONSTEVAL
+#  endif
+#endif
+
+#ifndef FMT_USE_NONTYPE_TEMPLATE_ARGS
+#  if defined(__cpp_nontype_template_args) &&                  \
+      ((FMT_GCC_VERSION >= 903 && FMT_CPLUSPLUS >= 201709L) || \
+       __cpp_nontype_template_args >= 201911L) &&              \
+      !defined(__NVCOMPILER) && !defined(__LCC__)
+#    define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
+#  else
+#    define FMT_USE_NONTYPE_TEMPLATE_ARGS 0
+#  endif
+#endif
+
+// GCC < 5 requires this-> in decltype.
+#ifndef FMT_DECLTYPE_THIS
+#  if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
+#    define FMT_DECLTYPE_THIS this->
+#  else
+#    define FMT_DECLTYPE_THIS
+#  endif
+#endif
+
+// Enable minimal optimizations for more compact code in debug mode.
+FMT_GCC_PRAGMA("GCC push_options")
+#if !defined(__OPTIMIZE__) && !defined(__NVCOMPILER) && !defined(__LCC__) && \
+    !defined(__CUDACC__)
+FMT_GCC_PRAGMA("GCC optimize(\"Og\")")
+#endif
+
+FMT_BEGIN_NAMESPACE
+
+// Implementations of enable_if_t and other metafunctions for older systems.
+template <bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+template <bool B, typename T, typename F>
+using conditional_t = typename std::conditional<B, T, F>::type;
+template <bool B> using bool_constant = std::integral_constant<bool, B>;
+template <typename T>
+using remove_reference_t = typename std::remove_reference<T>::type;
+template <typename T>
+using remove_const_t = typename std::remove_const<T>::type;
+template <typename T>
+using remove_cvref_t = typename std::remove_cv<remove_reference_t<T>>::type;
+template <typename T> struct type_identity {
+  using type = T;
+};
+template <typename T> using type_identity_t = typename type_identity<T>::type;
+template <typename T>
+using underlying_t = typename std::underlying_type<T>::type;
+
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
+// A workaround for gcc 4.8 to make void_t work in a SFINAE context.
+template <typename...> struct void_t_impl {
+  using type = void;
+};
+template <typename... T> using void_t = typename void_t_impl<T...>::type;
+#else
+template <typename...> using void_t = void;
+#endif
+
+struct monostate {
+  constexpr monostate() {}
+};
+
+// An implementation of back_insert_iterator to avoid dependency on <iterator>.
+template <typename Container> class back_insert_iterator {
+ private:
+  Container* container_;
+
+  friend auto get_container(back_insert_iterator it) -> Container& {
+    return *it.container_;
+  }
+
+ public:
+  using difference_type = ptrdiff_t;
+  FMT_UNCHECKED_ITERATOR(back_insert_iterator);
+
+  explicit back_insert_iterator(Container& c) : container_(&c) {}
+
+  auto operator=(const typename Container::value_type& value)
+      -> back_insert_iterator& {
+    container_->push_back(value);
+    return *this;
+  }
+  auto operator*() -> back_insert_iterator& { return *this; }
+  auto operator++() -> back_insert_iterator& { return *this; }
+  auto operator++(int) -> back_insert_iterator { return *this; }
+};
+
+template <typename Container>
+auto back_inserter(Container& c) -> back_insert_iterator<Container> {
+  return {c};
+}
+
+// An enable_if helper to be used in template parameters which results in much
+// shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed
+// to workaround a bug in MSVC 2019 (see #1140 and #1186).
+#ifdef FMT_DOC
+#  define FMT_ENABLE_IF(...)
+#else
+#  define FMT_ENABLE_IF(...) fmt::enable_if_t<(__VA_ARGS__), int> = 0
+#endif
+
+// This is defined in core.h instead of format.h to avoid injecting in std.
+// It is a template to avoid undesirable implicit conversions to std::byte.
+#ifdef __cpp_lib_byte
+template <typename T, FMT_ENABLE_IF(std::is_same<T, std::byte>::value)>
+inline auto format_as(T b) -> unsigned char {
+  return static_cast<unsigned char>(b);
+}
+#endif
+
+namespace detail {
+// Suppresses "unused variable" warnings with the method described in
+// https://herbsutter.com/2009/10/18/mailbag-shutting-up-compiler-warnings/.
+// (void)var does not work on many Intel compilers.
+template <typename... T> FMT_CONSTEXPR void ignore_unused(const T&...) {}
+
+constexpr FMT_INLINE auto is_constant_evaluated(
+    bool default_value = false) noexcept -> bool {
+// Workaround for incompatibility between libstdc++ consteval-based
+// std::is_constant_evaluated() implementation and clang-14:
+// https://github.com/fmtlib/fmt/issues/3247.
+#if FMT_CPLUSPLUS >= 202002L && FMT_GLIBCXX_RELEASE >= 12 && \
+    (FMT_CLANG_VERSION >= 1400 && FMT_CLANG_VERSION < 1500)
+  ignore_unused(default_value);
+  return __builtin_is_constant_evaluated();
+#elif defined(__cpp_lib_is_constant_evaluated)
+  ignore_unused(default_value);
+  return std::is_constant_evaluated();
+#else
+  return default_value;
+#endif
+}
+
+// Suppresses "conditional expression is constant" warnings.
+template <typename T> constexpr FMT_INLINE auto const_check(T value) -> T {
+  return value;
+}
+
+FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
+                                      const char* message);
+
+#ifndef FMT_ASSERT
+#  ifdef NDEBUG
+// FMT_ASSERT is not empty to avoid -Wempty-body.
+#    define FMT_ASSERT(condition, message) \
+      fmt::detail::ignore_unused((condition), (message))
+#  else
+#    define FMT_ASSERT(condition, message)                                    \
+      ((condition) /* void() fails with -Winvalid-constexpr on clang 4.0.1 */ \
+           ? (void)0                                                          \
+           : fmt::detail::assert_fail(__FILE__, __LINE__, (message)))
+#  endif
+#endif
+
+#ifdef FMT_USE_INT128
+// Do nothing.
+#elif defined(__SIZEOF_INT128__) && !defined(__NVCC__) && \
+    !(FMT_CLANG_VERSION && FMT_MSC_VERSION)
+#  define FMT_USE_INT128 1
+using int128_opt = __int128_t;  // An optional native 128-bit integer.
+using uint128_opt = __uint128_t;
+template <typename T> inline auto convert_for_visit(T value) -> T {
+  return value;
+}
+#else
+#  define FMT_USE_INT128 0
+#endif
+#if !FMT_USE_INT128
+enum class int128_opt {};
+enum class uint128_opt {};
+// Reduce template instantiations.
+template <typename T> auto convert_for_visit(T) -> monostate { return {}; }
+#endif
+
+// Casts a nonnegative integer to unsigned.
+template <typename Int>
+FMT_CONSTEXPR auto to_unsigned(Int value) ->
+    typename std::make_unsigned<Int>::type {
+#if 0
+  // LAMMPS customization: disable assertion to avoid bogus warnings
+  FMT_ASSERT(std::is_unsigned<Int>::value || value >= 0, "negative value");
+#endif
+  return static_cast<typename std::make_unsigned<Int>::type>(value);
+}
+
+template <typename T, typename Enable = void>
+struct is_string_like : std::false_type {};
+
+// A heuristic to detect std::string and std::string_view.
+template <typename T>
+struct is_string_like<T, void_t<decltype(std::declval<T>().find_first_of(
+                             typename T::value_type(), 0))>> : std::true_type {
+};
+
+FMT_CONSTEXPR inline auto is_utf8() -> bool {
+  FMT_MSC_WARNING(suppress : 4566) constexpr unsigned char section[] = "\u00A7";
+
+  // Avoid buggy sign extensions in MSVC's constant evaluation mode (#2297).
+  using uchar = unsigned char;
+  return FMT_UNICODE || (sizeof(section) == 3 && uchar(section[0]) == 0xC2 &&
+                         uchar(section[1]) == 0xA7);
+}
+
+template <typename Char> FMT_CONSTEXPR auto length(const Char* s) -> size_t {
+  size_t len = 0;
+  while (*s++) ++len;
+  return len;
+}
+
+template <typename Char>
+FMT_CONSTEXPR auto compare(const Char* s1, const Char* s2, std::size_t n)
+    -> int {
+  for (; n != 0; ++s1, ++s2, --n) {
+    if (*s1 < *s2) return -1;
+    if (*s1 > *s2) return 1;
+  }
+  return 0;
+}
+}  // namespace detail
+
+template <typename Char>
+using basic_string =
+    std::basic_string<Char, std::char_traits<Char>, std::allocator<Char>>;
+
+// Checks whether T is a container with contiguous storage.
+template <typename T> struct is_contiguous : std::false_type {};
+template <typename Char>
+struct is_contiguous<basic_string<Char>> : std::true_type {};
+
+/**
+  An implementation of ``std::basic_string_view`` for pre-C++17. It provides a
+  subset of the API. ``fmt::basic_string_view`` is used for format strings even
+  if ``std::string_view`` is available to prevent issues when a library is
+  compiled with a different ``-std`` option than the client code (which is not
+  recommended).
+ */
+FMT_EXPORT
+template <typename Char> class basic_string_view {
+ private:
+  const Char* data_;
+  size_t size_;
+
+ public:
+  using value_type = Char;
+  using iterator = const Char*;
+
+  constexpr basic_string_view() noexcept : data_(nullptr), size_(0) {}
+
+  /** Constructs a string reference object from a C string and a size. */
+  constexpr basic_string_view(const Char* s, size_t count) noexcept
+      : data_(s), size_(count) {}
+
+  /**
+    Constructs a string reference object from a C string.
+   */
+  FMT_CONSTEXPR20
+  FMT_INLINE
+  basic_string_view(const Char* s)
+      : data_(s),
+        size_(detail::const_check(std::is_same<Char, char>::value &&
+                                  !detail::is_constant_evaluated(false))
+                  ? std::strlen(reinterpret_cast<const char*>(s))
+                  : detail::length(s)) {}
+
+  /**
+    Constructs a string reference from a ``std::basic_string`` or a
+    ``std::basic_string_view`` object.
+  */
+  template <typename S,
+            FMT_ENABLE_IF(detail::is_string_like<S>::value&& std::is_same<
+                          typename S::value_type, Char>::value)>
+  FMT_CONSTEXPR basic_string_view(const S& s) noexcept
+      : data_(s.data()), size_(s.size()) {}
+
+  /** Returns a pointer to the string data. */
+  constexpr auto data() const noexcept -> const Char* { return data_; }
+
+  /** Returns the string size. */
+  constexpr auto size() const noexcept -> size_t { return size_; }
+
+  constexpr auto begin() const noexcept -> iterator { return data_; }
+  constexpr auto end() const noexcept -> iterator { return data_ + size_; }
+
+  constexpr auto operator[](size_t pos) const noexcept -> const Char& {
+    return data_[pos];
+  }
+
+  FMT_CONSTEXPR void remove_prefix(size_t n) noexcept {
+    data_ += n;
+    size_ -= n;
+  }
+
+  FMT_CONSTEXPR auto starts_with(basic_string_view<Char> sv) const noexcept
+      -> bool {
+    return size_ >= sv.size_ && detail::compare(data_, sv.data_, sv.size_) == 0;
+  }
+  FMT_CONSTEXPR auto starts_with(Char c) const noexcept -> bool {
+    return size_ >= 1 && *data_ == c;
+  }
+  FMT_CONSTEXPR auto starts_with(const Char* s) const -> bool {
+    return starts_with(basic_string_view<Char>(s));
+  }
+
+  // Lexicographically compare this string reference to other.
+  FMT_CONSTEXPR auto compare(basic_string_view other) const -> int {
+    size_t str_size = size_ < other.size_ ? size_ : other.size_;
+    int result = detail::compare(data_, other.data_, str_size);
+    if (result == 0)
+      result = size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1);
+    return result;
+  }
+
+  FMT_CONSTEXPR friend auto operator==(basic_string_view lhs,
+                                       basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) == 0;
+  }
+  friend auto operator!=(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) != 0;
+  }
+  friend auto operator<(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) < 0;
+  }
+  friend auto operator<=(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) <= 0;
+  }
+  friend auto operator>(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) > 0;
+  }
+  friend auto operator>=(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) >= 0;
+  }
+};
+
+FMT_EXPORT
+using string_view = basic_string_view<char>;
+
+/** Specifies if ``T`` is a character type. Can be specialized by users. */
+FMT_EXPORT
+template <typename T> struct is_char : std::false_type {};
+template <> struct is_char<char> : std::true_type {};
+
+namespace detail {
+
+// A base class for compile-time strings.
+struct compile_string {};
+
+template <typename S>
+struct is_compile_string : std::is_base_of<compile_string, S> {};
+
+template <typename Char, FMT_ENABLE_IF(is_char<Char>::value)>
+FMT_INLINE auto to_string_view(const Char* s) -> basic_string_view<Char> {
+  return s;
+}
+template <typename S, FMT_ENABLE_IF(is_string_like<S>::value)>
+inline auto to_string_view(const S& s)
+    -> basic_string_view<typename S::value_type> {
+  return s;  // std::basic_string[_view]
+}
+template <typename Char>
+constexpr auto to_string_view(basic_string_view<Char> s)
+    -> basic_string_view<Char> {
+  return s;
+}
+template <typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
+constexpr auto to_string_view(const S& s)
+    -> basic_string_view<typename S::char_type> {
+  return basic_string_view<typename S::char_type>(s);
+}
+void to_string_view(...);
+
+// Specifies whether S is a string type convertible to fmt::basic_string_view.
+// It should be a constexpr function but MSVC 2017 fails to compile it in
+// enable_if and MSVC 2015 fails to compile it as an alias template.
+// ADL is intentionally disabled as to_string_view is not an extension point.
+template <typename S>
+struct is_string
+    : std::is_class<decltype(detail::to_string_view(std::declval<S>()))> {};
+
+template <typename S, typename = void> struct char_t_impl {};
+template <typename S> struct char_t_impl<S, enable_if_t<is_string<S>::value>> {
+  using result = decltype(to_string_view(std::declval<S>()));
+  using type = typename result::value_type;
+};
+
+enum class type {
+  none_type,
+  // Integer types should go first,
+  int_type,
+  uint_type,
+  long_long_type,
+  ulong_long_type,
+  int128_type,
+  uint128_type,
+  bool_type,
+  char_type,
+  last_integer_type = char_type,
+  // followed by floating-point types.
+  float_type,
+  double_type,
+  long_double_type,
+  last_numeric_type = long_double_type,
+  cstring_type,
+  string_type,
+  pointer_type,
+  custom_type
+};
+
+// Maps core type T to the corresponding type enum constant.
+template <typename T, typename Char>
+struct type_constant : std::integral_constant<type, type::custom_type> {};
+
+#define FMT_TYPE_CONSTANT(Type, constant) \
+  template <typename Char>                \
+  struct type_constant<Type, Char>        \
+      : std::integral_constant<type, type::constant> {}
+
+FMT_TYPE_CONSTANT(int, int_type);
+FMT_TYPE_CONSTANT(unsigned, uint_type);
+FMT_TYPE_CONSTANT(long long, long_long_type);
+FMT_TYPE_CONSTANT(unsigned long long, ulong_long_type);
+FMT_TYPE_CONSTANT(int128_opt, int128_type);
+FMT_TYPE_CONSTANT(uint128_opt, uint128_type);
+FMT_TYPE_CONSTANT(bool, bool_type);
+FMT_TYPE_CONSTANT(Char, char_type);
+FMT_TYPE_CONSTANT(float, float_type);
+FMT_TYPE_CONSTANT(double, double_type);
+FMT_TYPE_CONSTANT(long double, long_double_type);
+FMT_TYPE_CONSTANT(const Char*, cstring_type);
+FMT_TYPE_CONSTANT(basic_string_view<Char>, string_type);
+FMT_TYPE_CONSTANT(const void*, pointer_type);
+
+constexpr auto is_integral_type(type t) -> bool {
+  return t > type::none_type && t <= type::last_integer_type;
+}
+constexpr auto is_arithmetic_type(type t) -> bool {
+  return t > type::none_type && t <= type::last_numeric_type;
+}
+
+constexpr auto set(type rhs) -> int { return 1 << static_cast<int>(rhs); }
+constexpr auto in(type t, int set) -> bool {
+  return ((set >> static_cast<int>(t)) & 1) != 0;
+}
+
+// Bitsets of types.
+enum {
+  sint_set =
+      set(type::int_type) | set(type::long_long_type) | set(type::int128_type),
+  uint_set = set(type::uint_type) | set(type::ulong_long_type) |
+             set(type::uint128_type),
+  bool_set = set(type::bool_type),
+  char_set = set(type::char_type),
+  float_set = set(type::float_type) | set(type::double_type) |
+              set(type::long_double_type),
+  string_set = set(type::string_type),
+  cstring_set = set(type::cstring_type),
+  pointer_set = set(type::pointer_type)
+};
+}  // namespace detail
+
+/** Throws ``format_error`` with a given message. */
+FMT_NORETURN FMT_API void throw_format_error(const char* message);
+
+/** String's character type. */
+template <typename S> using char_t = typename detail::char_t_impl<S>::type;
+
+/**
+  \rst
+  Parsing context consisting of a format string range being parsed and an
+  argument counter for automatic indexing.
+  You can use the ``format_parse_context`` type alias for ``char`` instead.
+  \endrst
+ */
+FMT_EXPORT
+template <typename Char> class basic_format_parse_context {
+ private:
+  basic_string_view<Char> format_str_;
+  int next_arg_id_;
+
+  FMT_CONSTEXPR void do_check_arg_id(int id);
+
+ public:
+  using char_type = Char;
+  using iterator = const Char*;
+
+  explicit constexpr basic_format_parse_context(
+      basic_string_view<Char> format_str, int next_arg_id = 0)
+      : format_str_(format_str), next_arg_id_(next_arg_id) {}
+
+  /**
+    Returns an iterator to the beginning of the format string range being
+    parsed.
+   */
+  constexpr auto begin() const noexcept -> iterator {
+    return format_str_.begin();
+  }
+
+  /**
+    Returns an iterator past the end of the format string range being parsed.
+   */
+  constexpr auto end() const noexcept -> iterator { return format_str_.end(); }
+
+  /** Advances the begin iterator to ``it``. */
+  FMT_CONSTEXPR void advance_to(iterator it) {
+    format_str_.remove_prefix(detail::to_unsigned(it - begin()));
+  }
+
+  /**
+    Reports an error if using the manual argument indexing; otherwise returns
+    the next argument index and switches to the automatic indexing.
+   */
+  FMT_CONSTEXPR auto next_arg_id() -> int {
+    if (next_arg_id_ < 0) {
+      throw_format_error(
+          "cannot switch from manual to automatic argument indexing");
+      return 0;
+    }
+    int id = next_arg_id_++;
+    do_check_arg_id(id);
+    return id;
+  }
+
+  /**
+    Reports an error if using the automatic argument indexing; otherwise
+    switches to the manual indexing.
+   */
+  FMT_CONSTEXPR void check_arg_id(int id) {
+    if (next_arg_id_ > 0) {
+      throw_format_error(
+          "cannot switch from automatic to manual argument indexing");
+      return;
+    }
+    next_arg_id_ = -1;
+    do_check_arg_id(id);
+  }
+  FMT_CONSTEXPR void check_arg_id(basic_string_view<Char>) {}
+  FMT_CONSTEXPR void check_dynamic_spec(int arg_id);
+};
+
+FMT_EXPORT
+using format_parse_context = basic_format_parse_context<char>;
+
+namespace detail {
+// A parse context with extra data used only in compile-time checks.
+template <typename Char>
+class compile_parse_context : public basic_format_parse_context<Char> {
+ private:
+  int num_args_;
+  const type* types_;
+  using base = basic_format_parse_context<Char>;
+
+ public:
+  explicit FMT_CONSTEXPR compile_parse_context(
+      basic_string_view<Char> format_str, int num_args, const type* types,
+      int next_arg_id = 0)
+      : base(format_str, next_arg_id), num_args_(num_args), types_(types) {}
+
+  constexpr auto num_args() const -> int { return num_args_; }
+  constexpr auto arg_type(int id) const -> type { return types_[id]; }
+
+  FMT_CONSTEXPR auto next_arg_id() -> int {
+    int id = base::next_arg_id();
+    if (id >= num_args_) throw_format_error("argument not found");
+    return id;
+  }
+
+  FMT_CONSTEXPR void check_arg_id(int id) {
+    base::check_arg_id(id);
+    if (id >= num_args_) throw_format_error("argument not found");
+  }
+  using base::check_arg_id;
+
+  FMT_CONSTEXPR void check_dynamic_spec(int arg_id) {
+    detail::ignore_unused(arg_id);
+#if !defined(__LCC__)
+    if (arg_id < num_args_ && types_ && !is_integral_type(types_[arg_id]))
+      throw_format_error("width/precision is not integer");
+#endif
+  }
+};
+
+/**
+  \rst
+  A contiguous memory buffer with an optional growing ability. It is an internal
+  class and shouldn't be used directly, only via `~fmt::basic_memory_buffer`.
+  \endrst
+ */
+template <typename T> class buffer {
+ private:
+  T* ptr_;
+  size_t size_;
+  size_t capacity_;
+
+  using grow_fun = void (*)(buffer& buf, size_t capacity);
+  grow_fun grow_;
+
+ protected:
+  // Don't initialize ptr_ since it is not accessed to save a few cycles.
+  FMT_MSC_WARNING(suppress : 26495)
+  FMT_CONSTEXPR buffer(grow_fun grow, size_t sz) noexcept
+      : size_(sz), capacity_(sz), grow_(grow) {}
+
+  FMT_CONSTEXPR20 buffer(grow_fun grow, T* p = nullptr, size_t sz = 0,
+                         size_t cap = 0) noexcept
+      : ptr_(p), size_(sz), capacity_(cap), grow_(grow) {}
+
+  FMT_CONSTEXPR20 ~buffer() = default;
+  buffer(buffer&&) = default;
+
+  /** Sets the buffer data and capacity. */
+  FMT_CONSTEXPR void set(T* buf_data, size_t buf_capacity) noexcept {
+    ptr_ = buf_data;
+    capacity_ = buf_capacity;
+  }
+
+ public:
+  using value_type = T;
+  using const_reference = const T&;
+
+  buffer(const buffer&) = delete;
+  void operator=(const buffer&) = delete;
+
+  FMT_INLINE auto begin() noexcept -> T* { return ptr_; }
+  FMT_INLINE auto end() noexcept -> T* { return ptr_ + size_; }
+
+  FMT_INLINE auto begin() const noexcept -> const T* { return ptr_; }
+  FMT_INLINE auto end() const noexcept -> const T* { return ptr_ + size_; }
+
+  /** Returns the size of this buffer. */
+  constexpr auto size() const noexcept -> size_t { return size_; }
+
+  /** Returns the capacity of this buffer. */
+  constexpr auto capacity() const noexcept -> size_t { return capacity_; }
+
+  /** Returns a pointer to the buffer data (not null-terminated). */
+  FMT_CONSTEXPR auto data() noexcept -> T* { return ptr_; }
+  FMT_CONSTEXPR auto data() const noexcept -> const T* { return ptr_; }
+
+  /** Clears this buffer. */
+  void clear() { size_ = 0; }
+
+  // Tries resizing the buffer to contain *count* elements. If T is a POD type
+  // the new elements may not be initialized.
+  FMT_CONSTEXPR20 void try_resize(size_t count) {
+    try_reserve(count);
+    size_ = count <= capacity_ ? count : capacity_;
+  }
+
+  // Tries increasing the buffer capacity to *new_capacity*. It can increase the
+  // capacity by a smaller amount than requested but guarantees there is space
+  // for at least one additional element either by increasing the capacity or by
+  // flushing the buffer if it is full.
+  FMT_CONSTEXPR20 void try_reserve(size_t new_capacity) {
+    if (new_capacity > capacity_) grow_(*this, new_capacity);
+  }
+
+  FMT_CONSTEXPR20 void push_back(const T& value) {
+    try_reserve(size_ + 1);
+    ptr_[size_++] = value;
+  }
+
+  /** Appends data to the end of the buffer. */
+  template <typename U> void append(const U* begin, const U* end);
+
+  template <typename Idx> FMT_CONSTEXPR auto operator[](Idx index) -> T& {
+    return ptr_[index];
+  }
+  template <typename Idx>
+  FMT_CONSTEXPR auto operator[](Idx index) const -> const T& {
+    return ptr_[index];
+  }
+};
+
+struct buffer_traits {
+  explicit buffer_traits(size_t) {}
+  auto count() const -> size_t { return 0; }
+  auto limit(size_t size) -> size_t { return size; }
+};
+
+class fixed_buffer_traits {
+ private:
+  size_t count_ = 0;
+  size_t limit_;
+
+ public:
+  explicit fixed_buffer_traits(size_t limit) : limit_(limit) {}
+  auto count() const -> size_t { return count_; }
+  auto limit(size_t size) -> size_t {
+    size_t n = limit_ > count_ ? limit_ - count_ : 0;
+    count_ += size;
+    return size < n ? size : n;
+  }
+};
+
+// A buffer that writes to an output iterator when flushed.
+template <typename OutputIt, typename T, typename Traits = buffer_traits>
+class iterator_buffer final : public Traits, public buffer<T> {
+ private:
+  OutputIt out_;
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+
+  static FMT_CONSTEXPR20 void grow(buffer<T>& buf, size_t) {
+    if (buf.size() == buffer_size) static_cast<iterator_buffer&>(buf).flush();
+  }
+
+  void flush() {
+    auto size = this->size();
+    this->clear();
+    const T* begin = data_;
+    const T* end = begin + this->limit(size);
+    while (begin != end) *out_++ = *begin++;
+  }
+
+ public:
+  explicit iterator_buffer(OutputIt out, size_t n = buffer_size)
+      : Traits(n), buffer<T>(grow, data_, 0, buffer_size), out_(out) {}
+  iterator_buffer(iterator_buffer&& other)
+      : Traits(other),
+        buffer<T>(grow, data_, 0, buffer_size),
+        out_(other.out_) {}
+  ~iterator_buffer() { flush(); }
+
+  auto out() -> OutputIt {
+    flush();
+    return out_;
+  }
+  auto count() const -> size_t { return Traits::count() + this->size(); }
+};
+
+template <typename T>
+class iterator_buffer<T*, T, fixed_buffer_traits> final
+    : public fixed_buffer_traits,
+      public buffer<T> {
+ private:
+  T* out_;
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+
+  static FMT_CONSTEXPR20 void grow(buffer<T>& buf, size_t) {
+    if (buf.size() == buf.capacity())
+      static_cast<iterator_buffer&>(buf).flush();
+  }
+
+  void flush() {
+    size_t n = this->limit(this->size());
+    if (this->data() == out_) {
+      out_ += n;
+      this->set(data_, buffer_size);
+    }
+    this->clear();
+  }
+
+ public:
+  explicit iterator_buffer(T* out, size_t n = buffer_size)
+      : fixed_buffer_traits(n), buffer<T>(grow, out, 0, n), out_(out) {}
+  iterator_buffer(iterator_buffer&& other)
+      : fixed_buffer_traits(other),
+        buffer<T>(static_cast<iterator_buffer&&>(other)),
+        out_(other.out_) {
+    if (this->data() != out_) {
+      this->set(data_, buffer_size);
+      this->clear();
+    }
+  }
+  ~iterator_buffer() { flush(); }
+
+  auto out() -> T* {
+    flush();
+    return out_;
+  }
+  auto count() const -> size_t {
+    return fixed_buffer_traits::count() + this->size();
+  }
+};
+
+template <typename T> class iterator_buffer<T*, T> final : public buffer<T> {
+ public:
+  explicit iterator_buffer(T* out, size_t = 0)
+      : buffer<T>([](buffer<T>&, size_t) {}, out, 0, ~size_t()) {}
+
+  auto out() -> T* { return &*this->end(); }
+};
+
+// A buffer that writes to a container with the contiguous storage.
+template <typename Container>
+class iterator_buffer<back_insert_iterator<Container>,
+                      enable_if_t<is_contiguous<Container>::value,
+                                  typename Container::value_type>>
+    final : public buffer<typename Container::value_type> {
+ private:
+  using value_type = typename Container::value_type;
+  Container& container_;
+
+  static FMT_CONSTEXPR20 void grow(buffer<value_type>& buf, size_t capacity) {
+    auto& self = static_cast<iterator_buffer&>(buf);
+    self.container_.resize(capacity);
+    self.set(&self.container_[0], capacity);
+  }
+
+ public:
+  explicit iterator_buffer(Container& c)
+      : buffer<value_type>(grow, c.size()), container_(c) {}
+  explicit iterator_buffer(back_insert_iterator<Container> out, size_t = 0)
+      : iterator_buffer(get_container(out)) {}
+
+  auto out() -> back_insert_iterator<Container> {
+    return fmt::back_inserter(container_);
+  }
+};
+
+// A buffer that counts the number of code units written discarding the output.
+template <typename T = char> class counting_buffer final : public buffer<T> {
+ private:
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+  size_t count_ = 0;
+
+  static FMT_CONSTEXPR20 void grow(buffer<T>& buf, size_t) {
+    if (buf.size() != buffer_size) return;
+    static_cast<counting_buffer&>(buf).count_ += buf.size();
+    buf.clear();
+  }
+
+ public:
+  counting_buffer() : buffer<T>(grow, data_, 0, buffer_size) {}
+
+  auto count() -> size_t { return count_ + this->size(); }
+};
+}  // namespace detail
+
+template <typename Char>
+FMT_CONSTEXPR void basic_format_parse_context<Char>::do_check_arg_id(int id) {
+  // Argument id is only checked at compile-time during parsing because
+  // formatting has its own validation.
+  if (detail::is_constant_evaluated() &&
+      (!FMT_GCC_VERSION || FMT_GCC_VERSION >= 1200)) {
+    using context = detail::compile_parse_context<Char>;
+    if (id >= static_cast<context*>(this)->num_args())
+      throw_format_error("argument not found");
+  }
+}
+
+template <typename Char>
+FMT_CONSTEXPR void basic_format_parse_context<Char>::check_dynamic_spec(
+    int arg_id) {
+  if (detail::is_constant_evaluated() &&
+      (!FMT_GCC_VERSION || FMT_GCC_VERSION >= 1200)) {
+    using context = detail::compile_parse_context<Char>;
+    static_cast<context*>(this)->check_dynamic_spec(arg_id);
+  }
+}
+
+FMT_EXPORT template <typename Context> class basic_format_arg;
+FMT_EXPORT template <typename Context> class basic_format_args;
+FMT_EXPORT template <typename Context> class dynamic_format_arg_store;
+
+// A formatter for objects of type T.
+FMT_EXPORT
+template <typename T, typename Char = char, typename Enable = void>
+struct formatter {
+  // A deleted default constructor indicates a disabled formatter.
+  formatter() = delete;
+};
+
+// Specifies if T has an enabled formatter specialization. A type can be
+// formattable even if it doesn't have a formatter e.g. via a conversion.
+template <typename T, typename Context>
+using has_formatter =
+    std::is_constructible<typename Context::template formatter_type<T>>;
+
+// An output iterator that appends to a buffer. It is used instead of
+// back_insert_iterator to reduce symbol sizes for the common case.
+class appender {
+ private:
+  detail::buffer<char>* buffer_;
+
+  friend auto get_container(appender app) -> detail::buffer<char>& {
+    return *app.buffer_;
+  }
+
+ public:
+  using difference_type = ptrdiff_t;
+  FMT_UNCHECKED_ITERATOR(appender);
+
+  appender(detail::buffer<char>& buf) : buffer_(&buf) {}
+
+  auto operator=(char c) -> appender& {
+    buffer_->push_back(c);
+    return *this;
+  }
+  auto operator*() -> appender& { return *this; }
+  auto operator++() -> appender& { return *this; }
+  auto operator++(int) -> appender { return *this; }
+};
+
+namespace detail {
+
+template <typename Context, typename T>
+constexpr auto has_const_formatter_impl(T*)
+    -> decltype(typename Context::template formatter_type<T>().format(
+                    std::declval<const T&>(), std::declval<Context&>()),
+                true) {
+  return true;
+}
+template <typename Context>
+constexpr auto has_const_formatter_impl(...) -> bool {
+  return false;
+}
+template <typename T, typename Context>
+constexpr auto has_const_formatter() -> bool {
+  return has_const_formatter_impl<Context>(static_cast<T*>(nullptr));
+}
+
+template <typename T>
+using buffer_appender = conditional_t<std::is_same<T, char>::value, appender,
+                                      back_insert_iterator<buffer<T>>>;
+
+// Maps an output iterator to a buffer.
+template <typename T, typename OutputIt>
+auto get_buffer(OutputIt out) -> iterator_buffer<OutputIt, T> {
+  return iterator_buffer<OutputIt, T>(out);
+}
+template <typename T, typename Buf,
+          FMT_ENABLE_IF(std::is_base_of<buffer<char>, Buf>::value)>
+auto get_buffer(back_insert_iterator<Buf> out) -> buffer<char>& {
+  return get_container(out);
+}
+
+template <typename Buf, typename OutputIt>
+FMT_INLINE auto get_iterator(Buf& buf, OutputIt) -> decltype(buf.out()) {
+  return buf.out();
+}
+template <typename T, typename OutputIt>
+auto get_iterator(buffer<T>&, OutputIt out) -> OutputIt {
+  return out;
+}
+
+struct view {};
+
+template <typename Char, typename T> struct named_arg : view {
+  const Char* name;
+  const T& value;
+  named_arg(const Char* n, const T& v) : name(n), value(v) {}
+};
+
+template <typename Char> struct named_arg_info {
+  const Char* name;
+  int id;
+};
+
+template <typename T, typename Char, size_t NUM_ARGS, size_t NUM_NAMED_ARGS>
+struct arg_data {
+  // args_[0].named_args points to named_args_ to avoid bloating format_args.
+  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
+  T args_[1 + (NUM_ARGS != 0 ? NUM_ARGS : +1)];
+  named_arg_info<Char> named_args_[NUM_NAMED_ARGS];
+
+  template <typename... U>
+  arg_data(const U&... init) : args_{T(named_args_, NUM_NAMED_ARGS), init...} {}
+  arg_data(const arg_data& other) = delete;
+  auto args() const -> const T* { return args_ + 1; }
+  auto named_args() -> named_arg_info<Char>* { return named_args_; }
+};
+
+template <typename T, typename Char, size_t NUM_ARGS>
+struct arg_data<T, Char, NUM_ARGS, 0> {
+  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
+  T args_[NUM_ARGS != 0 ? NUM_ARGS : +1];
+
+  template <typename... U>
+  FMT_CONSTEXPR FMT_INLINE arg_data(const U&... init) : args_{init...} {}
+  FMT_CONSTEXPR FMT_INLINE auto args() const -> const T* { return args_; }
+  FMT_CONSTEXPR FMT_INLINE auto named_args() -> std::nullptr_t {
+    return nullptr;
+  }
+};
+
+template <typename Char>
+inline void init_named_args(named_arg_info<Char>*, int, int) {}
+
+template <typename T> struct is_named_arg : std::false_type {};
+template <typename T> struct is_statically_named_arg : std::false_type {};
+
+template <typename T, typename Char>
+struct is_named_arg<named_arg<Char, T>> : std::true_type {};
+
+template <typename Char, typename T, typename... Tail,
+          FMT_ENABLE_IF(!is_named_arg<T>::value)>
+void init_named_args(named_arg_info<Char>* named_args, int arg_count,
+                     int named_arg_count, const T&, const Tail&... args) {
+  init_named_args(named_args, arg_count + 1, named_arg_count, args...);
+}
+
+template <typename Char, typename T, typename... Tail,
+          FMT_ENABLE_IF(is_named_arg<T>::value)>
+void init_named_args(named_arg_info<Char>* named_args, int arg_count,
+                     int named_arg_count, const T& arg, const Tail&... args) {
+  named_args[named_arg_count++] = {arg.name, arg_count};
+  init_named_args(named_args, arg_count + 1, named_arg_count, args...);
+}
+
+template <typename... Args>
+FMT_CONSTEXPR FMT_INLINE void init_named_args(std::nullptr_t, int, int,
+                                              const Args&...) {}
+
+template <bool B = false> constexpr auto count() -> size_t { return B ? 1 : 0; }
+template <bool B1, bool B2, bool... Tail> constexpr auto count() -> size_t {
+  return (B1 ? 1 : 0) + count<B2, Tail...>();
+}
+
+template <typename... Args> constexpr auto count_named_args() -> size_t {
+  return count<is_named_arg<Args>::value...>();
+}
+
+template <typename... Args>
+constexpr auto count_statically_named_args() -> size_t {
+  return count<is_statically_named_arg<Args>::value...>();
+}
+
+struct unformattable {};
+struct unformattable_char : unformattable {};
+struct unformattable_pointer : unformattable {};
+
+template <typename Char> struct string_value {
+  const Char* data;
+  size_t size;
+};
+
+template <typename Char> struct named_arg_value {
+  const named_arg_info<Char>* data;
+  size_t size;
+};
+
+template <typename Context> struct custom_value {
+  using parse_context = typename Context::parse_context_type;
+  void* value;
+  void (*format)(void* arg, parse_context& parse_ctx, Context& ctx);
+};
+
+// A formatting argument value.
+template <typename Context> class value {
+ public:
+  using char_type = typename Context::char_type;
+
+  union {
+    monostate no_value;
+    int int_value;
+    unsigned uint_value;
+    long long long_long_value;
+    unsigned long long ulong_long_value;
+    int128_opt int128_value;
+    uint128_opt uint128_value;
+    bool bool_value;
+    char_type char_value;
+    float float_value;
+    double double_value;
+    long double long_double_value;
+    const void* pointer;
+    string_value<char_type> string;
+    custom_value<Context> custom;
+    named_arg_value<char_type> named_args;
+  };
+
+  constexpr FMT_INLINE value() : no_value() {}
+  constexpr FMT_INLINE value(int val) : int_value(val) {}
+  constexpr FMT_INLINE value(unsigned val) : uint_value(val) {}
+  constexpr FMT_INLINE value(long long val) : long_long_value(val) {}
+  constexpr FMT_INLINE value(unsigned long long val) : ulong_long_value(val) {}
+  FMT_INLINE value(int128_opt val) : int128_value(val) {}
+  FMT_INLINE value(uint128_opt val) : uint128_value(val) {}
+  constexpr FMT_INLINE value(float val) : float_value(val) {}
+  constexpr FMT_INLINE value(double val) : double_value(val) {}
+  FMT_INLINE value(long double val) : long_double_value(val) {}
+  constexpr FMT_INLINE value(bool val) : bool_value(val) {}
+  constexpr FMT_INLINE value(char_type val) : char_value(val) {}
+  FMT_CONSTEXPR FMT_INLINE value(const char_type* val) {
+    string.data = val;
+    if (is_constant_evaluated()) string.size = {};
+  }
+  FMT_CONSTEXPR FMT_INLINE value(basic_string_view<char_type> val) {
+    string.data = val.data();
+    string.size = val.size();
+  }
+  FMT_INLINE value(const void* val) : pointer(val) {}
+  FMT_INLINE value(const named_arg_info<char_type>* args, size_t size)
+      : named_args{args, size} {}
+
+  template <typename T> FMT_CONSTEXPR20 FMT_INLINE value(T& val) {
+    using value_type = remove_const_t<T>;
+    // T may overload operator& e.g. std::vector<bool>::reference in libc++.
+#ifdef __cpp_if_constexpr
+    if constexpr (std::is_same<decltype(&val), T*>::value)
+      custom.value = const_cast<value_type*>(&val);
+#endif
+    if (!is_constant_evaluated())
+      custom.value = const_cast<char*>(&reinterpret_cast<const char&>(val));
+    // Get the formatter type through the context to allow different contexts
+    // have different extension points, e.g. `formatter<T>` for `format` and
+    // `printf_formatter<T>` for `printf`.
+    custom.format = format_custom_arg<
+        value_type, typename Context::template formatter_type<value_type>>;
+  }
+  value(unformattable);
+  value(unformattable_char);
+  value(unformattable_pointer);
+
+ private:
+  // Formats an argument of a custom type, such as a user-defined class.
+  template <typename T, typename Formatter>
+  static void format_custom_arg(void* arg,
+                                typename Context::parse_context_type& parse_ctx,
+                                Context& ctx) {
+    auto f = Formatter();
+    parse_ctx.advance_to(f.parse(parse_ctx));
+    using qualified_type =
+        conditional_t<has_const_formatter<T, Context>(), const T, T>;
+    // Calling format through a mutable reference is deprecated.
+    ctx.advance_to(f.format(*static_cast<qualified_type*>(arg), ctx));
+  }
+};
+
+// To minimize the number of types we need to deal with, long is translated
+// either to int or to long long depending on its size.
+enum { long_short = sizeof(long) == sizeof(int) };
+using long_type = conditional_t<long_short, int, long long>;
+using ulong_type = conditional_t<long_short, unsigned, unsigned long long>;
+
+template <typename T> struct format_as_result {
+  template <typename U,
+            FMT_ENABLE_IF(std::is_enum<U>::value || std::is_class<U>::value)>
+  static auto map(U*) -> remove_cvref_t<decltype(format_as(std::declval<U>()))>;
+  static auto map(...) -> void;
+
+  using type = decltype(map(static_cast<T*>(nullptr)));
+};
+template <typename T> using format_as_t = typename format_as_result<T>::type;
+
+template <typename T>
+struct has_format_as
+    : bool_constant<!std::is_same<format_as_t<T>, void>::value> {};
+
+// Maps formatting arguments to core types.
+// arg_mapper reports errors by returning unformattable instead of using
+// static_assert because it's used in the is_formattable trait.
+template <typename Context> struct arg_mapper {
+  using char_type = typename Context::char_type;
+
+  FMT_CONSTEXPR FMT_INLINE auto map(signed char val) -> int { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(unsigned char val) -> unsigned {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(short val) -> int { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(unsigned short val) -> unsigned {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(int val) -> int { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(unsigned val) -> unsigned { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(long val) -> long_type { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(unsigned long val) -> ulong_type {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(long long val) -> long long { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(unsigned long long val)
+      -> unsigned long long {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(int128_opt val) -> int128_opt {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(uint128_opt val) -> uint128_opt {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(bool val) -> bool { return val; }
+
+  template <typename T, FMT_ENABLE_IF(std::is_same<T, char>::value ||
+                                      std::is_same<T, char_type>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(T val) -> char_type {
+    return val;
+  }
+  template <typename T, enable_if_t<(std::is_same<T, wchar_t>::value ||
+#ifdef __cpp_char8_t
+                                     std::is_same<T, char8_t>::value ||
+#endif
+                                     std::is_same<T, char16_t>::value ||
+                                     std::is_same<T, char32_t>::value) &&
+                                        !std::is_same<T, char_type>::value,
+                                    int> = 0>
+  FMT_CONSTEXPR FMT_INLINE auto map(T) -> unformattable_char {
+    return {};
+  }
+
+  FMT_CONSTEXPR FMT_INLINE auto map(float val) -> float { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(double val) -> double { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(long double val) -> long double {
+    return val;
+  }
+
+  FMT_CONSTEXPR FMT_INLINE auto map(char_type* val) -> const char_type* {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(const char_type* val) -> const char_type* {
+    return val;
+  }
+  template <typename T,
+            FMT_ENABLE_IF(is_string<T>::value && !std::is_pointer<T>::value &&
+                          std::is_same<char_type, char_t<T>>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
+      -> basic_string_view<char_type> {
+    return to_string_view(val);
+  }
+  template <typename T,
+            FMT_ENABLE_IF(is_string<T>::value && !std::is_pointer<T>::value &&
+                          !std::is_same<char_type, char_t<T>>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T&) -> unformattable_char {
+    return {};
+  }
+
+  FMT_CONSTEXPR FMT_INLINE auto map(void* val) -> const void* { return val; }
+  FMT_CONSTEXPR FMT_INLINE auto map(const void* val) -> const void* {
+    return val;
+  }
+  FMT_CONSTEXPR FMT_INLINE auto map(std::nullptr_t val) -> const void* {
+    return val;
+  }
+
+  // Use SFINAE instead of a const T* parameter to avoid a conflict with the
+  // array overload.
+  template <
+      typename T,
+      FMT_ENABLE_IF(
+          std::is_pointer<T>::value || std::is_member_pointer<T>::value ||
+          std::is_function<typename std::remove_pointer<T>::type>::value ||
+          (std::is_array<T>::value &&
+           !std::is_convertible<T, const char_type*>::value))>
+  FMT_CONSTEXPR auto map(const T&) -> unformattable_pointer {
+    return {};
+  }
+
+  template <typename T, std::size_t N,
+            FMT_ENABLE_IF(!std::is_same<T, wchar_t>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T (&values)[N]) -> const T (&)[N] {
+    return values;
+  }
+
+  // Only map owning types because mapping views can be unsafe.
+  template <typename T, typename U = format_as_t<T>,
+            FMT_ENABLE_IF(std::is_arithmetic<U>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
+      -> decltype(FMT_DECLTYPE_THIS map(U())) {
+    return map(format_as(val));
+  }
+
+  template <typename T, typename U = remove_const_t<T>>
+  struct formattable : bool_constant<has_const_formatter<U, Context>() ||
+                                     (has_formatter<U, Context>::value &&
+                                      !std::is_const<T>::value)> {};
+
+  template <typename T, FMT_ENABLE_IF(formattable<T>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto do_map(T& val) -> T& {
+    return val;
+  }
+  template <typename T, FMT_ENABLE_IF(!formattable<T>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto do_map(T&) -> unformattable {
+    return {};
+  }
+
+  template <typename T, typename U = remove_const_t<T>,
+            FMT_ENABLE_IF((std::is_class<U>::value || std::is_enum<U>::value ||
+                           std::is_union<U>::value) &&
+                          !is_string<U>::value && !is_char<U>::value &&
+                          !is_named_arg<U>::value &&
+                          !std::is_arithmetic<format_as_t<U>>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(T& val)
+      -> decltype(FMT_DECLTYPE_THIS do_map(val)) {
+    return do_map(val);
+  }
+
+  template <typename T, FMT_ENABLE_IF(is_named_arg<T>::value)>
+  FMT_CONSTEXPR FMT_INLINE auto map(const T& named_arg)
+      -> decltype(FMT_DECLTYPE_THIS map(named_arg.value)) {
+    return map(named_arg.value);
+  }
+
+  auto map(...) -> unformattable { return {}; }
+};
+
+// A type constant after applying arg_mapper<Context>.
+template <typename T, typename Context>
+using mapped_type_constant =
+    type_constant<decltype(arg_mapper<Context>().map(std::declval<const T&>())),
+                  typename Context::char_type>;
+
+enum { packed_arg_bits = 4 };
+// Maximum number of arguments with packed types.
+enum { max_packed_args = 62 / packed_arg_bits };
+enum : unsigned long long { is_unpacked_bit = 1ULL << 63 };
+enum : unsigned long long { has_named_args_bit = 1ULL << 62 };
+
+template <typename It, typename T, typename Enable = void>
+struct is_output_iterator : std::false_type {};
+
+template <> struct is_output_iterator<appender, char> : std::true_type {};
+
+template <typename It, typename T>
+struct is_output_iterator<
+    It, T, void_t<decltype(*std::declval<It&>()++ = std::declval<T>())>>
+    : std::true_type {};
+
+template <typename It> struct is_back_insert_iterator : std::false_type {};
+template <typename Container>
+struct is_back_insert_iterator<back_insert_iterator<Container>>
+    : std::true_type {};
+
+// A type-erased reference to an std::locale to avoid a heavy <locale> include.
+class locale_ref {
+ private:
+  const void* locale_;  // A type-erased pointer to std::locale.
+
+ public:
+  constexpr FMT_INLINE locale_ref() : locale_(nullptr) {}
+  template <typename Locale> explicit locale_ref(const Locale& loc);
+
+  explicit operator bool() const noexcept { return locale_ != nullptr; }
+
+  template <typename Locale> auto get() const -> Locale;
+};
+
+template <typename> constexpr auto encode_types() -> unsigned long long {
+  return 0;
+}
+
+template <typename Context, typename Arg, typename... Args>
+constexpr auto encode_types() -> unsigned long long {
+  return static_cast<unsigned>(mapped_type_constant<Arg, Context>::value) |
+         (encode_types<Context, Args...>() << packed_arg_bits);
+}
+
+#if defined(__cpp_if_constexpr)
+// This type is intentionally undefined, only used for errors
+template <typename T, typename Char> struct type_is_unformattable_for;
+#endif
+
+template <bool PACKED, typename Context, typename T, FMT_ENABLE_IF(PACKED)>
+FMT_CONSTEXPR FMT_INLINE auto make_arg(T& val) -> value<Context> {
+  using arg_type = remove_cvref_t<decltype(arg_mapper<Context>().map(val))>;
+
+  constexpr bool formattable_char =
+      !std::is_same<arg_type, unformattable_char>::value;
+  static_assert(formattable_char, "Mixing character types is disallowed.");
+
+  // Formatting of arbitrary pointers is disallowed. If you want to format a
+  // pointer cast it to `void*` or `const void*`. In particular, this forbids
+  // formatting of `[const] volatile char*` printed as bool by iostreams.
+  constexpr bool formattable_pointer =
+      !std::is_same<arg_type, unformattable_pointer>::value;
+  static_assert(formattable_pointer,
+                "Formatting of non-void pointers is disallowed.");
+
+  constexpr bool formattable = !std::is_same<arg_type, unformattable>::value;
+#if defined(__cpp_if_constexpr)
+  if constexpr (!formattable) {
+    type_is_unformattable_for<T, typename Context::char_type> _;
+  }
+#endif
+  static_assert(
+      formattable,
+      "Cannot format an argument. To make type T formattable provide a "
+      "formatter<T> specialization: https://fmt.dev/latest/api.html#udt");
+  return {arg_mapper<Context>().map(val)};
+}
+
+template <typename Context, typename T>
+FMT_CONSTEXPR auto make_arg(T& val) -> basic_format_arg<Context> {
+  auto arg = basic_format_arg<Context>();
+  arg.type_ = mapped_type_constant<T, Context>::value;
+  arg.value_ = make_arg<true, Context>(val);
+  return arg;
+}
+
+template <bool PACKED, typename Context, typename T, FMT_ENABLE_IF(!PACKED)>
+FMT_CONSTEXPR inline auto make_arg(T& val) -> basic_format_arg<Context> {
+  return make_arg<Context>(val);
+}
+}  // namespace detail
+FMT_BEGIN_EXPORT
+
+// A formatting argument. Context is a template parameter for the compiled API
+// where output can be unbuffered.
+template <typename Context> class basic_format_arg {
+ private:
+  detail::value<Context> value_;
+  detail::type type_;
+
+  template <typename ContextType, typename T>
+  friend FMT_CONSTEXPR auto detail::make_arg(T& value)
+      -> basic_format_arg<ContextType>;
+
+  friend class basic_format_args<Context>;
+  friend class dynamic_format_arg_store<Context>;
+
+  using char_type = typename Context::char_type;
+
+  template <typename T, typename Char, size_t NUM_ARGS, size_t NUM_NAMED_ARGS>
+  friend struct detail::arg_data;
+
+  basic_format_arg(const detail::named_arg_info<char_type>* args, size_t size)
+      : value_(args, size) {}
+
+ public:
+  class handle {
+   public:
+    explicit handle(detail::custom_value<Context> custom) : custom_(custom) {}
+
+    void format(typename Context::parse_context_type& parse_ctx,
+                Context& ctx) const {
+      custom_.format(custom_.value, parse_ctx, ctx);
+    }
+
+   private:
+    detail::custom_value<Context> custom_;
+  };
+
+  constexpr basic_format_arg() : type_(detail::type::none_type) {}
+
+  constexpr explicit operator bool() const noexcept {
+    return type_ != detail::type::none_type;
+  }
+
+  auto type() const -> detail::type { return type_; }
+
+  auto is_integral() const -> bool { return detail::is_integral_type(type_); }
+  auto is_arithmetic() const -> bool {
+    return detail::is_arithmetic_type(type_);
+  }
+
+  /**
+    \rst
+    Visits an argument dispatching to the appropriate visit method based on
+    the argument type. For example, if the argument type is ``double`` then
+    ``vis(value)`` will be called with the value of type ``double``.
+    \endrst
+  */
+  template <typename Visitor>
+  FMT_CONSTEXPR auto visit(Visitor&& vis) -> decltype(vis(0)) {
+    switch (type_) {
+    case detail::type::none_type:
+      break;
+    case detail::type::int_type:
+      return vis(value_.int_value);
+    case detail::type::uint_type:
+      return vis(value_.uint_value);
+    case detail::type::long_long_type:
+      return vis(value_.long_long_value);
+    case detail::type::ulong_long_type:
+      return vis(value_.ulong_long_value);
+    case detail::type::int128_type:
+      return vis(detail::convert_for_visit(value_.int128_value));
+    case detail::type::uint128_type:
+      return vis(detail::convert_for_visit(value_.uint128_value));
+    case detail::type::bool_type:
+      return vis(value_.bool_value);
+    case detail::type::char_type:
+      return vis(value_.char_value);
+    case detail::type::float_type:
+      return vis(value_.float_value);
+    case detail::type::double_type:
+      return vis(value_.double_value);
+    case detail::type::long_double_type:
+      return vis(value_.long_double_value);
+    case detail::type::cstring_type:
+      return vis(value_.string.data);
+    case detail::type::string_type:
+      using sv = basic_string_view<typename Context::char_type>;
+      return vis(sv(value_.string.data, value_.string.size));
+    case detail::type::pointer_type:
+      return vis(value_.pointer);
+    case detail::type::custom_type:
+      return vis(typename basic_format_arg<Context>::handle(value_.custom));
+    }
+    return vis(monostate());
+  }
+
+  FMT_INLINE auto format_custom(const char_type* parse_begin,
+                                typename Context::parse_context_type& parse_ctx,
+                                Context& ctx) -> bool {
+    if (type_ != detail::type::custom_type) return false;
+    parse_ctx.advance_to(parse_begin);
+    value_.custom.format(value_.custom.value, parse_ctx, ctx);
+    return true;
+  }
+};
+
+template <typename Visitor, typename Context>
+FMT_DEPRECATED FMT_CONSTEXPR FMT_INLINE auto visit_format_arg(
+    Visitor&& vis, const basic_format_arg<Context>& arg) -> decltype(vis(0)) {
+  return arg.visit(std::forward<Visitor>(vis));
+}
+
+// Formatting context.
+template <typename OutputIt, typename Char> class basic_format_context {
+ private:
+  OutputIt out_;
+  basic_format_args<basic_format_context> args_;
+  detail::locale_ref loc_;
+
+ public:
+  using iterator = OutputIt;
+  using format_arg = basic_format_arg<basic_format_context>;
+  using format_args = basic_format_args<basic_format_context>;
+  using parse_context_type = basic_format_parse_context<Char>;
+  template <typename T> using formatter_type = formatter<T, Char>;
+
+  /** The character type for the output. */
+  using char_type = Char;
+
+  basic_format_context(basic_format_context&&) = default;
+  basic_format_context(const basic_format_context&) = delete;
+  void operator=(const basic_format_context&) = delete;
+  /**
+    Constructs a ``basic_format_context`` object. References to the arguments
+    are stored in the object so make sure they have appropriate lifetimes.
+   */
+  constexpr basic_format_context(OutputIt out, format_args ctx_args,
+                                 detail::locale_ref loc = {})
+      : out_(out), args_(ctx_args), loc_(loc) {}
+
+  constexpr auto arg(int id) const -> format_arg { return args_.get(id); }
+  FMT_CONSTEXPR auto arg(basic_string_view<Char> name) -> format_arg {
+    return args_.get(name);
+  }
+  FMT_CONSTEXPR auto arg_id(basic_string_view<Char> name) -> int {
+    return args_.get_id(name);
+  }
+  auto args() const -> const format_args& { return args_; }
+
+  // This function is intentionally not constexpr to give a compile-time error.
+  void on_error(const char* message) { throw_format_error(message); }
+
+  // Returns an iterator to the beginning of the output range.
+  FMT_CONSTEXPR auto out() -> iterator { return out_; }
+
+  // Advances the begin iterator to ``it``.
+  void advance_to(iterator it) {
+    if (!detail::is_back_insert_iterator<iterator>()) out_ = it;
+  }
+
+  FMT_CONSTEXPR auto locale() -> detail::locale_ref { return loc_; }
+};
+
+template <typename Char>
+using buffer_context =
+    basic_format_context<detail::buffer_appender<Char>, Char>;
+using format_context = buffer_context<char>;
+
+template <typename T, typename Char = char>
+using is_formattable = bool_constant<!std::is_base_of<
+    detail::unformattable, decltype(detail::arg_mapper<buffer_context<Char>>()
+                                        .map(std::declval<T&>()))>::value>;
+
+/**
+  \rst
+  An array of references to arguments. It can be implicitly converted into
+  `~fmt::basic_format_args` for passing into type-erased formatting functions
+  such as `~fmt::vformat`.
+  \endrst
+ */
+template <typename Context, typename... Args>
+class format_arg_store
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+    // Workaround a GCC template argument substitution bug.
+    : public basic_format_args<Context>
+#endif
+{
+ private:
+  static const size_t num_args = sizeof...(Args);
+  static constexpr size_t num_named_args = detail::count_named_args<Args...>();
+  static const bool is_packed = num_args <= detail::max_packed_args;
+
+  using value_type = conditional_t<is_packed, detail::value<Context>,
+                                   basic_format_arg<Context>>;
+
+  detail::arg_data<value_type, typename Context::char_type, num_args,
+                   num_named_args>
+      data_;
+
+  friend class basic_format_args<Context>;
+
+  static constexpr unsigned long long desc =
+      (is_packed ? detail::encode_types<Context, Args...>()
+                 : detail::is_unpacked_bit | num_args) |
+      (num_named_args != 0
+           ? static_cast<unsigned long long>(detail::has_named_args_bit)
+           : 0);
+
+ public:
+  template <typename... T>
+  FMT_CONSTEXPR FMT_INLINE format_arg_store(T&... args)
+      :
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+        basic_format_args<Context>(*this),
+#endif
+        data_{detail::make_arg<is_packed, Context>(args)...} {
+    if (detail::const_check(num_named_args != 0))
+      detail::init_named_args(data_.named_args(), 0, 0, args...);
+  }
+};
+
+/**
+  \rst
+  Constructs a `~fmt::format_arg_store` object that contains references to
+  arguments and can be implicitly converted to `~fmt::format_args`. `Context`
+  can be omitted in which case it defaults to `~fmt::format_context`.
+  See `~fmt::arg` for lifetime considerations.
+  \endrst
+ */
+// Arguments are taken by lvalue references to avoid some lifetime issues.
+template <typename Context = format_context, typename... T>
+constexpr auto make_format_args(T&... args)
+    -> format_arg_store<Context, remove_const_t<T>...> {
+  return {args...};
+}
+
+/**
+  \rst
+  Returns a named argument to be used in a formatting function.
+  It should only be used in a call to a formatting function or
+  `dynamic_format_arg_store::push_back`.
+
+  **Example**::
+
+    fmt::print("Elapsed time: {s:.2f} seconds", fmt::arg("s", 1.23));
+  \endrst
+ */
+template <typename Char, typename T>
+inline auto arg(const Char* name, const T& arg) -> detail::named_arg<Char, T> {
+  static_assert(!detail::is_named_arg<T>(), "nested named arguments");
+  return {name, arg};
+}
+FMT_END_EXPORT
+
+/**
+  \rst
+  A view of a collection of formatting arguments. To avoid lifetime issues it
+  should only be used as a parameter type in type-erased functions such as
+  ``vformat``::
+
+    void vlog(string_view format_str, format_args args);  // OK
+    format_args args = make_format_args();  // Error: dangling reference
+  \endrst
+ */
+template <typename Context> class basic_format_args {
+ public:
+  using size_type = int;
+  using format_arg = basic_format_arg<Context>;
+
+ private:
+  // A descriptor that contains information about formatting arguments.
+  // If the number of arguments is less or equal to max_packed_args then
+  // argument types are passed in the descriptor. This reduces binary code size
+  // per formatting function call.
+  unsigned long long desc_;
+  union {
+    // If is_packed() returns true then argument values are stored in values_;
+    // otherwise they are stored in args_. This is done to improve cache
+    // locality and reduce compiled code size since storing larger objects
+    // may require more code (at least on x86-64) even if the same amount of
+    // data is actually copied to stack. It saves ~10% on the bloat test.
+    const detail::value<Context>* values_;
+    const format_arg* args_;
+  };
+
+  constexpr auto is_packed() const -> bool {
+    return (desc_ & detail::is_unpacked_bit) == 0;
+  }
+  auto has_named_args() const -> bool {
+    return (desc_ & detail::has_named_args_bit) != 0;
+  }
+
+  FMT_CONSTEXPR auto type(int index) const -> detail::type {
+    int shift = index * detail::packed_arg_bits;
+    unsigned int mask = (1 << detail::packed_arg_bits) - 1;
+    return static_cast<detail::type>((desc_ >> shift) & mask);
+  }
+
+  constexpr FMT_INLINE basic_format_args(unsigned long long desc,
+                                         const detail::value<Context>* values)
+      : desc_(desc), values_(values) {}
+  constexpr basic_format_args(unsigned long long desc, const format_arg* args)
+      : desc_(desc), args_(args) {}
+
+ public:
+  constexpr basic_format_args() : desc_(0), args_(nullptr) {}
+
+  /**
+   \rst
+   Constructs a `basic_format_args` object from `~fmt::format_arg_store`.
+   \endrst
+   */
+  template <typename... Args>
+  constexpr FMT_INLINE basic_format_args(
+      const format_arg_store<Context, Args...>& store)
+      : basic_format_args(format_arg_store<Context, Args...>::desc,
+                          store.data_.args()) {}
+
+  /**
+   \rst
+   Constructs a `basic_format_args` object from
+   `~fmt::dynamic_format_arg_store`.
+   \endrst
+   */
+  constexpr FMT_INLINE basic_format_args(
+      const dynamic_format_arg_store<Context>& store)
+      : basic_format_args(store.get_types(), store.data()) {}
+
+  /**
+   \rst
+   Constructs a `basic_format_args` object from a dynamic set of arguments.
+   \endrst
+   */
+  constexpr basic_format_args(const format_arg* args, int count)
+      : basic_format_args(detail::is_unpacked_bit | detail::to_unsigned(count),
+                          args) {}
+
+  /** Returns the argument with the specified id. */
+  FMT_CONSTEXPR auto get(int id) const -> format_arg {
+    format_arg arg;
+    if (!is_packed()) {
+      if (id < max_size()) arg = args_[id];
+      return arg;
+    }
+    if (id >= detail::max_packed_args) return arg;
+    arg.type_ = type(id);
+    if (arg.type_ == detail::type::none_type) return arg;
+    arg.value_ = values_[id];
+    return arg;
+  }
+
+  template <typename Char>
+  auto get(basic_string_view<Char> name) const -> format_arg {
+    int id = get_id(name);
+    return id >= 0 ? get(id) : format_arg();
+  }
+
+  template <typename Char>
+  auto get_id(basic_string_view<Char> name) const -> int {
+    if (!has_named_args()) return -1;
+    const auto& named_args =
+        (is_packed() ? values_[-1] : args_[-1].value_).named_args;
+    for (size_t i = 0; i < named_args.size; ++i) {
+      if (named_args.data[i].name == name) return named_args.data[i].id;
+    }
+    return -1;
+  }
+
+  auto max_size() const -> int {
+    unsigned long long max_packed = detail::max_packed_args;
+    return static_cast<int>(is_packed() ? max_packed
+                                        : desc_ & ~detail::is_unpacked_bit);
+  }
+};
+
+/** An alias to ``basic_format_args<format_context>``. */
+// A separate type would result in shorter symbols but break ABI compatibility
+// between clang and gcc on ARM (#1919).
+FMT_EXPORT using format_args = basic_format_args<format_context>;
+
+// We cannot use enum classes as bit fields because of a gcc bug, so we put them
+// in namespaces instead (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414).
+// Additionally, if an underlying type is specified, older gcc incorrectly warns
+// that the type is too small. Both bugs are fixed in gcc 9.3.
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 903
+#  define FMT_ENUM_UNDERLYING_TYPE(type)
+#else
+#  define FMT_ENUM_UNDERLYING_TYPE(type) : type
+#endif
+namespace align {
+enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char){none, left, right, center,
+                                                  numeric};
+}
+using align_t = align::type;
+namespace sign {
+enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char){none, minus, plus, space};
+}
+using sign_t = sign::type;
+
+namespace detail {
+
+// Workaround an array initialization issue in gcc 4.8.
+template <typename Char> struct fill_t {
+ private:
+  enum { max_size = 4 };
+  Char data_[max_size] = {Char(' '), Char(0), Char(0), Char(0)};
+  unsigned char size_ = 1;
+
+ public:
+  FMT_CONSTEXPR void operator=(basic_string_view<Char> s) {
+    auto size = s.size();
+    FMT_ASSERT(size <= max_size, "invalid fill");
+    for (size_t i = 0; i < size; ++i) data_[i] = s[i];
+    size_ = static_cast<unsigned char>(size);
+  }
+
+  constexpr auto size() const -> size_t { return size_; }
+  constexpr auto data() const -> const Char* { return data_; }
+
+  FMT_CONSTEXPR auto operator[](size_t index) -> Char& { return data_[index]; }
+  FMT_CONSTEXPR auto operator[](size_t index) const -> const Char& {
+    return data_[index];
+  }
+};
+}  // namespace detail
+
+enum class presentation_type : unsigned char {
+  none,
+  dec,             // 'd'
+  oct,             // 'o'
+  hex_lower,       // 'x'
+  hex_upper,       // 'X'
+  bin_lower,       // 'b'
+  bin_upper,       // 'B'
+  hexfloat_lower,  // 'a'
+  hexfloat_upper,  // 'A'
+  exp_lower,       // 'e'
+  exp_upper,       // 'E'
+  fixed_lower,     // 'f'
+  fixed_upper,     // 'F'
+  general_lower,   // 'g'
+  general_upper,   // 'G'
+  chr,             // 'c'
+  string,          // 's'
+  pointer,         // 'p'
+  debug            // '?'
+};
+
+// Format specifiers for built-in and string types.
+template <typename Char = char> struct format_specs {
+  int width;
+  int precision;
+  presentation_type type;
+  align_t align : 4;
+  sign_t sign : 3;
+  bool alt : 1;  // Alternate form ('#').
+  bool localized : 1;
+  detail::fill_t<Char> fill;
+
+  constexpr format_specs()
+      : width(0),
+        precision(-1),
+        type(presentation_type::none),
+        align(align::none),
+        sign(sign::none),
+        alt(false),
+        localized(false) {}
+};
+
+namespace detail {
+
+enum class arg_id_kind { none, index, name };
+
+// An argument reference.
+template <typename Char> struct arg_ref {
+  FMT_CONSTEXPR arg_ref() : kind(arg_id_kind::none), val() {}
+
+  FMT_CONSTEXPR explicit arg_ref(int index)
+      : kind(arg_id_kind::index), val(index) {}
+  FMT_CONSTEXPR explicit arg_ref(basic_string_view<Char> name)
+      : kind(arg_id_kind::name), val(name) {}
+
+  FMT_CONSTEXPR auto operator=(int idx) -> arg_ref& {
+    kind = arg_id_kind::index;
+    val.index = idx;
+    return *this;
+  }
+
+  arg_id_kind kind;
+  union value {
+    FMT_CONSTEXPR value(int idx = 0) : index(idx) {}
+    FMT_CONSTEXPR value(basic_string_view<Char> n) : name(n) {}
+
+    int index;
+    basic_string_view<Char> name;
+  } val;
+};
+
+// Format specifiers with width and precision resolved at formatting rather
+// than parsing time to allow reusing the same parsed specifiers with
+// different sets of arguments (precompilation of format strings).
+template <typename Char = char>
+struct dynamic_format_specs : format_specs<Char> {
+  arg_ref<Char> width_ref;
+  arg_ref<Char> precision_ref;
+};
+
+// Converts a character to ASCII. Returns '\0' on conversion failure.
+template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value ||
+                                       std::is_enum<Char>::value)>
+constexpr auto to_ascii(Char c) -> char {
+  return c <= 0xff ? static_cast<char>(c) : '\0';
+}
+
+// Returns the number of code units in a code point or 1 on error.
+template <typename Char>
+FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
+  if (const_check(sizeof(Char) != 1)) return 1;
+  auto c = static_cast<unsigned char>(*begin);
+  return static_cast<int>((0x3a55000000000000ull >> (2 * (c >> 3))) & 0x3) + 1;
+}
+
+// Return the result via the out param to workaround gcc bug 77539.
+template <bool IS_CONSTEXPR, typename T, typename Ptr = const T*>
+FMT_CONSTEXPR auto find(Ptr first, Ptr last, T value, Ptr& out) -> bool {
+  for (out = first; out != last; ++out) {
+    if (*out == value) return true;
+  }
+  return false;
+}
+
+template <>
+inline auto find<false, char>(const char* first, const char* last, char value,
+                              const char*& out) -> bool {
+  out = static_cast<const char*>(
+      std::memchr(first, value, to_unsigned(last - first)));
+  return out != nullptr;
+}
+
+// Parses the range [begin, end) as an unsigned integer. This function assumes
+// that the range is non-empty and the first character is a digit.
+template <typename Char>
+FMT_CONSTEXPR auto parse_nonnegative_int(const Char*& begin, const Char* end,
+                                         int error_value) noexcept -> int {
+  FMT_ASSERT(begin != end && '0' <= *begin && *begin <= '9', "");
+  unsigned value = 0, prev = 0;
+  auto p = begin;
+  do {
+    prev = value;
+    value = value * 10 + unsigned(*p - '0');
+    ++p;
+  } while (p != end && '0' <= *p && *p <= '9');
+  auto num_digits = p - begin;
+  begin = p;
+  int digits10 = static_cast<int>(sizeof(int) * CHAR_BIT * 3 / 10);
+  if (num_digits <= digits10) return static_cast<int>(value);
+  // Check for overflow.
+  unsigned max = INT_MAX;
+  return num_digits == digits10 + 1 &&
+                 prev * 10ull + unsigned(p[-1] - '0') <= max
+             ? static_cast<int>(value)
+             : error_value;
+}
+
+FMT_CONSTEXPR inline auto parse_align(char c) -> align_t {
+  switch (c) {
+  case '<':
+    return align::left;
+  case '>':
+    return align::right;
+  case '^':
+    return align::center;
+  }
+  return align::none;
+}
+
+template <typename Char> constexpr auto is_name_start(Char c) -> bool {
+  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_';
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto do_parse_arg_id(const Char* begin, const Char* end,
+                                   Handler&& handler) -> const Char* {
+  Char c = *begin;
+  if (c >= '0' && c <= '9') {
+    int index = 0;
+    if (c != '0')
+      index = parse_nonnegative_int(begin, end, INT_MAX);
+    else
+      ++begin;
+    if (begin == end || (*begin != '}' && *begin != ':'))
+      throw_format_error("invalid format string");
+    else
+      handler.on_index(index);
+    return begin;
+  }
+  if (!is_name_start(c)) {
+    throw_format_error("invalid format string");
+    return begin;
+  }
+  auto it = begin;
+  do {
+    ++it;
+  } while (it != end && (is_name_start(*it) || ('0' <= *it && *it <= '9')));
+  handler.on_name({begin, to_unsigned(it - begin)});
+  return it;
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR FMT_INLINE auto parse_arg_id(const Char* begin, const Char* end,
+                                           Handler&& handler) -> const Char* {
+  FMT_ASSERT(begin != end, "");
+  Char c = *begin;
+  if (c != '}' && c != ':') return do_parse_arg_id(begin, end, handler);
+  handler.on_auto();
+  return begin;
+}
+
+template <typename Char> struct dynamic_spec_id_handler {
+  basic_format_parse_context<Char>& ctx;
+  arg_ref<Char>& ref;
+
+  FMT_CONSTEXPR void on_auto() {
+    int id = ctx.next_arg_id();
+    ref = arg_ref<Char>(id);
+    ctx.check_dynamic_spec(id);
+  }
+  FMT_CONSTEXPR void on_index(int id) {
+    ref = arg_ref<Char>(id);
+    ctx.check_arg_id(id);
+    ctx.check_dynamic_spec(id);
+  }
+  FMT_CONSTEXPR void on_name(basic_string_view<Char> id) {
+    ref = arg_ref<Char>(id);
+    ctx.check_arg_id(id);
+  }
+};
+
+// Parses [integer | "{" [arg_id] "}"].
+template <typename Char>
+FMT_CONSTEXPR auto parse_dynamic_spec(const Char* begin, const Char* end,
+                                      int& value, arg_ref<Char>& ref,
+                                      basic_format_parse_context<Char>& ctx)
+    -> const Char* {
+  FMT_ASSERT(begin != end, "");
+  if ('0' <= *begin && *begin <= '9') {
+    int val = parse_nonnegative_int(begin, end, -1);
+    if (val != -1)
+      value = val;
+    else
+      throw_format_error("number is too big");
+  } else if (*begin == '{') {
+    ++begin;
+    auto handler = dynamic_spec_id_handler<Char>{ctx, ref};
+    if (begin != end) begin = parse_arg_id(begin, end, handler);
+    if (begin != end && *begin == '}') return ++begin;
+    throw_format_error("invalid format string");
+  }
+  return begin;
+}
+
+template <typename Char>
+FMT_CONSTEXPR auto parse_precision(const Char* begin, const Char* end,
+                                   int& value, arg_ref<Char>& ref,
+                                   basic_format_parse_context<Char>& ctx)
+    -> const Char* {
+  ++begin;
+  if (begin == end || *begin == '}') {
+    throw_format_error("invalid precision");
+    return begin;
+  }
+  return parse_dynamic_spec(begin, end, value, ref, ctx);
+}
+
+enum class state { start, align, sign, hash, zero, width, precision, locale };
+
+// Parses standard format specifiers.
+template <typename Char>
+FMT_CONSTEXPR FMT_INLINE auto parse_format_specs(
+    const Char* begin, const Char* end, dynamic_format_specs<Char>& specs,
+    basic_format_parse_context<Char>& ctx, type arg_type) -> const Char* {
+  auto c = '\0';
+  if (end - begin > 1) {
+    auto next = to_ascii(begin[1]);
+    c = parse_align(next) == align::none ? to_ascii(*begin) : '\0';
+  } else {
+    if (begin == end) return begin;
+    c = to_ascii(*begin);
+  }
+
+  struct {
+    state current_state = state::start;
+    FMT_CONSTEXPR void operator()(state s, bool valid = true) {
+      if (current_state >= s || !valid)
+        throw_format_error("invalid format specifier");
+      current_state = s;
+    }
+  } enter_state;
+
+  using pres = presentation_type;
+  constexpr auto integral_set = sint_set | uint_set | bool_set | char_set;
+  struct {
+    const Char*& begin;
+    dynamic_format_specs<Char>& specs;
+    type arg_type;
+
+    FMT_CONSTEXPR auto operator()(pres pres_type, int set) -> const Char* {
+      if (!in(arg_type, set)) {
+        if (arg_type == type::none_type) return begin;
+        throw_format_error("invalid format specifier");
+      }
+      specs.type = pres_type;
+      return begin + 1;
+    }
+  } parse_presentation_type{begin, specs, arg_type};
+
+  for (;;) {
+    switch (c) {
+    case '<':
+    case '>':
+    case '^':
+      enter_state(state::align);
+      specs.align = parse_align(c);
+      ++begin;
+      break;
+    case '+':
+    case '-':
+    case ' ':
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::sign, in(arg_type, sint_set | float_set));
+      switch (c) {
+      case '+':
+        specs.sign = sign::plus;
+        break;
+      case '-':
+        specs.sign = sign::minus;
+        break;
+      case ' ':
+        specs.sign = sign::space;
+        break;
+      }
+      ++begin;
+      break;
+    case '#':
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::hash, is_arithmetic_type(arg_type));
+      specs.alt = true;
+      ++begin;
+      break;
+    case '0':
+      enter_state(state::zero);
+      if (!is_arithmetic_type(arg_type)) {
+        if (arg_type == type::none_type) return begin;
+        throw_format_error("format specifier requires numeric argument");
+      }
+      if (specs.align == align::none) {
+        // Ignore 0 if align is specified for compatibility with std::format.
+        specs.align = align::numeric;
+        specs.fill[0] = Char('0');
+      }
+      ++begin;
+      break;
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
+    case '{':
+      enter_state(state::width);
+      begin = parse_dynamic_spec(begin, end, specs.width, specs.width_ref, ctx);
+      break;
+    case '.':
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::precision,
+                  in(arg_type, float_set | string_set | cstring_set));
+      begin = parse_precision(begin, end, specs.precision, specs.precision_ref,
+                              ctx);
+      break;
+    case 'L':
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::locale, is_arithmetic_type(arg_type));
+      specs.localized = true;
+      ++begin;
+      break;
+    case 'd':
+      return parse_presentation_type(pres::dec, integral_set);
+    case 'o':
+      return parse_presentation_type(pres::oct, integral_set);
+    case 'x':
+      return parse_presentation_type(pres::hex_lower, integral_set);
+    case 'X':
+      return parse_presentation_type(pres::hex_upper, integral_set);
+    case 'b':
+      return parse_presentation_type(pres::bin_lower, integral_set);
+    case 'B':
+      return parse_presentation_type(pres::bin_upper, integral_set);
+    case 'a':
+      return parse_presentation_type(pres::hexfloat_lower, float_set);
+    case 'A':
+      return parse_presentation_type(pres::hexfloat_upper, float_set);
+    case 'e':
+      return parse_presentation_type(pres::exp_lower, float_set);
+    case 'E':
+      return parse_presentation_type(pres::exp_upper, float_set);
+    case 'f':
+      return parse_presentation_type(pres::fixed_lower, float_set);
+    case 'F':
+      return parse_presentation_type(pres::fixed_upper, float_set);
+    case 'g':
+      return parse_presentation_type(pres::general_lower, float_set);
+    case 'G':
+      return parse_presentation_type(pres::general_upper, float_set);
+    case 'c':
+      if (arg_type == type::bool_type)
+        throw_format_error("invalid format specifier");
+      return parse_presentation_type(pres::chr, integral_set);
+    case 's':
+      return parse_presentation_type(pres::string,
+                                     bool_set | string_set | cstring_set);
+    case 'p':
+      return parse_presentation_type(pres::pointer, pointer_set | cstring_set);
+    case '?':
+      return parse_presentation_type(pres::debug,
+                                     char_set | string_set | cstring_set);
+    case '}':
+      return begin;
+    default: {
+      if (*begin == '}') return begin;
+      // Parse fill and alignment.
+      auto fill_end = begin + code_point_length(begin);
+      if (end - fill_end <= 0) {
+        throw_format_error("invalid format specifier");
+        return begin;
+      }
+      if (*begin == '{') {
+        throw_format_error("invalid fill character '{'");
+        return begin;
+      }
+      auto align = parse_align(to_ascii(*fill_end));
+      enter_state(state::align, align != align::none);
+      specs.fill = {begin, to_unsigned(fill_end - begin)};
+      specs.align = align;
+      begin = fill_end + 1;
+    }
+    }
+    if (begin == end) return begin;
+    c = to_ascii(*begin);
+  }
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto parse_replacement_field(const Char* begin, const Char* end,
+                                           Handler&& handler) -> const Char* {
+  struct id_adapter {
+    Handler& handler;
+    int arg_id;
+
+    FMT_CONSTEXPR void on_auto() { arg_id = handler.on_arg_id(); }
+    FMT_CONSTEXPR void on_index(int id) { arg_id = handler.on_arg_id(id); }
+    FMT_CONSTEXPR void on_name(basic_string_view<Char> id) {
+      arg_id = handler.on_arg_id(id);
+    }
+  };
+
+  ++begin;
+  if (begin == end) return handler.on_error("invalid format string"), end;
+  if (*begin == '}') {
+    handler.on_replacement_field(handler.on_arg_id(), begin);
+  } else if (*begin == '{') {
+    handler.on_text(begin, begin + 1);
+  } else {
+    auto adapter = id_adapter{handler, 0};
+    begin = parse_arg_id(begin, end, adapter);
+    Char c = begin != end ? *begin : Char();
+    if (c == '}') {
+      handler.on_replacement_field(adapter.arg_id, begin);
+    } else if (c == ':') {
+      begin = handler.on_format_specs(adapter.arg_id, begin + 1, end);
+      if (begin == end || *begin != '}')
+        return handler.on_error("unknown format specifier"), end;
+    } else {
+      return handler.on_error("missing '}' in format string"), end;
+    }
+  }
+  return begin + 1;
+}
+
+template <bool IS_CONSTEXPR, typename Char, typename Handler>
+FMT_CONSTEXPR FMT_INLINE void parse_format_string(
+    basic_string_view<Char> format_str, Handler&& handler) {
+  auto begin = format_str.data();
+  auto end = begin + format_str.size();
+  if (end - begin < 32) {
+    // Use a simple loop instead of memchr for small strings.
+    const Char* p = begin;
+    while (p != end) {
+      auto c = *p++;
+      if (c == '{') {
+        handler.on_text(begin, p - 1);
+        begin = p = parse_replacement_field(p - 1, end, handler);
+      } else if (c == '}') {
+        if (p == end || *p != '}')
+          return handler.on_error("unmatched '}' in format string");
+        handler.on_text(begin, p);
+        begin = ++p;
+      }
+    }
+    handler.on_text(begin, end);
+    return;
+  }
+  struct writer {
+    FMT_CONSTEXPR void operator()(const Char* from, const Char* to) {
+      if (from == to) return;
+      for (;;) {
+        const Char* p = nullptr;
+        if (!find<IS_CONSTEXPR>(from, to, Char('}'), p))
+          return handler_.on_text(from, to);
+        ++p;
+        if (p == to || *p != '}')
+          return handler_.on_error("unmatched '}' in format string");
+        handler_.on_text(from, p);
+        from = p + 1;
+      }
+    }
+    Handler& handler_;
+  } write = {handler};
+  while (begin != end) {
+    // Doing two passes with memchr (one for '{' and another for '}') is up to
+    // 2.5x faster than the naive one-pass implementation on big format strings.
+    const Char* p = begin;
+    if (*begin != '{' && !find<IS_CONSTEXPR>(begin + 1, end, Char('{'), p))
+      return write(begin, end);
+    write(begin, p);
+    begin = parse_replacement_field(p, end, handler);
+  }
+}
+
+template <typename T, bool = is_named_arg<T>::value> struct strip_named_arg {
+  using type = T;
+};
+template <typename T> struct strip_named_arg<T, true> {
+  using type = remove_cvref_t<decltype(T::value)>;
+};
+
+template <typename T, typename ParseContext>
+FMT_CONSTEXPR auto parse_format_specs(ParseContext& ctx)
+    -> decltype(ctx.begin()) {
+  using char_type = typename ParseContext::char_type;
+  using context = buffer_context<char_type>;
+  using mapped_type = conditional_t<
+      mapped_type_constant<T, context>::value != type::custom_type,
+      decltype(arg_mapper<context>().map(std::declval<const T&>())),
+      typename strip_named_arg<T>::type>;
+// LAMMPS customization. Fails to compile with (some) Intel compilers
+#if defined(__cpp_if_constexpr) && 1
+  if constexpr (std::is_default_constructible<
+                    formatter<mapped_type, char_type>>::value) {
+    return formatter<mapped_type, char_type>().parse(ctx);
+  } else {
+    type_is_unformattable_for<T, char_type> _;
+    return ctx.begin();
+  }
+#else
+  return formatter<mapped_type, char_type>().parse(ctx);
+#endif
+}
+
+// Checks char specs and returns true iff the presentation type is char-like.
+template <typename Char>
+FMT_CONSTEXPR auto check_char_specs(const format_specs<Char>& specs) -> bool {
+  if (specs.type != presentation_type::none &&
+      specs.type != presentation_type::chr &&
+      specs.type != presentation_type::debug) {
+    return false;
+  }
+  if (specs.align == align::numeric || specs.sign != sign::none || specs.alt)
+    throw_format_error("invalid format specifier for char");
+  return true;
+}
+
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+template <int N, typename T, typename... Args, typename Char>
+constexpr auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
+  if constexpr (is_statically_named_arg<T>()) {
+    if (name == T::name) return N;
+  }
+  if constexpr (sizeof...(Args) > 0)
+    return get_arg_index_by_name<N + 1, Args...>(name);
+  (void)name;  // Workaround an MSVC bug about "unused" parameter.
+  return -1;
+}
+#endif
+
+template <typename... Args, typename Char>
+FMT_CONSTEXPR auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+  if constexpr (sizeof...(Args) > 0)
+    return get_arg_index_by_name<0, Args...>(name);
+#endif
+  (void)name;
+  return -1;
+}
+
+template <typename Char, typename... Args> class format_string_checker {
+ private:
+  using parse_context_type = compile_parse_context<Char>;
+  static constexpr int num_args = sizeof...(Args);
+
+  // Format specifier parsing function.
+  // In the future basic_format_parse_context will replace compile_parse_context
+  // here and will use is_constant_evaluated and downcasting to access the data
+  // needed for compile-time checks: https://godbolt.org/z/GvWzcTjh1.
+  using parse_func = const Char* (*)(parse_context_type&);
+
+  type types_[num_args > 0 ? static_cast<size_t>(num_args) : 1];
+  parse_context_type context_;
+  parse_func parse_funcs_[num_args > 0 ? static_cast<size_t>(num_args) : 1];
+
+ public:
+  explicit FMT_CONSTEXPR format_string_checker(basic_string_view<Char> fmt)
+      : types_{mapped_type_constant<Args, buffer_context<Char>>::value...},
+        context_(fmt, num_args, types_),
+        parse_funcs_{&parse_format_specs<Args, parse_context_type>...} {}
+
+  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
+
+  FMT_CONSTEXPR auto on_arg_id() -> int { return context_.next_arg_id(); }
+  FMT_CONSTEXPR auto on_arg_id(int id) -> int {
+    return context_.check_arg_id(id), id;
+  }
+  FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+    auto index = get_arg_index_by_name<Args...>(id);
+    if (index < 0) on_error("named argument is not found");
+    return index;
+#else
+    (void)id;
+    on_error("compile-time checks for named arguments require C++20 support");
+    return 0;
+#endif
+  }
+
+  FMT_CONSTEXPR void on_replacement_field(int id, const Char* begin) {
+    on_format_specs(id, begin, begin);  // Call parse() on empty specs.
+  }
+
+  FMT_CONSTEXPR auto on_format_specs(int id, const Char* begin, const Char*)
+      -> const Char* {
+    context_.advance_to(begin);
+    // id >= 0 check is a workaround for gcc 10 bug (#2065).
+    return id >= 0 && id < num_args ? parse_funcs_[id](context_) : begin;
+  }
+
+  FMT_CONSTEXPR void on_error(const char* message) {
+    throw_format_error(message);
+  }
+};
+
+// Reports a compile-time error if S is not a valid format string.
+template <typename..., typename S, FMT_ENABLE_IF(!is_compile_string<S>::value)>
+FMT_INLINE void check_format_string(const S&) {
+#ifdef FMT_ENFORCE_COMPILE_STRING
+  static_assert(is_compile_string<S>::value,
+                "FMT_ENFORCE_COMPILE_STRING requires all format strings to use "
+                "FMT_STRING.");
+#endif
+}
+template <typename... Args, typename S,
+          FMT_ENABLE_IF(is_compile_string<S>::value)>
+void check_format_string(S format_str) {
+  using char_t = typename S::char_type;
+  FMT_CONSTEXPR auto s = basic_string_view<char_t>(format_str);
+  using checker = format_string_checker<char_t, remove_cvref_t<Args>...>;
+  FMT_CONSTEXPR bool error = (parse_format_string<true>(s, checker(s)), true);
+  ignore_unused(error);
+}
+
+template <typename Char = char> struct vformat_args {
+  using type = basic_format_args<
+      basic_format_context<back_insert_iterator<buffer<Char>>, Char>>;
+};
+template <> struct vformat_args<char> {
+  using type = format_args;
+};
+
+// Use vformat_args and avoid type_identity to keep symbols short.
+template <typename Char>
+void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
+                typename vformat_args<Char>::type args, locale_ref loc = {});
+
+FMT_API void vprint_mojibake(std::FILE*, string_view, format_args);
+#ifndef _WIN32
+inline void vprint_mojibake(std::FILE*, string_view, format_args) {}
+#endif
+}  // namespace detail
+
+FMT_BEGIN_EXPORT
+
+// A formatter specialization for natively supported types.
+template <typename T, typename Char>
+struct formatter<T, Char,
+                 enable_if_t<detail::type_constant<T, Char>::value !=
+                             detail::type::custom_type>> {
+ private:
+  detail::dynamic_format_specs<Char> specs_;
+
+ public:
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const Char* {
+    auto type = detail::type_constant<T, Char>::value;
+    auto end =
+        detail::parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx, type);
+    if (type == detail::type::char_type) detail::check_char_specs(specs_);
+    return end;
+  }
+
+  template <detail::type U = detail::type_constant<T, Char>::value,
+            FMT_ENABLE_IF(U == detail::type::string_type ||
+                          U == detail::type::cstring_type ||
+                          U == detail::type::char_type)>
+  FMT_CONSTEXPR void set_debug_format(bool set = true) {
+    specs_.type = set ? presentation_type::debug : presentation_type::none;
+  }
+
+  template <typename FormatContext>
+  FMT_CONSTEXPR auto format(const T& val, FormatContext& ctx) const
+      -> decltype(ctx.out());
+};
+
+template <typename Char = char> struct runtime_format_string {
+  basic_string_view<Char> str;
+};
+
+/** A compile-time format string. */
+template <typename Char, typename... Args> class basic_format_string {
+ private:
+  basic_string_view<Char> str_;
+
+ public:
+  template <typename S,
+            FMT_ENABLE_IF(
+                std::is_convertible<const S&, basic_string_view<Char>>::value)>
+  FMT_CONSTEVAL FMT_INLINE basic_format_string(const S& s) : str_(s) {
+    static_assert(
+        detail::count<
+            (std::is_base_of<detail::view, remove_reference_t<Args>>::value &&
+             std::is_reference<Args>::value)...>() == 0,
+        "passing views as lvalues is disallowed");
+#ifdef FMT_HAS_CONSTEVAL
+    if constexpr (detail::count_named_args<Args...>() ==
+                  detail::count_statically_named_args<Args...>()) {
+      using checker =
+          detail::format_string_checker<Char, remove_cvref_t<Args>...>;
+      detail::parse_format_string<true>(str_, checker(s));
+    }
+#else
+    detail::check_format_string<Args...>(s);
+#endif
+  }
+  basic_format_string(runtime_format_string<Char> fmt) : str_(fmt.str) {}
+
+  FMT_INLINE operator basic_string_view<Char>() const { return str_; }
+  FMT_INLINE auto get() const -> basic_string_view<Char> { return str_; }
+};
+
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+// Workaround broken conversion on older gcc.
+template <typename...> using format_string = string_view;
+inline auto runtime(string_view s) -> string_view { return s; }
+#else
+template <typename... Args>
+using format_string = basic_format_string<char, type_identity_t<Args>...>;
+/**
+  \rst
+  Creates a runtime format string.
+
+  **Example**::
+
+    // Check format string at runtime instead of compile-time.
+    fmt::print(fmt::runtime("{:d}"), "I am not a number");
+  \endrst
+ */
+inline auto runtime(string_view s) -> runtime_format_string<> { return {{s}}; }
+#endif
+
+FMT_API auto vformat(string_view fmt, format_args args) -> basic_string<char>;
+
+/**
+  \rst
+  Formats ``args`` according to specifications in ``fmt`` and returns the result
+  as a string.
+
+  **Example**::
+
+    #include <fmt/core.h>
+    std::string message = fmt::format("The answer is {}.", 42);
+  \endrst
+*/
+template <typename... T>
+FMT_NODISCARD FMT_INLINE auto format(format_string<T...> fmt, T&&... args)
+    -> basic_string<char> {
+  return vformat(fmt, fmt::make_format_args(args...));
+}
+
+/** Formats a string and writes the output to ``out``. */
+template <typename OutputIt,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+auto vformat_to(OutputIt out, string_view fmt, format_args args) -> OutputIt {
+  auto&& buf = detail::get_buffer<char>(out);
+  detail::vformat_to(buf, fmt, args, {});
+  return detail::get_iterator(buf, out);
+}
+
+/**
+ \rst
+ Formats ``args`` according to specifications in ``fmt``, writes the result to
+ the output iterator ``out`` and returns the iterator past the end of the output
+ range. `format_to` does not append a terminating null character.
+
+ **Example**::
+
+   auto out = std::vector<char>();
+   fmt::format_to(fmt::back_inserter(out), "{}", 42);
+ \endrst
+ */
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+FMT_INLINE auto format_to(OutputIt out, format_string<T...> fmt, T&&... args)
+    -> OutputIt {
+  return vformat_to(out, fmt, fmt::make_format_args(args...));
+}
+
+template <typename OutputIt> struct format_to_n_result {
+  /** Iterator past the end of the output range. */
+  OutputIt out;
+  /** Total (not truncated) output size. */
+  size_t size;
+};
+
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+auto vformat_to_n(OutputIt out, size_t n, string_view fmt, format_args args)
+    -> format_to_n_result<OutputIt> {
+  using traits = detail::fixed_buffer_traits;
+  auto buf = detail::iterator_buffer<OutputIt, char, traits>(out, n);
+  detail::vformat_to(buf, fmt, args, {});
+  return {buf.out(), buf.count()};
+}
+
+/**
+  \rst
+  Formats ``args`` according to specifications in ``fmt``, writes up to ``n``
+  characters of the result to the output iterator ``out`` and returns the total
+  (not truncated) output size and the iterator past the end of the output range.
+  `format_to_n` does not append a terminating null character.
+  \endrst
+ */
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+FMT_INLINE auto format_to_n(OutputIt out, size_t n, format_string<T...> fmt,
+                            T&&... args) -> format_to_n_result<OutputIt> {
+  return vformat_to_n(out, n, fmt, fmt::make_format_args(args...));
+}
+
+/** Returns the number of chars in the output of ``format(fmt, args...)``. */
+template <typename... T>
+FMT_NODISCARD FMT_INLINE auto formatted_size(format_string<T...> fmt,
+                                             T&&... args) -> size_t {
+  auto buf = detail::counting_buffer<>();
+  detail::vformat_to<char>(buf, fmt, fmt::make_format_args(args...), {});
+  return buf.count();
+}
+
+FMT_API void vprint(string_view fmt, format_args args);
+FMT_API void vprint(std::FILE* f, string_view fmt, format_args args);
+
+/**
+  \rst
+  Formats ``args`` according to specifications in ``fmt`` and writes the output
+  to ``stdout``.
+
+  **Example**::
+
+    fmt::print("Elapsed time: {0:.2f} seconds", 1.23);
+  \endrst
+ */
+template <typename... T>
+FMT_INLINE void print(format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  return detail::is_utf8() ? vprint(fmt, vargs)
+                           : detail::vprint_mojibake(stdout, fmt, vargs);
+}
+
+/**
+  \rst
+  Formats ``args`` according to specifications in ``fmt`` and writes the
+  output to the file ``f``.
+
+  **Example**::
+
+    fmt::print(stderr, "Don't {}!", "panic");
+  \endrst
+ */
+template <typename... T>
+FMT_INLINE void print(std::FILE* f, format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  return detail::is_utf8() ? vprint(f, fmt, vargs)
+                           : detail::vprint_mojibake(f, fmt, vargs);
+}
+
+/**
+  Formats ``args`` according to specifications in ``fmt`` and writes the
+  output to the file ``f`` followed by a newline.
+ */
+template <typename... T>
+FMT_INLINE void println(std::FILE* f, format_string<T...> fmt, T&&... args) {
+  return fmt::print(f, "{}\n", fmt::format(fmt, std::forward<T>(args)...));
+}
+
+/**
+  Formats ``args`` according to specifications in ``fmt`` and writes the output
+  to ``stdout`` followed by a newline.
+ */
+template <typename... T>
+FMT_INLINE void println(format_string<T...> fmt, T&&... args) {
+  return fmt::println(stdout, fmt, std::forward<T>(args)...);
+}
+
+FMT_END_EXPORT
+FMT_GCC_PRAGMA("GCC pop_options")
+FMT_END_NAMESPACE
+
+#ifdef FMT_HEADER_ONLY
+#  include "format.h"
+#endif
+#endif  // FMT_CORE_H_
diff --git a/src/fmt/format-inl.h b/src/fmt/format-inl.h
index a5b79dbe49..8da1c17f36 100644
--- a/src/fmt/format-inl.h
+++ b/src/fmt/format-inl.h
@@ -8,36 +8,32 @@
 #ifndef FMT_FORMAT_INL_H_
 #define FMT_FORMAT_INL_H_
 
-#ifndef FMT_MODULE
-#  include <algorithm>
-#  include <cerrno>  // errno
-#  include <climits>
-#  include <cmath>
-#  include <exception>
+#include <algorithm>
+#include <cerrno>  // errno
+#include <climits>
+#include <cmath>
+#include <exception>
+
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
+#  include <locale>
 #endif
 
-#if defined(_WIN32) && !defined(FMT_USE_WRITE_CONSOLE)
+#if defined(_WIN32) && !defined(FMT_WINDOWS_NO_WCHAR)
 #  include <io.h>  // _isatty
 #endif
 
 #include "format.h"
 
-#if FMT_USE_LOCALE
-#  include <locale>
-#endif
-
-#ifndef FMT_FUNC
-#  define FMT_FUNC
-#endif
-
 FMT_BEGIN_NAMESPACE
 namespace detail {
 
 FMT_FUNC void assert_fail(const char* file, int line, const char* message) {
   // Use unchecked std::fprintf to avoid triggering another assertion when
-  // writing to stderr fails.
-  fprintf(stderr, "%s:%d: assertion failed: %s", file, line, message);
-  abort();
+  // writing to stderr fails
+  std::fprintf(stderr, "%s:%d: assertion failed: %s", file, line, message);
+  // Chosen instead of std::abort to satisfy Clang in CUDA mode during device
+  // code pass.
+  std::terminate();
 }
 
 FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
@@ -56,105 +52,93 @@ FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
     ++error_code_size;
   }
   error_code_size += detail::to_unsigned(detail::count_digits(abs_value));
-  auto it = appender(out);
+  auto it = buffer_appender<char>(out);
   if (message.size() <= inline_buffer_size - error_code_size)
     fmt::format_to(it, FMT_STRING("{}{}"), message, SEP);
   fmt::format_to(it, FMT_STRING("{}{}"), ERROR_STR, error_code);
   FMT_ASSERT(out.size() <= inline_buffer_size, "");
 }
 
-FMT_FUNC void do_report_error(format_func func, int error_code,
-                              const char* message) noexcept {
+FMT_FUNC void report_error(format_func func, int error_code,
+                           const char* message) noexcept {
   memory_buffer full_message;
   func(full_message, error_code, message);
-  // Don't use fwrite_all because the latter may throw.
+  // Don't use fwrite_fully because the latter may throw.
   if (std::fwrite(full_message.data(), full_message.size(), 1, stderr) > 0)
     std::fputc('\n', stderr);
 }
 
 // A wrapper around fwrite that throws on error.
-inline void fwrite_all(const void* ptr, size_t count, FILE* stream) {
+inline void fwrite_fully(const void* ptr, size_t count, FILE* stream) {
   size_t written = std::fwrite(ptr, 1, count, stream);
   if (written < count)
     FMT_THROW(system_error(errno, FMT_STRING("cannot write to file")));
 }
 
-#if FMT_USE_LOCALE
-using std::locale;
-using std::numpunct;
-using std::use_facet;
-
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
 template <typename Locale>
 locale_ref::locale_ref(const Locale& loc) : locale_(&loc) {
-  static_assert(std::is_same<Locale, locale>::value, "");
+  static_assert(std::is_same<Locale, std::locale>::value, "");
 }
-#else
-struct locale {};
-template <typename Char> struct numpunct {
-  auto grouping() const -> std::string { return "\03"; }
-  auto thousands_sep() const -> Char { return ','; }
-  auto decimal_point() const -> Char { return '.'; }
-};
-template <typename Facet> Facet use_facet(locale) { return {}; }
-#endif  // FMT_USE_LOCALE
 
 template <typename Locale> auto locale_ref::get() const -> Locale {
-  static_assert(std::is_same<Locale, locale>::value, "");
-#if FMT_USE_LOCALE
-  if (locale_) return *static_cast<const locale*>(locale_);
-#endif
-  return locale();
+  static_assert(std::is_same<Locale, std::locale>::value, "");
+  return locale_ ? *static_cast<const std::locale*>(locale_) : std::locale();
 }
 
 template <typename Char>
 FMT_FUNC auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result<Char> {
-  auto&& facet = use_facet<numpunct<Char>>(loc.get<locale>());
+  auto& facet = std::use_facet<std::numpunct<Char>>(loc.get<std::locale>());
   auto grouping = facet.grouping();
   auto thousands_sep = grouping.empty() ? Char() : facet.thousands_sep();
   return {std::move(grouping), thousands_sep};
 }
 template <typename Char>
 FMT_FUNC auto decimal_point_impl(locale_ref loc) -> Char {
-  return use_facet<numpunct<Char>>(loc.get<locale>()).decimal_point();
+  return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
+      .decimal_point();
 }
+#else
+template <typename Char>
+FMT_FUNC auto thousands_sep_impl(locale_ref) -> thousands_sep_result<Char> {
+  return {"\03", FMT_STATIC_THOUSANDS_SEPARATOR};
+}
+template <typename Char> FMT_FUNC Char decimal_point_impl(locale_ref) {
+  return '.';
+}
+#endif
 
-#if FMT_USE_LOCALE
 FMT_FUNC auto write_loc(appender out, loc_value value,
-                        const format_specs& specs, locale_ref loc) -> bool {
+                        const format_specs<>& specs, locale_ref loc) -> bool {
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
   auto locale = loc.get<std::locale>();
   // We cannot use the num_put<char> facet because it may produce output in
   // a wrong encoding.
   using facet = format_facet<std::locale>;
   if (std::has_facet<facet>(locale))
-    return use_facet<facet>(locale).put(out, value, specs);
+    return std::use_facet<facet>(locale).put(out, value, specs);
   return facet(locale).put(out, value, specs);
-}
 #endif
+  return false;
+}
 }  // namespace detail
 
-FMT_FUNC void report_error(const char* message) {
-#if FMT_USE_EXCEPTIONS
-  // Use FMT_THROW instead of throw to avoid bogus unreachable code warnings
-  // from MSVC.
+FMT_FUNC void throw_format_error(const char* message) {
   FMT_THROW(format_error(message));
-#else
-  fputs(message, stderr);
-  abort();
-#endif
 }
 
 template <typename Locale> typename Locale::id format_facet<Locale>::id;
 
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
 template <typename Locale> format_facet<Locale>::format_facet(Locale& loc) {
-  auto& np = detail::use_facet<detail::numpunct<char>>(loc);
-  grouping_ = np.grouping();
-  if (!grouping_.empty()) separator_ = std::string(1, np.thousands_sep());
+  auto& numpunct = std::use_facet<std::numpunct<char>>(loc);
+  grouping_ = numpunct.grouping();
+  if (!grouping_.empty()) separator_ = std::string(1, numpunct.thousands_sep());
 }
 
-#if FMT_USE_LOCALE
 template <>
 FMT_API FMT_FUNC auto format_facet<std::locale>::do_put(
-    appender out, loc_value val, const format_specs& specs) const -> bool {
+    appender out, loc_value val, const format_specs<>& specs) const -> bool {
   return val.visit(
       detail::loc_writer<>{out, specs, separator_, grouping_, decimal_point_});
 }
@@ -1427,7 +1411,7 @@ FMT_FUNC void format_system_error(detail::buffer<char>& out, int error_code,
                                   const char* message) noexcept {
   FMT_TRY {
     auto ec = std::error_code(error_code, std::generic_category());
-    detail::write(appender(out), std::system_error(ec, message).what());
+    write(std::back_inserter(out), std::system_error(ec, message).what());
     return;
   }
   FMT_CATCH(...) {}
@@ -1436,7 +1420,7 @@ FMT_FUNC void format_system_error(detail::buffer<char>& out, int error_code,
 
 FMT_FUNC void report_system_error(int error_code,
                                   const char* message) noexcept {
-  do_report_error(format_system_error, error_code, message);
+  report_error(format_system_error, error_code, message);
 }
 
 FMT_FUNC auto vformat(string_view fmt, format_args args) -> std::string {
@@ -1448,251 +1432,7 @@ FMT_FUNC auto vformat(string_view fmt, format_args args) -> std::string {
 }
 
 namespace detail {
-
-FMT_FUNC void vformat_to(buffer<char>& buf, string_view fmt, format_args args,
-                         locale_ref loc) {
-  auto out = appender(buf);
-  if (fmt.size() == 2 && equal2(fmt.data(), "{}"))
-    return args.get(0).visit(default_arg_formatter<char>{out});
-  parse_format_string(
-      fmt, format_handler<char>{parse_context<char>(fmt), {out, args, loc}});
-}
-
-template <typename T> struct span {
-  T* data;
-  size_t size;
-};
-
-template <typename F> auto flockfile(F* f) -> decltype(_lock_file(f)) {
-  _lock_file(f);
-}
-template <typename F> auto funlockfile(F* f) -> decltype(_unlock_file(f)) {
-  _unlock_file(f);
-}
-
-#ifndef getc_unlocked
-template <typename F> auto getc_unlocked(F* f) -> decltype(_fgetc_nolock(f)) {
-  return _fgetc_nolock(f);
-}
-#endif
-
-template <typename F = FILE, typename Enable = void>
-struct has_flockfile : std::false_type {};
-
-template <typename F>
-struct has_flockfile<F, void_t<decltype(flockfile(&std::declval<F&>()))>>
-    : std::true_type {};
-
-// A FILE wrapper. F is FILE defined as a template parameter to make system API
-// detection work.
-template <typename F> class file_base {
- public:
-  F* file_;
-
- public:
-  file_base(F* file) : file_(file) {}
-  operator F*() const { return file_; }
-
-  // Reads a code unit from the stream.
-  auto get() -> int {
-    int result = getc_unlocked(file_);
-    if (result == EOF && ferror(file_) != 0)
-      FMT_THROW(system_error(errno, FMT_STRING("getc failed")));
-    return result;
-  }
-
-  // Puts the code unit back into the stream buffer.
-  void unget(char c) {
-    if (ungetc(c, file_) == EOF)
-      FMT_THROW(system_error(errno, FMT_STRING("ungetc failed")));
-  }
-
-  void flush() { fflush(this->file_); }
-};
-
-// A FILE wrapper for glibc.
-template <typename F> class glibc_file : public file_base<F> {
- private:
-  enum {
-    line_buffered = 0x200,  // _IO_LINE_BUF
-    unbuffered = 2          // _IO_UNBUFFERED
-  };
-
- public:
-  using file_base<F>::file_base;
-
-  auto is_buffered() const -> bool {
-    return (this->file_->_flags & unbuffered) == 0;
-  }
-
-  void init_buffer() {
-    if (this->file_->_IO_write_ptr) return;
-    // Force buffer initialization by placing and removing a char in a buffer.
-    assume(this->file_->_IO_write_ptr >= this->file_->_IO_write_end);
-    putc_unlocked(0, this->file_);
-    --this->file_->_IO_write_ptr;
-  }
-
-  // Returns the file's read buffer.
-  auto get_read_buffer() const -> span<const char> {
-    auto ptr = this->file_->_IO_read_ptr;
-    return {ptr, to_unsigned(this->file_->_IO_read_end - ptr)};
-  }
-
-  // Returns the file's write buffer.
-  auto get_write_buffer() const -> span<char> {
-    auto ptr = this->file_->_IO_write_ptr;
-    return {ptr, to_unsigned(this->file_->_IO_buf_end - ptr)};
-  }
-
-  void advance_write_buffer(size_t size) { this->file_->_IO_write_ptr += size; }
-
-  bool needs_flush() const {
-    if ((this->file_->_flags & line_buffered) == 0) return false;
-    char* end = this->file_->_IO_write_end;
-    return memchr(end, '\n', to_unsigned(this->file_->_IO_write_ptr - end));
-  }
-
-  void flush() { fflush_unlocked(this->file_); }
-};
-
-// A FILE wrapper for Apple's libc.
-template <typename F> class apple_file : public file_base<F> {
- private:
-  enum {
-    line_buffered = 1,  // __SNBF
-    unbuffered = 2      // __SLBF
-  };
-
- public:
-  using file_base<F>::file_base;
-
-  auto is_buffered() const -> bool {
-    return (this->file_->_flags & unbuffered) == 0;
-  }
-
-  void init_buffer() {
-    if (this->file_->_p) return;
-    // Force buffer initialization by placing and removing a char in a buffer.
-    putc_unlocked(0, this->file_);
-    --this->file_->_p;
-    ++this->file_->_w;
-  }
-
-  auto get_read_buffer() const -> span<const char> {
-    return {reinterpret_cast<char*>(this->file_->_p),
-            to_unsigned(this->file_->_r)};
-  }
-
-  auto get_write_buffer() const -> span<char> {
-    return {reinterpret_cast<char*>(this->file_->_p),
-            to_unsigned(this->file_->_bf._base + this->file_->_bf._size -
-                        this->file_->_p)};
-  }
-
-  void advance_write_buffer(size_t size) {
-    this->file_->_p += size;
-    this->file_->_w -= size;
-  }
-
-  bool needs_flush() const {
-    if ((this->file_->_flags & line_buffered) == 0) return false;
-    return memchr(this->file_->_p + this->file_->_w, '\n',
-                  to_unsigned(-this->file_->_w));
-  }
-};
-
-// A fallback FILE wrapper.
-template <typename F> class fallback_file : public file_base<F> {
- private:
-  char next_;  // The next unconsumed character in the buffer.
-  bool has_next_ = false;
-
- public:
-  using file_base<F>::file_base;
-
-  auto is_buffered() const -> bool { return false; }
-  auto needs_flush() const -> bool { return false; }
-  void init_buffer() {}
-
-  auto get_read_buffer() const -> span<const char> {
-    return {&next_, has_next_ ? 1u : 0u};
-  }
-
-  auto get_write_buffer() const -> span<char> { return {nullptr, 0}; }
-
-  void advance_write_buffer(size_t) {}
-
-  auto get() -> int {
-    has_next_ = false;
-    return file_base<F>::get();
-  }
-
-  void unget(char c) {
-    file_base<F>::unget(c);
-    next_ = c;
-    has_next_ = true;
-  }
-};
-
-#ifndef FMT_USE_FALLBACK_FILE
-#  define FMT_USE_FALLBACK_FILE 0
-#endif
-
-template <typename F,
-          FMT_ENABLE_IF(sizeof(F::_p) != 0 && !FMT_USE_FALLBACK_FILE)>
-auto get_file(F* f, int) -> apple_file<F> {
-  return f;
-}
-template <typename F,
-          FMT_ENABLE_IF(sizeof(F::_IO_read_ptr) != 0 && !FMT_USE_FALLBACK_FILE)>
-inline auto get_file(F* f, int) -> glibc_file<F> {
-  return f;
-}
-
-inline auto get_file(FILE* f, ...) -> fallback_file<FILE> { return f; }
-
-using file_ref = decltype(get_file(static_cast<FILE*>(nullptr), 0));
-
-template <typename F = FILE, typename Enable = void>
-class file_print_buffer : public buffer<char> {
- public:
-  explicit file_print_buffer(F*) : buffer(nullptr, size_t()) {}
-};
-
-template <typename F>
-class file_print_buffer<F, enable_if_t<has_flockfile<F>::value>>
-    : public buffer<char> {
- private:
-  file_ref file_;
-
-  static void grow(buffer<char>& base, size_t) {
-    auto& self = static_cast<file_print_buffer&>(base);
-    self.file_.advance_write_buffer(self.size());
-    if (self.file_.get_write_buffer().size == 0) self.file_.flush();
-    auto buf = self.file_.get_write_buffer();
-    FMT_ASSERT(buf.size > 0, "");
-    self.set(buf.data, buf.size);
-    self.clear();
-  }
-
- public:
-  explicit file_print_buffer(F* f) : buffer(grow, size_t()), file_(f) {
-    flockfile(f);
-    file_.init_buffer();
-    auto buf = file_.get_write_buffer();
-    set(buf.data, buf.size);
-  }
-  ~file_print_buffer() {
-    file_.advance_write_buffer(size());
-    bool flush = file_.needs_flush();
-    F* f = file_;    // Make funlockfile depend on the template parameter F
-    funlockfile(f);  // for the system API detection to work.
-    if (flush) fflush(file_);
-  }
-};
-
-#if !defined(_WIN32) || defined(FMT_USE_WRITE_CONSOLE)
+#if !defined(_WIN32) || defined(FMT_WINDOWS_NO_WCHAR)
 FMT_FUNC auto write_console(int, string_view) -> bool { return false; }
 #else
 using dword = conditional_t<sizeof(long) == 4, unsigned long, unsigned>;
@@ -1708,44 +1448,28 @@ FMT_FUNC bool write_console(int fd, string_view text) {
 
 #ifdef _WIN32
 // Print assuming legacy (non-Unicode) encoding.
-FMT_FUNC void vprint_mojibake(std::FILE* f, string_view fmt, format_args args,
-                              bool newline) {
+FMT_FUNC void vprint_mojibake(std::FILE* f, string_view fmt, format_args args) {
   auto buffer = memory_buffer();
   detail::vformat_to(buffer, fmt, args);
-  if (newline) buffer.push_back('\n');
-  fwrite_all(buffer.data(), buffer.size(), f);
+  fwrite_fully(buffer.data(), buffer.size(), f);
 }
 #endif
 
 FMT_FUNC void print(std::FILE* f, string_view text) {
-#if defined(_WIN32) && !defined(FMT_USE_WRITE_CONSOLE)
+#ifdef _WIN32
   int fd = _fileno(f);
   if (_isatty(fd)) {
     std::fflush(f);
     if (write_console(fd, text)) return;
   }
 #endif
-  fwrite_all(text.data(), text.size(), f);
+  fwrite_fully(text.data(), text.size(), f);
 }
 }  // namespace detail
 
-FMT_FUNC void vprint_buffered(std::FILE* f, string_view fmt, format_args args) {
-  auto buffer = memory_buffer();
-  detail::vformat_to(buffer, fmt, args);
-  detail::print(f, {buffer.data(), buffer.size()});
-}
-
 FMT_FUNC void vprint(std::FILE* f, string_view fmt, format_args args) {
-  if (!detail::file_ref(f).is_buffered() || !detail::has_flockfile<>())
-    return vprint_buffered(f, fmt, args);
-  auto&& buffer = detail::file_print_buffer<>(f);
-  return detail::vformat_to(buffer, fmt, args);
-}
-
-FMT_FUNC void vprintln(std::FILE* f, string_view fmt, format_args args) {
   auto buffer = memory_buffer();
   detail::vformat_to(buffer, fmt, args);
-  buffer.push_back('\n');
   detail::print(f, {buffer.data(), buffer.size()});
 }
 
diff --git a/src/fmt/format.h b/src/fmt/format.h
index 92a1d5b7a0..8cdf95b7bd 100644
--- a/src/fmt/format.h
+++ b/src/fmt/format.h
@@ -33,58 +33,30 @@
 #ifndef FMT_FORMAT_H_
 #define FMT_FORMAT_H_
 
-#ifndef _LIBCPP_REMOVE_TRANSITIVE_INCLUDES
-#  define _LIBCPP_REMOVE_TRANSITIVE_INCLUDES
-#  define FMT_REMOVE_TRANSITIVE_INCLUDES
+#include <cmath>             // std::signbit
+#include <cstdint>           // uint32_t
+#include <cstring>           // std::memcpy
+#include <initializer_list>  // std::initializer_list
+#include <iterator>
+#include <limits>        // std::numeric_limits
+#include <memory>        // std::uninitialized_copy
+#include <stdexcept>     // std::runtime_error
+#include <system_error>  // std::system_error
+
+#ifdef __cpp_lib_bit_cast
+#  include <bit>  // std::bit_cast
 #endif
 
-#include "base.h"
-
-#ifndef FMT_MODULE
-#  include <cmath>    // std::signbit
-#  include <cstddef>  // std::byte
-#  include <cstdint>  // uint32_t
-#  include <cstring>  // std::memcpy
-#  include <limits>   // std::numeric_limits
-#  include <new>      // std::bad_alloc
-#  if defined(__GLIBCXX__) && !defined(_GLIBCXX_USE_DUAL_ABI)
-// Workaround for pre gcc 5 libstdc++.
-#    include <memory>  // std::allocator_traits
-#  endif
-#  include <stdexcept>     // std::runtime_error
-#  include <string>        // std::string
-#  include <system_error>  // std::system_error
-
-// Check FMT_CPLUSPLUS to avoid a warning in MSVC.
-#  if FMT_HAS_INCLUDE(<bit>) && FMT_CPLUSPLUS > 201703L
-#    include <bit>  // std::bit_cast
-#  endif
+#include "core.h"
 
 // libc++ supports string_view in pre-c++17.
-#  if FMT_HAS_INCLUDE(<string_view>) && \
-      (FMT_CPLUSPLUS >= 201703L || defined(_LIBCPP_VERSION))
-#    include <string_view>
-#    define FMT_USE_STRING_VIEW
-#  endif
-
-#  if FMT_MSC_VERSION
-#    include <intrin.h>  // _BitScanReverse[64], _umul128
-#  endif
-#endif  // FMT_MODULE
-
-#if defined(FMT_USE_NONTYPE_TEMPLATE_ARGS)
-// Use the provided definition.
-#elif defined(__NVCOMPILER)
-#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 0
-#elif FMT_GCC_VERSION >= 903 && FMT_CPLUSPLUS >= 201709L
-#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
-#elif defined(__cpp_nontype_template_args) && \
-    __cpp_nontype_template_args >= 201911L
-#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
-#elif FMT_CLANG_VERSION >= 1200 && FMT_CPLUSPLUS >= 202002L
-#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
-#else
-#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 0
+#if FMT_HAS_INCLUDE(<string_view>) && \
+    (FMT_CPLUSPLUS >= 201703L || defined(_LIBCPP_VERSION))
+#  include <string_view>
+#  define FMT_USE_STRING_VIEW
+#elif FMT_HAS_INCLUDE("experimental/string_view") && FMT_CPLUSPLUS >= 201402L
+#  include <experimental/string_view>
+#  define FMT_USE_EXPERIMENTAL_STRING_VIEW
 #endif
 
 #if defined __cpp_inline_variables && __cpp_inline_variables >= 201606L
@@ -93,15 +65,29 @@
 #  define FMT_INLINE_VARIABLE
 #endif
 
-// Check if RTTI is disabled.
-#ifdef FMT_USE_RTTI
-// Use the provided definition.
-#elif defined(__GXX_RTTI) || FMT_HAS_FEATURE(cxx_rtti) || defined(_CPPRTTI) || \
-    defined(__INTEL_RTTI__) || defined(__RTTI)
-// __RTTI is for EDG compilers. _CPPRTTI is for MSVC.
-#  define FMT_USE_RTTI 1
+#if FMT_HAS_CPP17_ATTRIBUTE(fallthrough)
+#  define FMT_FALLTHROUGH [[fallthrough]]
+#elif defined(__clang__)
+#  define FMT_FALLTHROUGH [[clang::fallthrough]]
+#elif FMT_GCC_VERSION >= 700 && \
+    (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520)
+#  define FMT_FALLTHROUGH [[gnu::fallthrough]]
 #else
-#  define FMT_USE_RTTI 0
+#  define FMT_FALLTHROUGH
+#endif
+
+#ifndef FMT_NO_UNIQUE_ADDRESS
+#  if FMT_CPLUSPLUS >= 202002L
+#    if FMT_HAS_CPP_ATTRIBUTE(no_unique_address)
+#      define FMT_NO_UNIQUE_ADDRESS [[no_unique_address]]
+// VS2019 v16.10 and later except clang-cl (https://reviews.llvm.org/D110485).
+#    elif (FMT_MSC_VERSION >= 1929) && !FMT_CLANG_VERSION
+#      define FMT_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]]
+#    endif
+#  endif
+#endif
+#ifndef FMT_NO_UNIQUE_ADDRESS
+#  define FMT_NO_UNIQUE_ADDRESS
 #endif
 
 // Visibility when compiled as a shared library/object.
@@ -111,25 +97,20 @@
 #  define FMT_SO_VISIBILITY(value)
 #endif
 
+#ifdef __has_builtin
+#  define FMT_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define FMT_HAS_BUILTIN(x) 0
+#endif
+
 #if FMT_GCC_VERSION || FMT_CLANG_VERSION
 #  define FMT_NOINLINE __attribute__((noinline))
 #else
 #  define FMT_NOINLINE
 #endif
 
-namespace std {
-template <typename T> struct iterator_traits<fmt::basic_appender<T>> {
-  using iterator_category = output_iterator_tag;
-  using value_type = T;
-  using difference_type =
-      decltype(static_cast<int*>(nullptr) - static_cast<int*>(nullptr));
-  using pointer = void;
-  using reference = void;
-};
-}  // namespace std
-
 #ifndef FMT_THROW
-#  if FMT_USE_EXCEPTIONS
+#  if FMT_EXCEPTIONS
 #    if FMT_MSC_VERSION || defined(__NVCC__)
 FMT_BEGIN_NAMESPACE
 namespace detail {
@@ -148,8 +129,38 @@ FMT_END_NAMESPACE
 #  else
 #    define FMT_THROW(x) \
       ::fmt::detail::assert_fail(__FILE__, __LINE__, (x).what())
-#  endif  // FMT_USE_EXCEPTIONS
-#endif    // FMT_THROW
+#  endif
+#endif
+
+#if FMT_EXCEPTIONS
+#  define FMT_TRY try
+#  define FMT_CATCH(x) catch (x)
+#else
+#  define FMT_TRY if (true)
+#  define FMT_CATCH(x) if (false)
+#endif
+
+#ifndef FMT_MAYBE_UNUSED
+#  if FMT_HAS_CPP17_ATTRIBUTE(maybe_unused)
+#    define FMT_MAYBE_UNUSED [[maybe_unused]]
+#  else
+#    define FMT_MAYBE_UNUSED
+#  endif
+#endif
+
+#ifndef FMT_USE_USER_DEFINED_LITERALS
+// EDG based compilers (Intel, NVIDIA, Elbrus, etc), GCC and MSVC support UDLs.
+//
+// GCC before 4.9 requires a space in `operator"" _a` which is invalid in later
+// compiler versions.
+#  if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 409 || \
+       FMT_MSC_VERSION >= 1900) &&                                     \
+      (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= /* UDL feature */ 480)
+#    define FMT_USE_USER_DEFINED_LITERALS 1
+#  else
+#    define FMT_USE_USER_DEFINED_LITERALS 0
+#  endif
+#endif
 
 // Defining FMT_REDUCE_INT_INSTANTIATIONS to 1, will reduce the number of
 // integer formatter template instantiations to just one by only using the
@@ -159,15 +170,7 @@ FMT_END_NAMESPACE
 #  define FMT_REDUCE_INT_INSTANTIATIONS 0
 #endif
 
-FMT_BEGIN_NAMESPACE
-
-template <typename Char, typename Traits, typename Allocator>
-struct is_contiguous<std::basic_string<Char, Traits, Allocator>>
-    : std::true_type {};
-
-namespace detail {
-
-// __builtin_clz is broken in clang with Microsoft codegen:
+// __builtin_clz is broken in clang with Microsoft CodeGen:
 // https://github.com/fmtlib/fmt/issues/519.
 #if !FMT_MSC_VERSION
 #  if FMT_HAS_BUILTIN(__builtin_clz) || FMT_GCC_VERSION || FMT_ICC_VERSION
@@ -178,30 +181,53 @@ namespace detail {
 #  endif
 #endif
 
-// Some compilers masquerade as both MSVC and GCC but otherwise support
+// __builtin_ctz is broken in Intel Compiler Classic on Windows:
+// https://github.com/fmtlib/fmt/issues/2510.
+#ifndef __ICL
+#  if FMT_HAS_BUILTIN(__builtin_ctz) || FMT_GCC_VERSION || FMT_ICC_VERSION || \
+      defined(__NVCOMPILER)
+#    define FMT_BUILTIN_CTZ(n) __builtin_ctz(n)
+#  endif
+#  if FMT_HAS_BUILTIN(__builtin_ctzll) || FMT_GCC_VERSION || \
+      FMT_ICC_VERSION || defined(__NVCOMPILER)
+#    define FMT_BUILTIN_CTZLL(n) __builtin_ctzll(n)
+#  endif
+#endif
+
+#if FMT_MSC_VERSION
+#  include <intrin.h>  // _BitScanReverse[64], _BitScanForward[64], _umul128
+#endif
+
+// Some compilers masquerade as both MSVC and GCC-likes or otherwise support
 // __builtin_clz and __builtin_clzll, so only define FMT_BUILTIN_CLZ using the
 // MSVC intrinsics if the clz and clzll builtins are not available.
-#if FMT_MSC_VERSION && !defined(FMT_BUILTIN_CLZLL)
+#if FMT_MSC_VERSION && !defined(FMT_BUILTIN_CLZLL) && \
+    !defined(FMT_BUILTIN_CTZLL)
+FMT_BEGIN_NAMESPACE
+namespace detail {
 // Avoid Clang with Microsoft CodeGen's -Wunknown-pragmas warning.
-#  ifndef __clang__
+#  if !defined(__clang__)
+#    pragma intrinsic(_BitScanForward)
 #    pragma intrinsic(_BitScanReverse)
-#    ifdef _WIN64
+#    if defined(_WIN64)
+#      pragma intrinsic(_BitScanForward64)
 #      pragma intrinsic(_BitScanReverse64)
 #    endif
 #  endif
 
 inline auto clz(uint32_t x) -> int {
-  FMT_ASSERT(x != 0, "");
-  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
   unsigned long r = 0;
   _BitScanReverse(&r, x);
+  FMT_ASSERT(x != 0, "");
+  // Static analysis complains about using uninitialized data
+  // "r", but the only way that can happen is if "x" is 0,
+  // which the callers guarantee to not happen.
+  FMT_MSC_WARNING(suppress : 6102)
   return 31 ^ static_cast<int>(r);
 }
 #  define FMT_BUILTIN_CLZ(n) detail::clz(n)
 
 inline auto clzll(uint64_t x) -> int {
-  FMT_ASSERT(x != 0, "");
-  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
   unsigned long r = 0;
 #  ifdef _WIN64
   _BitScanReverse64(&r, x);
@@ -212,10 +238,55 @@ inline auto clzll(uint64_t x) -> int {
   // Scan the low 32 bits.
   _BitScanReverse(&r, static_cast<uint32_t>(x));
 #  endif
+  FMT_ASSERT(x != 0, "");
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
   return 63 ^ static_cast<int>(r);
 }
 #  define FMT_BUILTIN_CLZLL(n) detail::clzll(n)
-#endif  // FMT_MSC_VERSION && !defined(FMT_BUILTIN_CLZLL)
+
+inline auto ctz(uint32_t x) -> int {
+  unsigned long r = 0;
+  _BitScanForward(&r, x);
+  FMT_ASSERT(x != 0, "");
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
+  return static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CTZ(n) detail::ctz(n)
+
+inline auto ctzll(uint64_t x) -> int {
+  unsigned long r = 0;
+  FMT_ASSERT(x != 0, "");
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
+#  ifdef _WIN64
+  _BitScanForward64(&r, x);
+#  else
+  // Scan the low 32 bits.
+  if (_BitScanForward(&r, static_cast<uint32_t>(x))) return static_cast<int>(r);
+  // Scan the high 32 bits.
+  _BitScanForward(&r, static_cast<uint32_t>(x >> 32));
+  r += 32;
+#  endif
+  return static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CTZLL(n) detail::ctzll(n)
+}  // namespace detail
+FMT_END_NAMESPACE
+#endif
+
+namespace std {
+template <> struct iterator_traits<fmt::appender> {
+  using value_type = void;
+  using iterator_category = std::output_iterator_tag;
+};
+template <typename Container>
+struct iterator_traits<fmt::back_insert_iterator<Container>> {
+  using value_type = void;
+  using iterator_category = std::output_iterator_tag;
+};
+}  // namespace std
+
+FMT_BEGIN_NAMESPACE
+namespace detail {
 
 FMT_CONSTEXPR inline void abort_fuzzing_if(bool condition) {
   ignore_unused(condition);
@@ -226,19 +297,23 @@ FMT_CONSTEXPR inline void abort_fuzzing_if(bool condition) {
 
 #if defined(FMT_USE_STRING_VIEW)
 template <typename Char> using std_string_view = std::basic_string_view<Char>;
+#elif defined(FMT_USE_EXPERIMENTAL_STRING_VIEW)
+template <typename Char>
+using std_string_view = std::experimental::basic_string_view<Char>;
 #else
 template <typename T> struct std_string_view {};
 #endif
 
-template <typename Char, Char... C> struct string_literal {
-  static constexpr Char value[sizeof...(C)] = {C...};
-  constexpr operator basic_string_view<Char>() const {
+template <typename CharT, CharT... C> struct string_literal {
+  static constexpr CharT value[sizeof...(C)] = {C...};
+  constexpr operator basic_string_view<CharT>() const {
     return {value, sizeof...(C)};
   }
 };
+
 #if FMT_CPLUSPLUS < 201703L
-template <typename Char, Char... C>
-constexpr Char string_literal<Char, C...>::value[sizeof...(C)];
+template <typename CharT, CharT... C>
+constexpr CharT string_literal<CharT, C...>::value[sizeof...(C)];
 #endif
 
 // Implementation of std::bit_cast for pre-C++20.
@@ -310,14 +385,13 @@ class uint128_fallback {
       -> uint128_fallback {
     return {~n.hi_, ~n.lo_};
   }
-  friend FMT_CONSTEXPR auto operator+(const uint128_fallback& lhs,
-                                      const uint128_fallback& rhs)
-      -> uint128_fallback {
+  friend auto operator+(const uint128_fallback& lhs,
+                        const uint128_fallback& rhs) -> uint128_fallback {
     auto result = uint128_fallback(lhs);
     result += rhs;
     return result;
   }
-  friend FMT_CONSTEXPR auto operator*(const uint128_fallback& lhs, uint32_t rhs)
+  friend auto operator*(const uint128_fallback& lhs, uint32_t rhs)
       -> uint128_fallback {
     FMT_ASSERT(lhs.hi_ == 0, "");
     uint64_t hi = (lhs.lo_ >> 32) * rhs;
@@ -325,7 +399,7 @@ class uint128_fallback {
     uint64_t new_lo = (hi << 32) + lo;
     return {(hi >> 32) + (new_lo < lo ? 1 : 0), new_lo};
   }
-  friend constexpr auto operator-(const uint128_fallback& lhs, uint64_t rhs)
+  friend auto operator-(const uint128_fallback& lhs, uint64_t rhs)
       -> uint128_fallback {
     return {lhs.hi_ - (lhs.lo_ < rhs ? 1 : 0), lhs.lo_ - rhs};
   }
@@ -398,24 +472,23 @@ template <typename T> constexpr auto num_bits() -> int {
 }
 // std::numeric_limits<T>::digits may return 0 for 128-bit ints.
 template <> constexpr auto num_bits<int128_opt>() -> int { return 128; }
-template <> constexpr auto num_bits<uint128_opt>() -> int { return 128; }
-template <> constexpr auto num_bits<uint128_fallback>() -> int { return 128; }
+template <> constexpr auto num_bits<uint128_t>() -> int { return 128; }
 
 // A heterogeneous bit_cast used for converting 96-bit long double to uint128_t
 // and 128-bit pointers to uint128_fallback.
 template <typename To, typename From, FMT_ENABLE_IF(sizeof(To) > sizeof(From))>
 inline auto bit_cast(const From& from) -> To {
-  constexpr auto size = static_cast<int>(sizeof(From) / sizeof(unsigned short));
+  constexpr auto size = static_cast<int>(sizeof(From) / sizeof(unsigned));
   struct data_t {
-    unsigned short value[static_cast<unsigned>(size)];
+    unsigned value[static_cast<unsigned>(size)];
   } data = bit_cast<data_t>(from);
   auto result = To();
   if (const_check(is_big_endian())) {
     for (int i = 0; i < size; ++i)
-      result = (result << num_bits<unsigned short>()) | data.value[i];
+      result = (result << num_bits<unsigned>()) | data.value[i];
   } else {
     for (int i = size - 1; i >= 0; --i)
-      result = (result << num_bits<unsigned short>()) | data.value[i];
+      result = (result << num_bits<unsigned>()) | data.value[i];
   }
   return result;
 }
@@ -451,25 +524,84 @@ FMT_INLINE void assume(bool condition) {
 #endif
 }
 
+// Extracts a reference to the container from back_insert_iterator.
+template <typename Container>
+inline auto get_container(std::back_insert_iterator<Container> it)
+    -> Container& {
+  using base = std::back_insert_iterator<Container>;
+  struct accessor : base {
+    accessor(base b) : base(b) {}
+    using base::container;
+  };
+  return *accessor(it).container;
+}
+
+template <typename Char, typename InputIt, typename OutputIt>
+FMT_CONSTEXPR auto copy_str(InputIt begin, InputIt end, OutputIt out)
+    -> OutputIt {
+  while (begin != end) *out++ = static_cast<Char>(*begin++);
+  return out;
+}
+
+template <typename Char, typename T, typename U,
+          FMT_ENABLE_IF(
+              std::is_same<remove_const_t<T>, U>::value&& is_char<U>::value)>
+FMT_CONSTEXPR auto copy_str(T* begin, T* end, U* out) -> U* {
+  if (is_constant_evaluated()) return copy_str<Char, T*, U*>(begin, end, out);
+  auto size = to_unsigned(end - begin);
+  if (size > 0) memcpy(out, begin, size * sizeof(U));
+  return out + size;
+}
+
+template <typename Char, typename InputIt>
+auto copy_str(InputIt begin, InputIt end, appender out) -> appender {
+  get_container(out).append(begin, end);
+  return out;
+}
+template <typename Char, typename InputIt>
+auto copy_str(InputIt begin, InputIt end, back_insert_iterator<std::string> out)
+    -> back_insert_iterator<std::string> {
+  get_container(out).append(begin, end);
+  return out;
+}
+
+template <typename Char, typename R, typename OutputIt>
+FMT_CONSTEXPR auto copy_str(R&& rng, OutputIt out) -> OutputIt {
+  return detail::copy_str<Char>(rng.begin(), rng.end(), out);
+}
+
+// An approximation of iterator_t for pre-C++20 systems.
+template <typename T>
+using iterator_t = decltype(std::begin(std::declval<T&>()));
+template <typename T> using sentinel_t = decltype(std::end(std::declval<T&>()));
+
+// A workaround for std::string not having mutable data() until C++17.
+template <typename Char>
+inline auto get_data(std::basic_string<Char>& s) -> Char* {
+  return &s[0];
+}
+template <typename Container>
+inline auto get_data(Container& c) -> typename Container::value_type* {
+  return c.data();
+}
+
 // Attempts to reserve space for n extra characters in the output range.
 // Returns a pointer to the reserved range or a reference to it.
-template <typename OutputIt,
-          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value&&
-                            is_contiguous<typename OutputIt::container>::value)>
+template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
 #if FMT_CLANG_VERSION >= 307 && !FMT_ICC_VERSION
 __attribute__((no_sanitize("undefined")))
 #endif
-FMT_CONSTEXPR20 inline auto
-reserve(OutputIt it, size_t n) -> typename OutputIt::value_type* {
-  auto& c = get_container(it);
+inline auto
+reserve(std::back_insert_iterator<Container> it, size_t n) ->
+    typename Container::value_type* {
+  Container& c = get_container(it);
   size_t size = c.size();
   c.resize(size + n);
-  return &c[size];
+  return get_data(c) + size;
 }
 
 template <typename T>
-FMT_CONSTEXPR20 inline auto reserve(basic_appender<T> it, size_t n)
-    -> basic_appender<T> {
+inline auto reserve(buffer_appender<T> it, size_t n) -> buffer_appender<T> {
   buffer<T>& buf = get_container(it);
   buf.try_reserve(buf.size() + n);
   return it;
@@ -488,22 +620,18 @@ template <typename T, typename OutputIt>
 constexpr auto to_pointer(OutputIt, size_t) -> T* {
   return nullptr;
 }
-template <typename T>
-FMT_CONSTEXPR20 auto to_pointer(basic_appender<T> it, size_t n) -> T* {
+template <typename T> auto to_pointer(buffer_appender<T> it, size_t n) -> T* {
   buffer<T>& buf = get_container(it);
-  buf.try_reserve(buf.size() + n);
   auto size = buf.size();
   if (buf.capacity() < size + n) return nullptr;
   buf.try_resize(size + n);
   return buf.data() + size;
 }
 
-template <typename OutputIt,
-          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value&&
-                            is_contiguous<typename OutputIt::container>::value)>
-inline auto base_iterator(OutputIt it,
-                          typename OutputIt::container_type::value_type*)
-    -> OutputIt {
+template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
+inline auto base_iterator(std::back_insert_iterator<Container> it,
+                          typename Container::value_type*)
+    -> std::back_insert_iterator<Container> {
   return it;
 }
 
@@ -522,15 +650,23 @@ FMT_CONSTEXPR auto fill_n(OutputIt out, Size count, const T& value)
 }
 template <typename T, typename Size>
 FMT_CONSTEXPR20 auto fill_n(T* out, Size count, char value) -> T* {
-  if (is_constant_evaluated()) return fill_n<T*, Size, T>(out, count, value);
+  if (is_constant_evaluated()) {
+    return fill_n<T*, Size, T>(out, count, value);
+  }
   std::memset(out, value, to_unsigned(count));
   return out + count;
 }
 
+#ifdef __cpp_char8_t
+using char8_type = char8_t;
+#else
+enum char8_type : unsigned char {};
+#endif
+
 template <typename OutChar, typename InputIt, typename OutputIt>
-FMT_CONSTEXPR FMT_NOINLINE auto copy_noinline(InputIt begin, InputIt end,
-                                              OutputIt out) -> OutputIt {
-  return copy<OutChar>(begin, end, out);
+FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
+                                                  OutputIt out) -> OutputIt {
+  return copy_str<OutChar>(begin, end, out);
 }
 
 // A public domain branchless UTF-8 decoder by Christopher Wellons:
@@ -601,7 +737,6 @@ FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
                     string_view(ptr, error ? 1 : to_unsigned(end - buf_ptr)));
     return result ? (error ? buf_ptr + 1 : end) : nullptr;
   };
-
   auto p = s.data();
   const size_t block_size = 4;  // utf8_decode always reads blocks of 4 chars.
   if (s.size() >= block_size) {
@@ -610,20 +745,17 @@ FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
       if (!p) return;
     }
   }
-  auto num_chars_left = to_unsigned(s.data() + s.size() - p);
-  if (num_chars_left == 0) return;
-
-  // Suppress bogus -Wstringop-overflow.
-  if (FMT_GCC_VERSION) num_chars_left &= 3;
-  char buf[2 * block_size - 1] = {};
-  copy<char>(p, p + num_chars_left, buf);
-  const char* buf_ptr = buf;
-  do {
-    auto end = decode(buf_ptr, p);
-    if (!end) return;
-    p += end - buf_ptr;
-    buf_ptr = end;
-  } while (buf_ptr < buf + num_chars_left);
+  if (auto num_chars_left = s.data() + s.size() - p) {
+    char buf[2 * block_size - 1] = {};
+    copy_str<char>(p, p + num_chars_left, buf);
+    const char* buf_ptr = buf;
+    do {
+      auto end = decode(buf_ptr, p);
+      if (!end) return;
+      p += end - buf_ptr;
+      buf_ptr = end;
+    } while (buf_ptr - buf < num_chars_left);
+  }
 }
 
 template <typename Char>
@@ -638,7 +770,7 @@ FMT_CONSTEXPR inline auto compute_width(string_view s) -> size_t {
   struct count_code_points {
     size_t* count;
     FMT_CONSTEXPR auto operator()(uint32_t cp, string_view) const -> bool {
-      *count += to_unsigned(
+      *count += detail::to_unsigned(
           1 +
           (cp >= 0x1100 &&
            (cp <= 0x115f ||  // Hangul Jamo init. consonants
@@ -666,9 +798,15 @@ FMT_CONSTEXPR inline auto compute_width(string_view s) -> size_t {
   return num_code_points;
 }
 
+inline auto compute_width(basic_string_view<char8_type> s) -> size_t {
+  return compute_width(
+      string_view(reinterpret_cast<const char*>(s.data()), s.size()));
+}
+
 template <typename Char>
 inline auto code_point_index(basic_string_view<Char> s, size_t n) -> size_t {
-  return min_of(n, s.size());
+  size_t size = s.size();
+  return n < size ? n : size;
 }
 
 // Calculates the index of the nth code point in a UTF-8 string.
@@ -686,6 +824,12 @@ inline auto code_point_index(string_view s, size_t n) -> size_t {
   return result;
 }
 
+inline auto code_point_index(basic_string_view<char8_type> s, size_t n)
+    -> size_t {
+  return code_point_index(
+      string_view(reinterpret_cast<const char*>(s.data()), s.size()), n);
+}
+
 template <typename T> struct is_integral : std::is_integral<T> {};
 template <> struct is_integral<int128_opt> : std::true_type {};
 template <> struct is_integral<uint128_t> : std::true_type {};
@@ -701,22 +845,38 @@ using is_integer =
                   !std::is_same<T, char>::value &&
                   !std::is_same<T, wchar_t>::value>;
 
-#if defined(FMT_USE_FLOAT128)
-// Use the provided definition.
-#elif FMT_CLANG_VERSION && FMT_HAS_INCLUDE(<quadmath.h>)
-#  define FMT_USE_FLOAT128 1
-#elif FMT_GCC_VERSION && defined(_GLIBCXX_USE_FLOAT128) && \
-    !defined(__STRICT_ANSI__)
-#  define FMT_USE_FLOAT128 1
-#else
-#  define FMT_USE_FLOAT128 0
+#ifndef FMT_USE_FLOAT
+#  define FMT_USE_FLOAT 1
 #endif
+#ifndef FMT_USE_DOUBLE
+#  define FMT_USE_DOUBLE 1
+#endif
+#ifndef FMT_USE_LONG_DOUBLE
+#  define FMT_USE_LONG_DOUBLE 1
+#endif
+
+#ifndef FMT_USE_FLOAT128
+#  ifdef __clang__
+// Clang emulates GCC, so it has to appear early.
+#    if FMT_HAS_INCLUDE(<quadmath.h>)
+#      define FMT_USE_FLOAT128 1
+#    endif
+#  elif defined(__GNUC__)
+// GNU C++:
+#    if defined(_GLIBCXX_USE_FLOAT128) && !defined(__STRICT_ANSI__)
+#      define FMT_USE_FLOAT128 1
+#    endif
+#  endif
+#  ifndef FMT_USE_FLOAT128
+#    define FMT_USE_FLOAT128 0
+#  endif
+#endif
+
 #if FMT_USE_FLOAT128
 using float128 = __float128;
 #else
-struct float128 {};
+using float128 = void;
 #endif
-
 template <typename T> using is_float128 = std::is_same<T, float128>;
 
 template <typename T>
@@ -735,21 +895,24 @@ using is_double_double = bool_constant<std::numeric_limits<T>::digits == 106>;
 #  define FMT_USE_FULL_CACHE_DRAGONBOX 0
 #endif
 
-// An allocator that uses malloc/free to allow removing dependency on the C++
-// standard libary runtime.
-template <typename T> struct allocator {
-  using value_type = T;
-
-  T* allocate(size_t n) {
-    FMT_ASSERT(n <= max_value<size_t>() / sizeof(T), "");
-    T* p = static_cast<T*>(malloc(n * sizeof(T)));
-    if (!p) FMT_THROW(std::bad_alloc());
-    return p;
+template <typename T>
+template <typename U>
+void buffer<T>::append(const U* begin, const U* end) {
+  while (begin != end) {
+    auto count = to_unsigned(end - begin);
+    try_reserve(size_ + count);
+    auto free_cap = capacity_ - size_;
+    if (free_cap < count) count = free_cap;
+    std::uninitialized_copy_n(begin, count, ptr_ + size_);
+    size_ += count;
+    begin += count;
   }
+}
 
-  void deallocate(T* p, size_t) { free(p); }
-};
-
+template <typename T, typename Enable = void>
+struct is_locale : std::false_type {};
+template <typename T>
+struct is_locale<T, void_t<decltype(T::classic())>> : std::true_type {};
 }  // namespace detail
 
 FMT_BEGIN_EXPORT
@@ -759,21 +922,29 @@ FMT_BEGIN_EXPORT
 enum { inline_buffer_size = 500 };
 
 /**
- * A dynamically growing memory buffer for trivially copyable/constructible
- * types with the first `SIZE` elements stored in the object itself. Most
- * commonly used via the `memory_buffer` alias for `char`.
- *
- * **Example**:
- *
- *     auto out = fmt::memory_buffer();
- *     fmt::format_to(std::back_inserter(out), "The answer is {}.", 42);
- *
- * This will append "The answer is 42." to `out`. The buffer content can be
- * converted to `std::string` with `to_string(out)`.
+  \rst
+  A dynamically growing memory buffer for trivially copyable/constructible types
+  with the first ``SIZE`` elements stored in the object itself.
+
+  You can use the ``memory_buffer`` type alias for ``char`` instead.
+
+  **Example**::
+
+     auto out = fmt::memory_buffer();
+     fmt::format_to(std::back_inserter(out), "The answer is {}.", 42);
+
+  This will append the following output to the ``out`` object:
+
+  .. code-block:: none
+
+     The answer is 42.
+
+  The output can be converted to an ``std::string`` with ``to_string(out)``.
+  \endrst
  */
 template <typename T, size_t SIZE = inline_buffer_size,
-          typename Allocator = detail::allocator<T>>
-class basic_memory_buffer : public detail::buffer<T> {
+          typename Allocator = std::allocator<T>>
+class basic_memory_buffer final : public detail::buffer<T> {
  private:
   T store_[SIZE];
 
@@ -786,6 +957,7 @@ class basic_memory_buffer : public detail::buffer<T> {
     if (data != store_) alloc_.deallocate(data, this->capacity());
   }
 
+ protected:
   static FMT_CONSTEXPR20 void grow(detail::buffer<T>& buf, size_t size) {
     detail::abort_fuzzing_if(size > 5000);
     auto& self = static_cast<basic_memory_buffer&>(buf);
@@ -796,13 +968,14 @@ class basic_memory_buffer : public detail::buffer<T> {
     if (size > new_capacity)
       new_capacity = size;
     else if (new_capacity > max_size)
-      new_capacity = max_of(size, max_size);
+      new_capacity = size > max_size ? size : max_size;
     T* old_data = buf.data();
-    T* new_data = self.alloc_.allocate(new_capacity);
+    T* new_data =
+        std::allocator_traits<Allocator>::allocate(self.alloc_, new_capacity);
     // Suppress a bogus -Wstringop-overflow in gcc 13.1 (#3481).
     detail::assume(buf.size() <= new_capacity);
     // The following code doesn't throw, so the raw pointer above doesn't leak.
-    memcpy(new_data, old_data, buf.size() * sizeof(T));
+    std::uninitialized_copy_n(old_data, buf.size(), new_data);
     self.set(new_data, new_capacity);
     // deallocate must not throw according to the standard, but even if it does,
     // the buffer already uses the new storage and will deallocate it in
@@ -814,7 +987,7 @@ class basic_memory_buffer : public detail::buffer<T> {
   using value_type = T;
   using const_reference = const T&;
 
-  FMT_CONSTEXPR explicit basic_memory_buffer(
+  FMT_CONSTEXPR20 explicit basic_memory_buffer(
       const Allocator& alloc = Allocator())
       : detail::buffer<T>(grow), alloc_(alloc) {
     this->set(store_, SIZE);
@@ -830,7 +1003,7 @@ class basic_memory_buffer : public detail::buffer<T> {
     size_t size = other.size(), capacity = other.capacity();
     if (data == other.store_) {
       this->set(store_, capacity);
-      detail::copy<T>(other.store_, other.store_ + size, store_);
+      detail::copy_str<T>(other.store_, other.store_ + size, store_);
     } else {
       this->set(data, capacity);
       // Set pointer to the inline array so that delete is not called
@@ -842,14 +1015,22 @@ class basic_memory_buffer : public detail::buffer<T> {
   }
 
  public:
-  /// Constructs a `basic_memory_buffer` object moving the content of the other
-  /// object to it.
+  /**
+    \rst
+    Constructs a :class:`fmt::basic_memory_buffer` object moving the content
+    of the other object to it.
+    \endrst
+   */
   FMT_CONSTEXPR20 basic_memory_buffer(basic_memory_buffer&& other) noexcept
       : detail::buffer<T>(grow) {
     move(other);
   }
 
-  /// Moves the content of the other `basic_memory_buffer` object to this one.
+  /**
+    \rst
+    Moves the content of the other ``basic_memory_buffer`` object to this one.
+    \endrst
+   */
   auto operator=(basic_memory_buffer&& other) noexcept -> basic_memory_buffer& {
     FMT_ASSERT(this != &other, "");
     deallocate();
@@ -860,108 +1041,119 @@ class basic_memory_buffer : public detail::buffer<T> {
   // Returns a copy of the allocator associated with this buffer.
   auto get_allocator() const -> Allocator { return alloc_; }
 
-  /// Resizes the buffer to contain `count` elements. If T is a POD type new
-  /// elements may not be initialized.
-  FMT_CONSTEXPR void resize(size_t count) { this->try_resize(count); }
+  /**
+    Resizes the buffer to contain *count* elements. If T is a POD type new
+    elements may not be initialized.
+   */
+  FMT_CONSTEXPR20 void resize(size_t count) { this->try_resize(count); }
 
-  /// Increases the buffer capacity to `new_capacity`.
+  /** Increases the buffer capacity to *new_capacity*. */
   void reserve(size_t new_capacity) { this->try_reserve(new_capacity); }
 
   using detail::buffer<T>::append;
   template <typename ContiguousRange>
-  FMT_CONSTEXPR20 void append(const ContiguousRange& range) {
+  void append(const ContiguousRange& range) {
     append(range.data(), range.data() + range.size());
   }
 };
 
 using memory_buffer = basic_memory_buffer<char>;
 
-template <size_t SIZE>
-FMT_NODISCARD auto to_string(const basic_memory_buffer<char, SIZE>& buf)
-    -> std::string {
-  auto size = buf.size();
-  detail::assume(size < std::string().max_size());
-  return {buf.data(), size};
-}
-
-// A writer to a buffered stream. It doesn't own the underlying stream.
-class writer {
- private:
-  detail::buffer<char>* buf_;
-
-  // We cannot create a file buffer in advance because any write to a FILE may
-  // invalidate it.
-  FILE* file_;
-
- public:
-  inline writer(FILE* f) : buf_(nullptr), file_(f) {}
-  inline writer(detail::buffer<char>& buf) : buf_(&buf) {}
-
-  /// Formats `args` according to specifications in `fmt` and writes the
-  /// output to the file.
-  template <typename... T> void print(format_string<T...> fmt, T&&... args) {
-    if (buf_)
-      fmt::format_to(appender(*buf_), fmt, std::forward<T>(args)...);
-    else
-      fmt::print(file_, fmt, std::forward<T>(args)...);
-  }
-};
-
-class string_buffer {
- private:
-  std::string str_;
-  detail::container_buffer<std::string> buf_;
-
- public:
-  inline string_buffer() : buf_(str_) {}
-
-  inline operator writer() { return buf_; }
-  inline std::string& str() { return str_; }
-};
-
 template <typename T, size_t SIZE, typename Allocator>
 struct is_contiguous<basic_memory_buffer<T, SIZE, Allocator>> : std::true_type {
 };
 
-// Suppress a misleading warning in older versions of clang.
-FMT_PRAGMA_CLANG(diagnostic ignored "-Wweak-vtables")
+FMT_END_EXPORT
+namespace detail {
+FMT_API auto write_console(int fd, string_view text) -> bool;
+FMT_API void print(std::FILE*, string_view);
+}  // namespace detail
 
-/// An error reported from a formatting function.
+FMT_BEGIN_EXPORT
+
+// Suppress a misleading warning in older versions of clang.
+#if FMT_CLANG_VERSION
+#  pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+
+/** An error reported from a formatting function. */
 class FMT_SO_VISIBILITY("default") format_error : public std::runtime_error {
  public:
   using std::runtime_error::runtime_error;
 };
 
-class loc_value;
-
-FMT_END_EXPORT
-namespace detail {
-FMT_API auto write_console(int fd, string_view text) -> bool;
-FMT_API void print(FILE*, string_view);
-}  // namespace detail
-
-namespace detail {
+namespace detail_exported {
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
 template <typename Char, size_t N> struct fixed_string {
-  FMT_CONSTEXPR20 fixed_string(const Char (&s)[N]) {
-    detail::copy<Char, const Char*, Char*>(static_cast<const Char*>(s), s + N,
-                                           data);
+  constexpr fixed_string(const Char (&str)[N]) {
+    detail::copy_str<Char, const Char*, Char*>(static_cast<const Char*>(str),
+                                               str + N, data);
   }
   Char data[N] = {};
 };
+#endif
 
 // Converts a compile-time string to basic_string_view.
-FMT_EXPORT template <typename Char, size_t N>
+template <typename Char, size_t N>
 constexpr auto compile_string_to_view(const Char (&s)[N])
     -> basic_string_view<Char> {
   // Remove trailing NUL character if needed. Won't be present if this is used
   // with a raw character array (i.e. not defined as a string).
   return {s, N - (std::char_traits<Char>::to_int_type(s[N - 1]) == 0 ? 1 : 0)};
 }
-FMT_EXPORT template <typename Char>
-constexpr auto compile_string_to_view(basic_string_view<Char> s)
+template <typename Char>
+constexpr auto compile_string_to_view(detail::std_string_view<Char> s)
     -> basic_string_view<Char> {
-  return s;
+  return {s.data(), s.size()};
 }
+}  // namespace detail_exported
+
+class loc_value {
+ private:
+  basic_format_arg<format_context> value_;
+
+ public:
+  template <typename T, FMT_ENABLE_IF(!detail::is_float128<T>::value)>
+  loc_value(T value) : value_(detail::make_arg<format_context>(value)) {}
+
+  template <typename T, FMT_ENABLE_IF(detail::is_float128<T>::value)>
+  loc_value(T) {}
+
+  template <typename Visitor> auto visit(Visitor&& vis) -> decltype(vis(0)) {
+    return value_.visit(vis);
+  }
+};
+
+// A locale facet that formats values in UTF-8.
+// It is parameterized on the locale to avoid the heavy <locale> include.
+template <typename Locale> class format_facet : public Locale::facet {
+ private:
+  std::string separator_;
+  std::string grouping_;
+  std::string decimal_point_;
+
+ protected:
+  virtual auto do_put(appender out, loc_value val,
+                      const format_specs<>& specs) const -> bool;
+
+ public:
+  static FMT_API typename Locale::id id;
+
+  explicit format_facet(Locale& loc);
+  explicit format_facet(string_view sep = "",
+                        std::initializer_list<unsigned char> g = {3},
+                        std::string decimal_point = ".")
+      : separator_(sep.data(), sep.size()),
+        grouping_(g.begin(), g.end()),
+        decimal_point_(decimal_point) {}
+
+  auto put(appender out, loc_value val, const format_specs<>& specs) const
+      -> bool {
+    return do_put(out, val, specs);
+  }
+};
+
+namespace detail {
 
 // Returns true if value is negative, false otherwise.
 // Same as `value < 0` but doesn't produce warnings if T is an unsigned type.
@@ -974,6 +1166,14 @@ constexpr auto is_negative(T) -> bool {
   return false;
 }
 
+template <typename T>
+FMT_CONSTEXPR auto is_supported_floating_point(T) -> bool {
+  if (std::is_same<T, float>()) return FMT_USE_FLOAT;
+  if (std::is_same<T, double>()) return FMT_USE_DOUBLE;
+  if (std::is_same<T, long double>()) return FMT_USE_LONG_DOUBLE;
+  return true;
+}
+
 // Smallest of uint32_t, uint64_t, uint128_t that is large enough to
 // represent all values of an integral type T.
 template <typename T>
@@ -990,22 +1190,21 @@ using uint64_or_128_t = conditional_t<num_bits<T>() <= 64, uint64_t, uint128_t>;
       (factor) * 100000000, (factor) * 1000000000
 
 // Converts value in the range [0, 100) to a string.
-// GCC generates slightly better code when value is pointer-size.
-inline auto digits2(size_t value) -> const char* {
-  // Align data since unaligned access may be slower when crossing a
-  // hardware-specific boundary.
-  alignas(2) static const char data[] =
-      "0001020304050607080910111213141516171819"
-      "2021222324252627282930313233343536373839"
-      "4041424344454647484950515253545556575859"
-      "6061626364656667686970717273747576777879"
-      "8081828384858687888990919293949596979899";
-  return &data[value * 2];
+constexpr auto digits2(size_t value) -> const char* {
+  // GCC generates slightly better code when value is pointer-size.
+  return &"0001020304050607080910111213141516171819"
+         "2021222324252627282930313233343536373839"
+         "4041424344454647484950515253545556575859"
+         "6061626364656667686970717273747576777879"
+         "8081828384858687888990919293949596979899"[value * 2];
 }
 
-template <typename Char> constexpr auto getsign(sign s) -> Char {
-  return static_cast<char>(((' ' << 24) | ('+' << 16) | ('-' << 8)) >>
-                           (static_cast<int>(s) * 8));
+// Sign is a template parameter to workaround a bug in gcc 4.8.
+template <typename Char, typename Sign> constexpr auto sign(Sign s) -> Char {
+#if !FMT_GCC_VERSION || FMT_GCC_VERSION >= 604
+  static_assert(std::is_same<Sign, sign_t>::value, "");
+#endif
+  return static_cast<Char>("\0-+ "[s]);
 }
 
 template <typename T> FMT_CONSTEXPR auto count_digits_fallback(T n) -> int {
@@ -1053,7 +1252,9 @@ inline auto do_count_digits(uint64_t n) -> int {
 // except for n == 0 in which case count_digits returns 1.
 FMT_CONSTEXPR20 inline auto count_digits(uint64_t n) -> int {
 #ifdef FMT_BUILTIN_CLZLL
-  if (!is_constant_evaluated() && !FMT_OPTIMIZE_SIZE) return do_count_digits(n);
+  if (!is_constant_evaluated()) {
+    return do_count_digits(n);
+  }
 #endif
   return count_digits_fallback(n);
 }
@@ -1103,7 +1304,9 @@ FMT_INLINE auto do_count_digits(uint32_t n) -> int {
 // Optional version of count_digits for better performance on 32-bit platforms.
 FMT_CONSTEXPR20 inline auto count_digits(uint32_t n) -> int {
 #ifdef FMT_BUILTIN_CLZ
-  if (!is_constant_evaluated() && !FMT_OPTIMIZE_SIZE) return do_count_digits(n);
+  if (!is_constant_evaluated()) {
+    return do_count_digits(n);
+  }
 #endif
   return count_digits_fallback(n);
 }
@@ -1140,17 +1343,6 @@ template <> inline auto decimal_point(locale_ref loc) -> wchar_t {
   return decimal_point_impl<wchar_t>(loc);
 }
 
-#ifndef FMT_HEADER_ONLY
-FMT_BEGIN_EXPORT
-extern template FMT_API auto thousands_sep_impl<char>(locale_ref)
-    -> thousands_sep_result<char>;
-extern template FMT_API auto thousands_sep_impl<wchar_t>(locale_ref)
-    -> thousands_sep_result<wchar_t>;
-extern template FMT_API auto decimal_point_impl(locale_ref) -> char;
-extern template FMT_API auto decimal_point_impl(locale_ref) -> wchar_t;
-FMT_END_EXPORT
-#endif  // FMT_HEADER_ONLY
-
 // Compares two characters for equality.
 template <typename Char> auto equal2(const Char* lhs, const char* rhs) -> bool {
   return lhs[0] == Char(rhs[0]) && lhs[1] == Char(rhs[1]);
@@ -1159,99 +1351,83 @@ inline auto equal2(const char* lhs, const char* rhs) -> bool {
   return memcmp(lhs, rhs, 2) == 0;
 }
 
-// Writes a two-digit value to out.
+// Copies two characters from src to dst.
 template <typename Char>
-FMT_CONSTEXPR20 FMT_INLINE void write2digits(Char* out, size_t value) {
-  if (!is_constant_evaluated() && std::is_same<Char, char>::value &&
-      !FMT_OPTIMIZE_SIZE) {
-    memcpy(out, digits2(value), 2);
+FMT_CONSTEXPR20 FMT_INLINE void copy2(Char* dst, const char* src) {
+  if (!is_constant_evaluated() && sizeof(Char) == sizeof(char)) {
+    memcpy(dst, src, 2);
     return;
   }
-  *out++ = static_cast<Char>('0' + value / 10);
-  *out = static_cast<Char>('0' + value % 10);
+  *dst++ = static_cast<Char>(*src++);
+  *dst = static_cast<Char>(*src);
 }
 
-// Formats a decimal unsigned integer value writing to out pointing to a buffer
-// of specified size. The caller must ensure that the buffer is large enough.
+template <typename Iterator> struct format_decimal_result {
+  Iterator begin;
+  Iterator end;
+};
+
+// Formats a decimal unsigned integer value writing into out pointing to a
+// buffer of specified size. The caller must ensure that the buffer is large
+// enough.
 template <typename Char, typename UInt>
-FMT_CONSTEXPR20 auto do_format_decimal(Char* out, UInt value, int size)
-    -> Char* {
+FMT_CONSTEXPR20 auto format_decimal(Char* out, UInt value, int size)
+    -> format_decimal_result<Char*> {
   FMT_ASSERT(size >= count_digits(value), "invalid digit count");
-  unsigned n = to_unsigned(size);
+  out += size;
+  Char* end = out;
   while (value >= 100) {
     // Integer division is slow so do it for a group of two digits instead
     // of for every digit. The idea comes from the talk by Alexandrescu
     // "Three Optimization Tips for C++". See speed-test for a comparison.
-    n -= 2;
-    write2digits(out + n, static_cast<unsigned>(value % 100));
+    out -= 2;
+    copy2(out, digits2(static_cast<size_t>(value % 100)));
     value /= 100;
   }
-  if (value >= 10) {
-    n -= 2;
-    write2digits(out + n, static_cast<unsigned>(value));
-  } else {
-    out[--n] = static_cast<Char>('0' + value);
+  if (value < 10) {
+    *--out = static_cast<Char>('0' + value);
+    return {out, end};
   }
-  return out + n;
+  out -= 2;
+  copy2(out, digits2(static_cast<size_t>(value)));
+  return {out, end};
 }
 
-template <typename Char, typename UInt>
-FMT_CONSTEXPR FMT_INLINE auto format_decimal(Char* out, UInt value,
-                                             int num_digits) -> Char* {
-  do_format_decimal(out, value, num_digits);
-  return out + num_digits;
-}
-
-template <typename Char, typename UInt, typename OutputIt,
-          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value)>
-FMT_CONSTEXPR auto format_decimal(OutputIt out, UInt value, int num_digits)
-    -> OutputIt {
-  if (auto ptr = to_pointer<Char>(out, to_unsigned(num_digits))) {
-    do_format_decimal(ptr, value, num_digits);
-    return out;
-  }
+template <typename Char, typename UInt, typename Iterator,
+          FMT_ENABLE_IF(!std::is_pointer<remove_cvref_t<Iterator>>::value)>
+FMT_CONSTEXPR inline auto format_decimal(Iterator out, UInt value, int size)
+    -> format_decimal_result<Iterator> {
   // Buffer is large enough to hold all digits (digits10 + 1).
-  char buffer[digits10<UInt>() + 1];
-  if (is_constant_evaluated()) fill_n(buffer, sizeof(buffer), '\0');
-  do_format_decimal(buffer, value, num_digits);
-  return copy_noinline<Char>(buffer, buffer + num_digits, out);
+  Char buffer[digits10<UInt>() + 1] = {};
+  auto end = format_decimal(buffer, value, size).end;
+  return {out, detail::copy_str_noinline<Char>(buffer, end, out)};
 }
 
-template <typename Char, typename UInt>
-FMT_CONSTEXPR auto do_format_base2e(int base_bits, Char* out, UInt value,
-                                    int size, bool upper = false) -> Char* {
-  out += size;
+template <unsigned BASE_BITS, typename Char, typename UInt>
+FMT_CONSTEXPR auto format_uint(Char* buffer, UInt value, int num_digits,
+                               bool upper = false) -> Char* {
+  buffer += num_digits;
+  Char* end = buffer;
   do {
     const char* digits = upper ? "0123456789ABCDEF" : "0123456789abcdef";
-    unsigned digit = static_cast<unsigned>(value & ((1 << base_bits) - 1));
-    *--out = static_cast<Char>(base_bits < 4 ? static_cast<char>('0' + digit)
-                                             : digits[digit]);
-  } while ((value >>= base_bits) != 0);
-  return out;
+    unsigned digit = static_cast<unsigned>(value & ((1 << BASE_BITS) - 1));
+    *--buffer = static_cast<Char>(BASE_BITS < 4 ? static_cast<char>('0' + digit)
+                                                : digits[digit]);
+  } while ((value >>= BASE_BITS) != 0);
+  return end;
 }
 
-// Formats an unsigned integer in the power of two base (binary, octal, hex).
-template <typename Char, typename UInt>
-FMT_CONSTEXPR auto format_base2e(int base_bits, Char* out, UInt value,
-                                 int num_digits, bool upper = false) -> Char* {
-  do_format_base2e(base_bits, out, value, num_digits, upper);
-  return out + num_digits;
-}
-
-template <typename Char, typename OutputIt, typename UInt,
-          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value)>
-FMT_CONSTEXPR inline auto format_base2e(int base_bits, OutputIt out, UInt value,
-                                        int num_digits, bool upper = false)
-    -> OutputIt {
+template <unsigned BASE_BITS, typename Char, typename It, typename UInt>
+FMT_CONSTEXPR inline auto format_uint(It out, UInt value, int num_digits,
+                                      bool upper = false) -> It {
   if (auto ptr = to_pointer<Char>(out, to_unsigned(num_digits))) {
-    format_base2e(base_bits, ptr, value, num_digits, upper);
+    format_uint<BASE_BITS>(ptr, value, num_digits, upper);
     return out;
   }
-  // Make buffer large enough for any base.
-  char buffer[num_bits<UInt>()];
-  if (is_constant_evaluated()) fill_n(buffer, sizeof(buffer), '\0');
-  format_base2e(base_bits, buffer, value, num_digits, upper);
-  return detail::copy_noinline<Char>(buffer, buffer + num_digits, out);
+  // Buffer should be large enough to hold all digits (digits / BASE_BITS + 1).
+  char buffer[num_bits<UInt>() / BASE_BITS + 1] = {};
+  format_uint<BASE_BITS>(buffer, value, num_digits, upper);
+  return detail::copy_str_noinline<Char>(buffer, buffer + num_digits, out);
 }
 
 // A converter from UTF-8 to UTF-16.
@@ -1261,12 +1437,10 @@ class utf8_to_utf16 {
 
  public:
   FMT_API explicit utf8_to_utf16(string_view s);
-  inline operator basic_string_view<wchar_t>() const {
-    return {&buffer_[0], size()};
-  }
-  inline auto size() const -> size_t { return buffer_.size() - 1; }
-  inline auto c_str() const -> const wchar_t* { return &buffer_[0]; }
-  inline auto str() const -> std::wstring { return {&buffer_[0], size()}; }
+  operator basic_string_view<wchar_t>() const { return {&buffer_[0], size()}; }
+  auto size() const -> size_t { return buffer_.size() - 1; }
+  auto c_str() const -> const wchar_t* { return &buffer_[0]; }
+  auto str() const -> std::wstring { return {&buffer_[0], size()}; }
 };
 
 enum class to_utf8_error_policy { abort, replace };
@@ -1313,12 +1487,10 @@ template <typename WChar, typename Buffer = memory_buffer> class to_utf8 {
           if (policy == to_utf8_error_policy::abort) return false;
           buf.append(string_view("\xEF\xBF\xBD"));
           --p;
-          continue;
         } else {
           c = (c << 10) + static_cast<uint32_t>(*p) - 0x35fdc00;
         }
-      }
-      if (c < 0x80) {
+      } else if (c < 0x80) {
         buf.push_back(static_cast<char>(c));
       } else if (c < 0x800) {
         buf.push_back(static_cast<char>(0xc0 | (c >> 6)));
@@ -1486,30 +1658,25 @@ template <typename Float> constexpr auto exponent_bias() -> int {
 }
 
 // Writes the exponent exp in the form "[+-]d{2,3}" to buffer.
-template <typename Char, typename OutputIt>
-FMT_CONSTEXPR auto write_exponent(int exp, OutputIt out) -> OutputIt {
+template <typename Char, typename It>
+FMT_CONSTEXPR auto write_exponent(int exp, It it) -> It {
   FMT_ASSERT(-10000 < exp && exp < 10000, "exponent out of range");
   if (exp < 0) {
-    *out++ = static_cast<Char>('-');
+    *it++ = static_cast<Char>('-');
     exp = -exp;
   } else {
-    *out++ = static_cast<Char>('+');
+    *it++ = static_cast<Char>('+');
   }
-  auto uexp = static_cast<uint32_t>(exp);
-  if (is_constant_evaluated()) {
-    if (uexp < 10) *out++ = '0';
-    return format_decimal<Char>(out, uexp, count_digits(uexp));
+  if (exp >= 100) {
+    const char* top = digits2(to_unsigned(exp / 100));
+    if (exp >= 1000) *it++ = static_cast<Char>(top[0]);
+    *it++ = static_cast<Char>(top[1]);
+    exp %= 100;
   }
-  if (uexp >= 100u) {
-    const char* top = digits2(uexp / 100);
-    if (uexp >= 1000u) *out++ = static_cast<Char>(top[0]);
-    *out++ = static_cast<Char>(top[1]);
-    uexp %= 100;
-  }
-  const char* d = digits2(uexp);
-  *out++ = static_cast<Char>(d[0]);
-  *out++ = static_cast<Char>(d[1]);
-  return out;
+  const char* d = digits2(to_unsigned(exp));
+  *it++ = static_cast<Char>(d[0]);
+  *it++ = static_cast<Char>(d[1]);
+  return it;
 }
 
 // A floating-point number f * pow(2, e) where F is an unsigned type.
@@ -1610,69 +1777,67 @@ constexpr auto convert_float(T value) -> convert_float_result<T> {
   return static_cast<convert_float_result<T>>(value);
 }
 
-template <typename Char, typename OutputIt>
+template <typename OutputIt, typename Char>
 FMT_NOINLINE FMT_CONSTEXPR auto fill(OutputIt it, size_t n,
-                                     const basic_specs& specs) -> OutputIt {
-  auto fill_size = specs.fill_size();
-  if (fill_size == 1) return detail::fill_n(it, n, specs.fill_unit<Char>());
-  if (const Char* data = specs.fill<Char>()) {
-    for (size_t i = 0; i < n; ++i) it = copy<Char>(data, data + fill_size, it);
-  }
+                                     const fill_t<Char>& fill) -> OutputIt {
+  auto fill_size = fill.size();
+  if (fill_size == 1) return detail::fill_n(it, n, fill[0]);
+  auto data = fill.data();
+  for (size_t i = 0; i < n; ++i)
+    it = copy_str<Char>(data, data + fill_size, it);
   return it;
 }
 
 // Writes the output of f, padded according to format specifications in specs.
 // size: output size in code units.
 // width: output display width in (terminal) column positions.
-template <typename Char, align default_align = align::left, typename OutputIt,
+template <align::type align = align::left, typename OutputIt, typename Char,
           typename F>
-FMT_CONSTEXPR auto write_padded(OutputIt out, const format_specs& specs,
+FMT_CONSTEXPR auto write_padded(OutputIt out, const format_specs<Char>& specs,
                                 size_t size, size_t width, F&& f) -> OutputIt {
-  static_assert(default_align == align::left || default_align == align::right,
-                "");
+  static_assert(align == align::left || align == align::right, "");
   unsigned spec_width = to_unsigned(specs.width);
   size_t padding = spec_width > width ? spec_width - width : 0;
   // Shifts are encoded as string literals because static constexpr is not
   // supported in constexpr functions.
-  auto* shifts =
-      default_align == align::left ? "\x1f\x1f\x00\x01" : "\x00\x1f\x00\x01";
-  size_t left_padding = padding >> shifts[static_cast<int>(specs.align())];
+  auto* shifts = align == align::left ? "\x1f\x1f\x00\x01" : "\x00\x1f\x00\x01";
+  size_t left_padding = padding >> shifts[specs.align];
   size_t right_padding = padding - left_padding;
-  auto it = reserve(out, size + padding * specs.fill_size());
-  if (left_padding != 0) it = fill<Char>(it, left_padding, specs);
+  auto it = reserve(out, size + padding * specs.fill.size());
+  if (left_padding != 0) it = fill(it, left_padding, specs.fill);
   it = f(it);
-  if (right_padding != 0) it = fill<Char>(it, right_padding, specs);
+  if (right_padding != 0) it = fill(it, right_padding, specs.fill);
   return base_iterator(out, it);
 }
 
-template <typename Char, align default_align = align::left, typename OutputIt,
+template <align::type align = align::left, typename OutputIt, typename Char,
           typename F>
-constexpr auto write_padded(OutputIt out, const format_specs& specs,
+constexpr auto write_padded(OutputIt out, const format_specs<Char>& specs,
                             size_t size, F&& f) -> OutputIt {
-  return write_padded<Char, default_align>(out, specs, size, size, f);
+  return write_padded<align>(out, specs, size, size, f);
 }
 
-template <typename Char, align default_align = align::left, typename OutputIt>
+template <align::type align = align::left, typename Char, typename OutputIt>
 FMT_CONSTEXPR auto write_bytes(OutputIt out, string_view bytes,
-                               const format_specs& specs = {}) -> OutputIt {
-  return write_padded<Char, default_align>(
+                               const format_specs<Char>& specs) -> OutputIt {
+  return write_padded<align>(
       out, specs, bytes.size(), [bytes](reserve_iterator<OutputIt> it) {
         const char* data = bytes.data();
-        return copy<Char>(data, data + bytes.size(), it);
+        return copy_str<Char>(data, data + bytes.size(), it);
       });
 }
 
 template <typename Char, typename OutputIt, typename UIntPtr>
-auto write_ptr(OutputIt out, UIntPtr value, const format_specs* specs)
+auto write_ptr(OutputIt out, UIntPtr value, const format_specs<Char>* specs)
     -> OutputIt {
   int num_digits = count_digits<4>(value);
   auto size = to_unsigned(num_digits) + size_t(2);
   auto write = [=](reserve_iterator<OutputIt> it) {
     *it++ = static_cast<Char>('0');
     *it++ = static_cast<Char>('x');
-    return format_base2e<Char>(4, it, value, num_digits);
+    return format_uint<4, Char>(it, value, num_digits);
   };
-  return specs ? write_padded<Char, align::right>(out, *specs, size, write)
+  return specs ? write_padded<align::right>(out, *specs, size, write)
                : base_iterator(out, write(reserve(out, size)));
 }
 
@@ -1680,9 +1845,8 @@ auto write_ptr(OutputIt out, UIntPtr value, const format_specs* specs)
 FMT_API auto is_printable(uint32_t cp) -> bool;
 
 inline auto needs_escape(uint32_t cp) -> bool {
-  if (cp < 0x20 || cp == 0x7f || cp == '"' || cp == '\\') return true;
-  if (const_check(FMT_OPTIMIZE_SIZE > 1)) return false;
-  return !is_printable(cp);
+  return cp < 0x20 || cp == 0x7f || cp == '"' || cp == '\\' ||
+         !is_printable(cp);
 }
 
 template <typename Char> struct find_escape_result {
@@ -1691,11 +1855,17 @@ template <typename Char> struct find_escape_result {
   uint32_t cp;
 };
 
+template <typename Char>
+using make_unsigned_char =
+    typename conditional_t<std::is_integral<Char>::value,
+                           std::make_unsigned<Char>,
+                           type_identity<uint32_t>>::type;
+
 template <typename Char>
 auto find_escape(const Char* begin, const Char* end)
     -> find_escape_result<Char> {
   for (; begin != end; ++begin) {
-    uint32_t cp = static_cast<unsigned_char<Char>>(*begin);
+    uint32_t cp = static_cast<make_unsigned_char<Char>>(*begin);
     if (const_check(sizeof(Char) == 1) && cp >= 0x80) continue;
     if (needs_escape(cp)) return {begin, begin + 1, cp};
   }
@@ -1704,7 +1874,7 @@ auto find_escape(const Char* begin, const Char* end)
 
 inline auto find_escape(const char* begin, const char* end)
     -> find_escape_result<char> {
-  if (const_check(!use_utf8)) return find_escape<char>(begin, end);
+  if (!is_utf8()) return find_escape<char>(begin, end);
   auto result = find_escape_result<char>{end, nullptr, 0};
   for_each_codepoint(string_view(begin, to_unsigned(end - begin)),
                      [&](uint32_t cp, string_view sv) {
@@ -1717,14 +1887,40 @@ inline auto find_escape(const char* begin, const char* end)
   return result;
 }
 
+#define FMT_STRING_IMPL(s, base, explicit)                                    \
+  [] {                                                                        \
+    /* Use the hidden visibility as a workaround for a GCC bug (#1973). */    \
+    /* Use a macro-like name to avoid shadowing warnings. */                  \
+    struct FMT_VISIBILITY("hidden") FMT_COMPILE_STRING : base {               \
+      using char_type FMT_MAYBE_UNUSED = fmt::remove_cvref_t<decltype(s[0])>; \
+      FMT_MAYBE_UNUSED FMT_CONSTEXPR explicit                                 \
+      operator fmt::basic_string_view<char_type>() const {                    \
+        return fmt::detail_exported::compile_string_to_view<char_type>(s);    \
+      }                                                                       \
+    };                                                                        \
+    return FMT_COMPILE_STRING();                                              \
+  }()
+
+/**
+  \rst
+  Constructs a compile-time format string from a string literal *s*.
+
+  **Example**::
+
+    // A compile-time error because 'd' is an invalid specifier for strings.
+    std::string s = fmt::format(FMT_STRING("{:d}"), "foo");
+  \endrst
+ */
+#define FMT_STRING(s) FMT_STRING_IMPL(s, fmt::detail::compile_string, )
+
 template <size_t width, typename Char, typename OutputIt>
 auto write_codepoint(OutputIt out, char prefix, uint32_t cp) -> OutputIt {
   *out++ = static_cast<Char>('\\');
   *out++ = static_cast<Char>(prefix);
   Char buf[width];
   fill_n(buf, width, static_cast<Char>('0'));
-  format_base2e(4, buf, cp, width);
-  return copy<Char>(buf, buf + width, out);
+  format_uint<4>(buf, cp, width);
+  return copy_str<Char>(buf, buf + width, out);
 }
 
 template <typename OutputIt, typename Char>
@@ -1744,9 +1940,13 @@ auto write_escaped_cp(OutputIt out, const find_escape_result<Char>& escape)
     *out++ = static_cast<Char>('\\');
     c = static_cast<Char>('t');
     break;
-  case '"':  FMT_FALLTHROUGH;
-  case '\'': FMT_FALLTHROUGH;
-  case '\\': *out++ = static_cast<Char>('\\'); break;
+  case '"':
+    FMT_FALLTHROUGH;
+  case '\'':
+    FMT_FALLTHROUGH;
+  case '\\':
+    *out++ = static_cast<Char>('\\');
+    break;
   default:
     if (escape.cp < 0x100) return write_codepoint<2, Char>(out, 'x', escape.cp);
     if (escape.cp < 0x10000)
@@ -1771,7 +1971,7 @@ auto write_escaped_string(OutputIt out, basic_string_view<Char> str)
   auto begin = str.begin(), end = str.end();
   do {
     auto escape = find_escape(begin, end);
-    out = copy<Char>(begin, escape.begin, out);
+    out = copy_str<Char>(begin, escape.begin, out);
     begin = escape.end;
     if (!begin) break;
     out = write_escaped_cp<OutputIt, Char>(out, escape);
@@ -1798,23 +1998,74 @@ auto write_escaped_char(OutputIt out, Char v) -> OutputIt {
 
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR auto write_char(OutputIt out, Char value,
-                              const format_specs& specs) -> OutputIt {
-  bool is_debug = specs.type() == presentation_type::debug;
-  return write_padded<Char>(out, specs, 1, [=](reserve_iterator<OutputIt> it) {
+                              const format_specs<Char>& specs) -> OutputIt {
+  bool is_debug = specs.type == presentation_type::debug;
+  return write_padded(out, specs, 1, [=](reserve_iterator<OutputIt> it) {
     if (is_debug) return write_escaped_char(it, value);
     *it++ = value;
     return it;
   });
 }
 template <typename Char, typename OutputIt>
-FMT_CONSTEXPR auto write(OutputIt out, Char value, const format_specs& specs,
-                         locale_ref loc = {}) -> OutputIt {
+FMT_CONSTEXPR auto write(OutputIt out, Char value,
+                         const format_specs<Char>& specs, locale_ref loc = {})
+    -> OutputIt {
   // char is formatted as unsigned char for consistency across platforms.
   using unsigned_type =
       conditional_t<std::is_same<Char, char>::value, unsigned char, unsigned>;
   return check_char_specs(specs)
-             ? write_char<Char>(out, value, specs)
-             : write<Char>(out, static_cast<unsigned_type>(value), specs, loc);
+             ? write_char(out, value, specs)
+             : write(out, static_cast<unsigned_type>(value), specs, loc);
+}
+
+// Data for write_int that doesn't depend on output iterator type. It is used to
+// avoid template code bloat.
+template <typename Char> struct write_int_data {
+  size_t size;
+  size_t padding;
+
+  FMT_CONSTEXPR write_int_data(int num_digits, unsigned prefix,
+                               const format_specs<Char>& specs)
+      : size((prefix >> 24) + to_unsigned(num_digits)), padding(0) {
+    if (specs.align == align::numeric) {
+      auto width = to_unsigned(specs.width);
+      if (width > size) {
+        padding = width - size;
+        size = width;
+      }
+    } else if (specs.precision > num_digits) {
+      size = (prefix >> 24) + to_unsigned(specs.precision);
+      padding = to_unsigned(specs.precision - num_digits);
+    }
+  }
+};
+
+// Writes an integer in the format
+//   <left-padding><prefix><numeric-padding><digits><right-padding>
+// where <digits> are written by write_digits(it).
+// prefix contains chars in three lower bytes and the size in the fourth byte.
+template <typename OutputIt, typename Char, typename W>
+FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, int num_digits,
+                                        unsigned prefix,
+                                        const format_specs<Char>& specs,
+                                        W write_digits) -> OutputIt {
+  // Slightly faster check for specs.width == 0 && specs.precision == -1.
+  if ((specs.width | (specs.precision + 1)) == 0) {
+    auto it = reserve(out, to_unsigned(num_digits) + (prefix >> 24));
+    if (prefix != 0) {
+      for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
+        *it++ = static_cast<Char>(p & 0xff);
+    }
+    return base_iterator(out, write_digits(it));
+  }
+  auto data = write_int_data<Char>(num_digits, prefix, specs);
+  return write_padded<align::right>(
+      out, specs, data.size, [=](reserve_iterator<OutputIt> it) {
+        for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
+          *it++ = static_cast<Char>(p & 0xff);
+        it = detail::fill_n(it, data.padding, static_cast<Char>('0'));
+        return write_digits(it);
+      });
 }
 
 template <typename Char> class digit_grouping {
@@ -1839,9 +2090,7 @@ template <typename Char> class digit_grouping {
   }
 
  public:
-  template <typename Locale,
-            FMT_ENABLE_IF(std::is_same<Locale, locale_ref>::value)>
-  explicit digit_grouping(Locale loc, bool localized = true) {
+  explicit digit_grouping(locale_ref loc, bool localized = true) {
     if (!localized) return;
     auto sep = thousands_sep<Char>(loc);
     grouping_ = sep.grouping;
@@ -1873,8 +2122,9 @@ template <typename Char> class digit_grouping {
     for (int i = 0, sep_index = static_cast<int>(separators.size() - 1);
          i < num_digits; ++i) {
       if (num_digits - i == separators[sep_index]) {
-        out = copy<Char>(thousands_sep_.data(),
-                         thousands_sep_.data() + thousands_sep_.size(), out);
+        out =
+            copy_str<Char>(thousands_sep_.data(),
+                           thousands_sep_.data() + thousands_sep_.size(), out);
         --sep_index;
       }
       *out++ = static_cast<Char>(digits[to_unsigned(i)]);
@@ -1891,45 +2141,54 @@ FMT_CONSTEXPR inline void prefix_append(unsigned& prefix, unsigned value) {
 // Writes a decimal integer with digit grouping.
 template <typename OutputIt, typename UInt, typename Char>
 auto write_int(OutputIt out, UInt value, unsigned prefix,
-               const format_specs& specs, const digit_grouping<Char>& grouping)
-    -> OutputIt {
+               const format_specs<Char>& specs,
+               const digit_grouping<Char>& grouping) -> OutputIt {
   static_assert(std::is_same<uint64_or_128_t<UInt>, UInt>::value, "");
   int num_digits = 0;
   auto buffer = memory_buffer();
-  switch (specs.type()) {
-  default: FMT_ASSERT(false, ""); FMT_FALLTHROUGH;
+  switch (specs.type) {
   case presentation_type::none:
-  case presentation_type::dec:
+  case presentation_type::dec: {
     num_digits = count_digits(value);
     format_decimal<char>(appender(buffer), value, num_digits);
     break;
-  case presentation_type::hex:
-    if (specs.alt())
-      prefix_append(prefix, unsigned(specs.upper() ? 'X' : 'x') << 8 | '0');
+  }
+  case presentation_type::hex_lower:
+  case presentation_type::hex_upper: {
+    bool upper = specs.type == presentation_type::hex_upper;
+    if (specs.alt)
+      prefix_append(prefix, unsigned(upper ? 'X' : 'x') << 8 | '0');
     num_digits = count_digits<4>(value);
-    format_base2e<char>(4, appender(buffer), value, num_digits, specs.upper());
+    format_uint<4, char>(appender(buffer), value, num_digits, upper);
     break;
-  case presentation_type::oct:
+  }
+  case presentation_type::bin_lower:
+  case presentation_type::bin_upper: {
+    bool upper = specs.type == presentation_type::bin_upper;
+    if (specs.alt)
+      prefix_append(prefix, unsigned(upper ? 'B' : 'b') << 8 | '0');
+    num_digits = count_digits<1>(value);
+    format_uint<1, char>(appender(buffer), value, num_digits);
+    break;
+  }
+  case presentation_type::oct: {
     num_digits = count_digits<3>(value);
     // Octal prefix '0' is counted as a digit, so only add it if precision
     // is not greater than the number of digits.
-    if (specs.alt() && specs.precision <= num_digits && value != 0)
+    if (specs.alt && specs.precision <= num_digits && value != 0)
       prefix_append(prefix, '0');
-    format_base2e<char>(3, appender(buffer), value, num_digits);
-    break;
-  case presentation_type::bin:
-    if (specs.alt())
-      prefix_append(prefix, unsigned(specs.upper() ? 'B' : 'b') << 8 | '0');
-    num_digits = count_digits<1>(value);
-    format_base2e<char>(1, appender(buffer), value, num_digits);
+    format_uint<3, char>(appender(buffer), value, num_digits);
     break;
+  }
   case presentation_type::chr:
-    return write_char<Char>(out, static_cast<Char>(value), specs);
+    return write_char(out, static_cast<Char>(value), specs);
+  default:
+    throw_format_error("invalid format specifier");
   }
 
   unsigned size = (prefix != 0 ? prefix >> 24 : 0) + to_unsigned(num_digits) +
                   to_unsigned(grouping.count_separators(num_digits));
-  return write_padded<Char, align::right>(
+  return write_padded<align::right>(
       out, specs, size, size, [&](reserve_iterator<OutputIt> it) {
         for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
           *it++ = static_cast<Char>(p & 0xff);
@@ -1937,13 +2196,11 @@ auto write_int(OutputIt out, UInt value, unsigned prefix,
       });
 }
 
-#if FMT_USE_LOCALE
 // Writes a localized value.
-FMT_API auto write_loc(appender out, loc_value value, const format_specs& specs,
-                       locale_ref loc) -> bool;
-#endif
-template <typename OutputIt>
-inline auto write_loc(OutputIt, const loc_value&, const format_specs&,
+FMT_API auto write_loc(appender out, loc_value value,
+                       const format_specs<>& specs, locale_ref loc) -> bool;
+template <typename OutputIt, typename Char>
+inline auto write_loc(OutputIt, loc_value, const format_specs<Char>&,
                       locale_ref) -> bool {
   return false;
 }
@@ -1954,7 +2211,7 @@ template <typename UInt> struct write_int_arg {
 };
 
 template <typename T>
-FMT_CONSTEXPR auto make_write_int_arg(T value, sign s)
+FMT_CONSTEXPR auto make_write_int_arg(T value, sign_t sign)
     -> write_int_arg<uint32_or_64_or_128_t<T>> {
   auto prefix = 0u;
   auto abs_value = static_cast<uint32_or_64_or_128_t<T>>(value);
@@ -1964,21 +2221,21 @@ FMT_CONSTEXPR auto make_write_int_arg(T value, sign s)
   } else {
     constexpr const unsigned prefixes[4] = {0, 0, 0x1000000u | '+',
                                             0x1000000u | ' '};
-    prefix = prefixes[static_cast<int>(s)];
+    prefix = prefixes[sign];
   }
   return {abs_value, prefix};
 }
 
 template <typename Char = char> struct loc_writer {
-  basic_appender<Char> out;
-  const format_specs& specs;
+  buffer_appender<Char> out;
+  const format_specs<Char>& specs;
   std::basic_string<Char> sep;
   std::string grouping;
   std::basic_string<Char> decimal_point;
 
   template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
   auto operator()(T value) -> bool {
-    auto arg = make_write_int_arg(value, specs.sign());
+    auto arg = make_write_int_arg(value, specs.sign);
     write_int(out, static_cast<uint64_or_128_t<T>>(arg.abs_value), arg.prefix,
               specs, digit_grouping<Char>(grouping, sep));
     return true;
@@ -1990,162 +2247,167 @@ template <typename Char = char> struct loc_writer {
   }
 };
 
-// Size and padding computation separate from write_int to avoid template bloat.
-struct size_padding {
-  unsigned size;
-  unsigned padding;
-
-  FMT_CONSTEXPR size_padding(int num_digits, unsigned prefix,
-                             const format_specs& specs)
-      : size((prefix >> 24) + to_unsigned(num_digits)), padding(0) {
-    if (specs.align() == align::numeric) {
-      auto width = to_unsigned(specs.width);
-      if (width > size) {
-        padding = width - size;
-        size = width;
-      }
-    } else if (specs.precision > num_digits) {
-      size = (prefix >> 24) + to_unsigned(specs.precision);
-      padding = to_unsigned(specs.precision - num_digits);
-    }
-  }
-};
-
 template <typename Char, typename OutputIt, typename T>
 FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, write_int_arg<T> arg,
-                                        const format_specs& specs) -> OutputIt {
+                                        const format_specs<Char>& specs,
+                                        locale_ref) -> OutputIt {
   static_assert(std::is_same<T, uint32_or_64_or_128_t<T>>::value, "");
-
-  constexpr int buffer_size = num_bits<T>();
-  char buffer[buffer_size];
-  if (is_constant_evaluated()) fill_n(buffer, buffer_size, '\0');
-  const char* begin = nullptr;
-  const char* end = buffer + buffer_size;
-
   auto abs_value = arg.abs_value;
   auto prefix = arg.prefix;
-  switch (specs.type()) {
-  default: FMT_ASSERT(false, ""); FMT_FALLTHROUGH;
+  switch (specs.type) {
   case presentation_type::none:
-  case presentation_type::dec:
-    begin = do_format_decimal(buffer, abs_value, buffer_size);
-    break;
-  case presentation_type::hex:
-    begin = do_format_base2e(4, buffer, abs_value, buffer_size, specs.upper());
-    if (specs.alt())
-      prefix_append(prefix, unsigned(specs.upper() ? 'X' : 'x') << 8 | '0');
-    break;
+  case presentation_type::dec: {
+    auto num_digits = count_digits(abs_value);
+    return write_int(
+        out, num_digits, prefix, specs, [=](reserve_iterator<OutputIt> it) {
+          return format_decimal<Char>(it, abs_value, num_digits).end;
+        });
+  }
+  case presentation_type::hex_lower:
+  case presentation_type::hex_upper: {
+    bool upper = specs.type == presentation_type::hex_upper;
+    if (specs.alt)
+      prefix_append(prefix, unsigned(upper ? 'X' : 'x') << 8 | '0');
+    int num_digits = count_digits<4>(abs_value);
+    return write_int(
+        out, num_digits, prefix, specs, [=](reserve_iterator<OutputIt> it) {
+          return format_uint<4, Char>(it, abs_value, num_digits, upper);
+        });
+  }
+  case presentation_type::bin_lower:
+  case presentation_type::bin_upper: {
+    bool upper = specs.type == presentation_type::bin_upper;
+    if (specs.alt)
+      prefix_append(prefix, unsigned(upper ? 'B' : 'b') << 8 | '0');
+    int num_digits = count_digits<1>(abs_value);
+    return write_int(out, num_digits, prefix, specs,
+                     [=](reserve_iterator<OutputIt> it) {
+                       return format_uint<1, Char>(it, abs_value, num_digits);
+                     });
+  }
   case presentation_type::oct: {
-    begin = do_format_base2e(3, buffer, abs_value, buffer_size);
+    int num_digits = count_digits<3>(abs_value);
     // Octal prefix '0' is counted as a digit, so only add it if precision
     // is not greater than the number of digits.
-    auto num_digits = end - begin;
-    if (specs.alt() && specs.precision <= num_digits && abs_value != 0)
+    if (specs.alt && specs.precision <= num_digits && abs_value != 0)
       prefix_append(prefix, '0');
-    break;
+    return write_int(out, num_digits, prefix, specs,
+                     [=](reserve_iterator<OutputIt> it) {
+                       return format_uint<3, Char>(it, abs_value, num_digits);
+                     });
   }
-  case presentation_type::bin:
-    begin = do_format_base2e(1, buffer, abs_value, buffer_size);
-    if (specs.alt())
-      prefix_append(prefix, unsigned(specs.upper() ? 'B' : 'b') << 8 | '0');
-    break;
   case presentation_type::chr:
-    return write_char<Char>(out, static_cast<Char>(abs_value), specs);
+    return write_char(out, static_cast<Char>(abs_value), specs);
+  default:
+    throw_format_error("invalid format specifier");
   }
-
-  // Write an integer in the format
-  //   <left-padding><prefix><numeric-padding><digits><right-padding>
-  // prefix contains chars in three lower bytes and the size in the fourth byte.
-  int num_digits = static_cast<int>(end - begin);
-  // Slightly faster check for specs.width == 0 && specs.precision == -1.
-  if ((specs.width | (specs.precision + 1)) == 0) {
-    auto it = reserve(out, to_unsigned(num_digits) + (prefix >> 24));
-    for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
-      *it++ = static_cast<Char>(p & 0xff);
-    return base_iterator(out, copy<Char>(begin, end, it));
-  }
-  auto sp = size_padding(num_digits, prefix, specs);
-  unsigned padding = sp.padding;
-  return write_padded<Char, align::right>(
-      out, specs, sp.size, [=](reserve_iterator<OutputIt> it) {
-        for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
-          *it++ = static_cast<Char>(p & 0xff);
-        it = detail::fill_n(it, padding, static_cast<Char>('0'));
-        return copy<Char>(begin, end, it);
-      });
+  return out;
 }
-
 template <typename Char, typename OutputIt, typename T>
-FMT_CONSTEXPR FMT_NOINLINE auto write_int_noinline(OutputIt out,
-                                                   write_int_arg<T> arg,
-                                                   const format_specs& specs)
-    -> OutputIt {
-  return write_int<Char>(out, arg, specs);
+FMT_CONSTEXPR FMT_NOINLINE auto write_int_noinline(
+    OutputIt out, write_int_arg<T> arg, const format_specs<Char>& specs,
+    locale_ref loc) -> OutputIt {
+  return write_int(out, arg, specs, loc);
 }
-
-template <typename Char, typename T,
+template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_integral<T>::value &&
                         !std::is_same<T, bool>::value &&
-                        !std::is_same<T, Char>::value)>
-FMT_CONSTEXPR FMT_INLINE auto write(basic_appender<Char> out, T value,
-                                    const format_specs& specs, locale_ref loc)
-    -> basic_appender<Char> {
-  if (specs.localized() && write_loc(out, value, specs, loc)) return out;
-  return write_int_noinline<Char>(out, make_write_int_arg(value, specs.sign()),
-                                  specs);
+                        std::is_same<OutputIt, buffer_appender<Char>>::value)>
+FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value,
+                                    const format_specs<Char>& specs,
+                                    locale_ref loc) -> OutputIt {
+  if (specs.localized && write_loc(out, value, specs, loc)) return out;
+  return write_int_noinline(out, make_write_int_arg(value, specs.sign), specs,
+                            loc);
 }
-
 // An inlined version of write used in format string compilation.
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_integral<T>::value &&
                         !std::is_same<T, bool>::value &&
-                        !std::is_same<T, Char>::value &&
-                        !std::is_same<OutputIt, basic_appender<Char>>::value)>
+                        !std::is_same<OutputIt, buffer_appender<Char>>::value)>
 FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value,
-                                    const format_specs& specs, locale_ref loc)
-    -> OutputIt {
-  if (specs.localized() && write_loc(out, value, specs, loc)) return out;
-  return write_int<Char>(out, make_write_int_arg(value, specs.sign()), specs);
+                                    const format_specs<Char>& specs,
+                                    locale_ref loc) -> OutputIt {
+  if (specs.localized && write_loc(out, value, specs, loc)) return out;
+  return write_int(out, make_write_int_arg(value, specs.sign), specs, loc);
 }
 
+// An output iterator that counts the number of objects written to it and
+// discards them.
+class counting_iterator {
+ private:
+  size_t count_;
+
+ public:
+  using iterator_category = std::output_iterator_tag;
+  using difference_type = std::ptrdiff_t;
+  using pointer = void;
+  using reference = void;
+  FMT_UNCHECKED_ITERATOR(counting_iterator);
+
+  struct value_type {
+    template <typename T> FMT_CONSTEXPR void operator=(const T&) {}
+  };
+
+  FMT_CONSTEXPR counting_iterator() : count_(0) {}
+
+  FMT_CONSTEXPR auto count() const -> size_t { return count_; }
+
+  FMT_CONSTEXPR auto operator++() -> counting_iterator& {
+    ++count_;
+    return *this;
+  }
+  FMT_CONSTEXPR auto operator++(int) -> counting_iterator {
+    auto it = *this;
+    ++*this;
+    return it;
+  }
+
+  FMT_CONSTEXPR friend auto operator+(counting_iterator it, difference_type n)
+      -> counting_iterator {
+    it.count_ += static_cast<size_t>(n);
+    return it;
+  }
+
+  FMT_CONSTEXPR auto operator*() const -> value_type { return {}; }
+};
+
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR auto write(OutputIt out, basic_string_view<Char> s,
-                         const format_specs& specs) -> OutputIt {
+                         const format_specs<Char>& specs) -> OutputIt {
   auto data = s.data();
   auto size = s.size();
   if (specs.precision >= 0 && to_unsigned(specs.precision) < size)
     size = code_point_index(s, to_unsigned(specs.precision));
-
-  bool is_debug = specs.type() == presentation_type::debug;
-  if (is_debug) {
-    auto buf = counting_buffer<Char>();
-    write_escaped_string(basic_appender<Char>(buf), s);
-    size = buf.count();
-  }
-
+  bool is_debug = specs.type == presentation_type::debug;
   size_t width = 0;
   if (specs.width != 0) {
-    width =
-        is_debug ? size : compute_width(basic_string_view<Char>(data, size));
+    if (is_debug)
+      width = write_escaped_string(counting_iterator{}, s).count();
+    else
+      width = compute_width(basic_string_view<Char>(data, size));
   }
-  return write_padded<Char>(
-      out, specs, size, width, [=](reserve_iterator<OutputIt> it) {
-        return is_debug ? write_escaped_string(it, s)
-                        : copy<Char>(data, data + size, it);
-      });
+  return write_padded(out, specs, size, width,
+                      [=](reserve_iterator<OutputIt> it) {
+                        if (is_debug) return write_escaped_string(it, s);
+                        return copy_str<Char>(data, data + size, it);
+                      });
 }
 template <typename Char, typename OutputIt>
-FMT_CONSTEXPR auto write(OutputIt out, basic_string_view<Char> s,
-                         const format_specs& specs, locale_ref) -> OutputIt {
-  return write<Char>(out, s, specs);
+FMT_CONSTEXPR auto write(OutputIt out,
+                         basic_string_view<type_identity_t<Char>> s,
+                         const format_specs<Char>& specs, locale_ref)
+    -> OutputIt {
+  return write(out, s, specs);
 }
 template <typename Char, typename OutputIt>
-FMT_CONSTEXPR auto write(OutputIt out, const Char* s, const format_specs& specs,
-                         locale_ref) -> OutputIt {
-  if (specs.type() == presentation_type::pointer)
+FMT_CONSTEXPR auto write(OutputIt out, const Char* s,
+                         const format_specs<Char>& specs, locale_ref)
+    -> OutputIt {
+  if (specs.type == presentation_type::pointer)
     return write_ptr<Char>(out, bit_cast<uintptr_t>(s), &specs);
-  if (!s) report_error("string pointer is null");
-  return write<Char>(out, basic_string_view<Char>(s), specs, {});
+  if (!s) throw_format_error("string pointer is null");
+  return write(out, basic_string_view<Char>(s), specs, {});
 }
 
 template <typename Char, typename OutputIt, typename T,
@@ -2159,37 +2421,46 @@ FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
   if (negative) abs_value = ~abs_value + 1;
   int num_digits = count_digits(abs_value);
   auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
-  if (auto ptr = to_pointer<Char>(out, size)) {
+  auto it = reserve(out, size);
+  if (auto ptr = to_pointer<Char>(it, size)) {
     if (negative) *ptr++ = static_cast<Char>('-');
     format_decimal<Char>(ptr, abs_value, num_digits);
     return out;
   }
-  if (negative) *out++ = static_cast<Char>('-');
-  return format_decimal<Char>(out, abs_value, num_digits);
+  if (negative) *it++ = static_cast<Char>('-');
+  it = format_decimal<Char>(it, abs_value, num_digits).end;
+  return base_iterator(out, it);
 }
 
+// DEPRECATED!
 template <typename Char>
 FMT_CONSTEXPR auto parse_align(const Char* begin, const Char* end,
-                               format_specs& specs) -> const Char* {
+                               format_specs<Char>& specs) -> const Char* {
   FMT_ASSERT(begin != end, "");
-  auto alignment = align::none;
+  auto align = align::none;
   auto p = begin + code_point_length(begin);
   if (end - p <= 0) p = begin;
   for (;;) {
     switch (to_ascii(*p)) {
-    case '<': alignment = align::left; break;
-    case '>': alignment = align::right; break;
-    case '^': alignment = align::center; break;
+    case '<':
+      align = align::left;
+      break;
+    case '>':
+      align = align::right;
+      break;
+    case '^':
+      align = align::center;
+      break;
     }
-    if (alignment != align::none) {
+    if (align != align::none) {
       if (p != begin) {
         auto c = *begin;
         if (c == '}') return begin;
         if (c == '{') {
-          report_error("invalid fill character '{'");
+          throw_format_error("invalid fill character '{'");
           return begin;
         }
-        specs.set_fill(basic_string_view<Char>(begin, to_unsigned(p - begin)));
+        specs.fill = {begin, to_unsigned(p - begin)};
         begin = p + 1;
       } else {
         ++begin;
@@ -2200,27 +2471,88 @@ FMT_CONSTEXPR auto parse_align(const Char* begin, const Char* end,
     }
     p = begin;
   }
-  specs.set_align(alignment);
+  specs.align = align;
   return begin;
 }
 
+// A floating-point presentation format.
+enum class float_format : unsigned char {
+  general,  // General: exponent notation or fixed point based on magnitude.
+  exp,      // Exponent notation with the default precision of 6, e.g. 1.2e-3.
+  fixed,    // Fixed point with the default precision of 6, e.g. 0.0012.
+  hex
+};
+
+struct float_specs {
+  int precision;
+  float_format format : 8;
+  sign_t sign : 8;
+  bool upper : 1;
+  bool locale : 1;
+  bool binary32 : 1;
+  bool showpoint : 1;
+};
+
+template <typename Char>
+FMT_CONSTEXPR auto parse_float_type_spec(const format_specs<Char>& specs)
+    -> float_specs {
+  auto result = float_specs();
+  result.showpoint = specs.alt;
+  result.locale = specs.localized;
+  switch (specs.type) {
+  case presentation_type::none:
+    result.format = float_format::general;
+    break;
+  case presentation_type::general_upper:
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case presentation_type::general_lower:
+    result.format = float_format::general;
+    break;
+  case presentation_type::exp_upper:
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case presentation_type::exp_lower:
+    result.format = float_format::exp;
+    result.showpoint |= specs.precision != 0;
+    break;
+  case presentation_type::fixed_upper:
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case presentation_type::fixed_lower:
+    result.format = float_format::fixed;
+    result.showpoint |= specs.precision != 0;
+    break;
+  case presentation_type::hexfloat_upper:
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case presentation_type::hexfloat_lower:
+    result.format = float_format::hex;
+    break;
+  default:
+    throw_format_error("invalid format specifier");
+    break;
+  }
+  return result;
+}
+
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR20 auto write_nonfinite(OutputIt out, bool isnan,
-                                     format_specs specs, sign s) -> OutputIt {
+                                     format_specs<Char> specs,
+                                     const float_specs& fspecs) -> OutputIt {
   auto str =
-      isnan ? (specs.upper() ? "NAN" : "nan") : (specs.upper() ? "INF" : "inf");
+      isnan ? (fspecs.upper ? "NAN" : "nan") : (fspecs.upper ? "INF" : "inf");
   constexpr size_t str_size = 3;
-  auto size = str_size + (s != sign::none ? 1 : 0);
+  auto sign = fspecs.sign;
+  auto size = str_size + (sign ? 1 : 0);
   // Replace '0'-padding with space for non-finite values.
   const bool is_zero_fill =
-      specs.fill_size() == 1 && specs.fill_unit<Char>() == '0';
-  if (is_zero_fill) specs.set_fill(' ');
-  return write_padded<Char>(out, specs, size,
-                            [=](reserve_iterator<OutputIt> it) {
-                              if (s != sign::none)
-                                *it++ = detail::getsign<Char>(s);
-                              return copy<Char>(str, str + str_size, it);
-                            });
+      specs.fill.size() == 1 && *specs.fill.data() == static_cast<Char>('0');
+  if (is_zero_fill) specs.fill[0] = static_cast<Char>(' ');
+  return write_padded(out, specs, size, [=](reserve_iterator<OutputIt> it) {
+    if (sign) *it++ = detail::sign<Char>(sign);
+    return copy_str<Char>(str, str + str_size, it);
+  });
 }
 
 // A decimal floating-point number significand * pow(10, exp).
@@ -2241,12 +2573,12 @@ inline auto get_significand_size(const dragonbox::decimal_fp<T>& f) -> int {
 template <typename Char, typename OutputIt>
 constexpr auto write_significand(OutputIt out, const char* significand,
                                  int significand_size) -> OutputIt {
-  return copy<Char>(significand, significand + significand_size, out);
+  return copy_str<Char>(significand, significand + significand_size, out);
 }
 template <typename Char, typename OutputIt, typename UInt>
 inline auto write_significand(OutputIt out, UInt significand,
                               int significand_size) -> OutputIt {
-  return format_decimal<Char>(out, significand, significand_size);
+  return format_decimal<Char>(out, significand, significand_size).end;
 }
 template <typename Char, typename OutputIt, typename T, typename Grouping>
 FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand,
@@ -2266,13 +2598,14 @@ template <typename Char, typename UInt,
           FMT_ENABLE_IF(std::is_integral<UInt>::value)>
 inline auto write_significand(Char* out, UInt significand, int significand_size,
                               int integral_size, Char decimal_point) -> Char* {
-  if (!decimal_point) return format_decimal(out, significand, significand_size);
+  if (!decimal_point)
+    return format_decimal(out, significand, significand_size).end;
   out += significand_size + 1;
   Char* end = out;
   int floating_size = significand_size - integral_size;
   for (int i = floating_size / 2; i > 0; --i) {
     out -= 2;
-    write2digits(out, static_cast<std::size_t>(significand % 100));
+    copy2(out, digits2(static_cast<std::size_t>(significand % 100)));
     significand /= 100;
   }
   if (floating_size % 2 != 0) {
@@ -2293,19 +2626,19 @@ inline auto write_significand(OutputIt out, UInt significand,
   Char buffer[digits10<UInt>() + 2];
   auto end = write_significand(buffer, significand, significand_size,
                                integral_size, decimal_point);
-  return detail::copy_noinline<Char>(buffer, end, out);
+  return detail::copy_str_noinline<Char>(buffer, end, out);
 }
 
 template <typename OutputIt, typename Char>
 FMT_CONSTEXPR auto write_significand(OutputIt out, const char* significand,
                                      int significand_size, int integral_size,
                                      Char decimal_point) -> OutputIt {
-  out = detail::copy_noinline<Char>(significand, significand + integral_size,
-                                    out);
+  out = detail::copy_str_noinline<Char>(significand,
+                                        significand + integral_size, out);
   if (!decimal_point) return out;
   *out++ = decimal_point;
-  return detail::copy_noinline<Char>(significand + integral_size,
-                                     significand + significand_size, out);
+  return detail::copy_str_noinline<Char>(significand + integral_size,
+                                         significand + significand_size, out);
 }
 
 template <typename OutputIt, typename Char, typename T, typename Grouping>
@@ -2318,42 +2651,44 @@ FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand,
                              decimal_point);
   }
   auto buffer = basic_memory_buffer<Char>();
-  write_significand(basic_appender<Char>(buffer), significand, significand_size,
-                    integral_size, decimal_point);
+  write_significand(buffer_appender<Char>(buffer), significand,
+                    significand_size, integral_size, decimal_point);
   grouping.apply(
       out, basic_string_view<Char>(buffer.data(), to_unsigned(integral_size)));
-  return detail::copy_noinline<Char>(buffer.data() + integral_size,
-                                     buffer.end(), out);
+  return detail::copy_str_noinline<Char>(buffer.data() + integral_size,
+                                         buffer.end(), out);
 }
 
-template <typename Char, typename OutputIt, typename DecimalFP,
+template <typename OutputIt, typename DecimalFP, typename Char,
           typename Grouping = digit_grouping<Char>>
 FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
-                                    const format_specs& specs, sign s,
-                                    locale_ref loc) -> OutputIt {
+                                    const format_specs<Char>& specs,
+                                    float_specs fspecs, locale_ref loc)
+    -> OutputIt {
   auto significand = f.significand;
   int significand_size = get_significand_size(f);
   const Char zero = static_cast<Char>('0');
-  size_t size = to_unsigned(significand_size) + (s != sign::none ? 1 : 0);
+  auto sign = fspecs.sign;
+  size_t size = to_unsigned(significand_size) + (sign ? 1 : 0);
   using iterator = reserve_iterator<OutputIt>;
 
-  Char decimal_point = specs.localized() ? detail::decimal_point<Char>(loc)
-                                         : static_cast<Char>('.');
+  Char decimal_point =
+      fspecs.locale ? detail::decimal_point<Char>(loc) : static_cast<Char>('.');
 
   int output_exp = f.exponent + significand_size - 1;
   auto use_exp_format = [=]() {
-    if (specs.type() == presentation_type::exp) return true;
-    if (specs.type() == presentation_type::fixed) return false;
+    if (fspecs.format == float_format::exp) return true;
+    if (fspecs.format != float_format::general) return false;
     // Use the fixed notation if the exponent is in [exp_lower, exp_upper),
     // e.g. 0.0001 instead of 1e-04. Otherwise use the exponent notation.
     const int exp_lower = -4, exp_upper = 16;
     return output_exp < exp_lower ||
-           output_exp >= (specs.precision > 0 ? specs.precision : exp_upper);
+           output_exp >= (fspecs.precision > 0 ? fspecs.precision : exp_upper);
   };
   if (use_exp_format()) {
     int num_zeros = 0;
-    if (specs.alt()) {
-      num_zeros = specs.precision - significand_size;
+    if (fspecs.showpoint) {
+      num_zeros = fspecs.precision - significand_size;
       if (num_zeros < 0) num_zeros = 0;
       size += to_unsigned(num_zeros);
     } else if (significand_size == 1) {
@@ -2364,9 +2699,9 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
     if (abs_output_exp >= 100) exp_digits = abs_output_exp >= 1000 ? 4 : 3;
 
     size += to_unsigned((decimal_point ? 1 : 0) + 2 + exp_digits);
-    char exp_char = specs.upper() ? 'E' : 'e';
+    char exp_char = fspecs.upper ? 'E' : 'e';
     auto write = [=](iterator it) {
-      if (s != sign::none) *it++ = detail::getsign<Char>(s);
+      if (sign) *it++ = detail::sign<Char>(sign);
       // Insert a decimal point after the first digit and add an exponent.
       it = write_significand(it, significand, significand_size, 1,
                              decimal_point);
@@ -2374,41 +2709,39 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
       *it++ = static_cast<Char>(exp_char);
       return write_exponent<Char>(output_exp, it);
     };
-    return specs.width > 0
-               ? write_padded<Char, align::right>(out, specs, size, write)
-               : base_iterator(out, write(reserve(out, size)));
+    return specs.width > 0 ? write_padded<align::right>(out, specs, size, write)
+                           : base_iterator(out, write(reserve(out, size)));
   }
 
   int exp = f.exponent + significand_size;
   if (f.exponent >= 0) {
     // 1234e5 -> 123400000[.0+]
     size += to_unsigned(f.exponent);
-    int num_zeros = specs.precision - exp;
+    int num_zeros = fspecs.precision - exp;
     abort_fuzzing_if(num_zeros > 5000);
-    if (specs.alt()) {
+    if (fspecs.showpoint) {
       ++size;
-      if (num_zeros <= 0 && specs.type() != presentation_type::fixed)
-        num_zeros = 0;
+      if (num_zeros <= 0 && fspecs.format != float_format::fixed) num_zeros = 0;
       if (num_zeros > 0) size += to_unsigned(num_zeros);
     }
-    auto grouping = Grouping(loc, specs.localized());
+    auto grouping = Grouping(loc, fspecs.locale);
     size += to_unsigned(grouping.count_separators(exp));
-    return write_padded<Char, align::right>(out, specs, size, [&](iterator it) {
-      if (s != sign::none) *it++ = detail::getsign<Char>(s);
+    return write_padded<align::right>(out, specs, size, [&](iterator it) {
+      if (sign) *it++ = detail::sign<Char>(sign);
       it = write_significand<Char>(it, significand, significand_size,
                                    f.exponent, grouping);
-      if (!specs.alt()) return it;
+      if (!fspecs.showpoint) return it;
       *it++ = decimal_point;
       return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it;
     });
   } else if (exp > 0) {
     // 1234e-2 -> 12.34[0+]
-    int num_zeros = specs.alt() ? specs.precision - significand_size : 0;
-    size += 1 + static_cast<unsigned>(max_of(num_zeros, 0));
-    auto grouping = Grouping(loc, specs.localized());
+    int num_zeros = fspecs.showpoint ? fspecs.precision - significand_size : 0;
+    size += 1 + to_unsigned(num_zeros > 0 ? num_zeros : 0);
+    auto grouping = Grouping(loc, fspecs.locale);
     size += to_unsigned(grouping.count_separators(exp));
-    return write_padded<Char, align::right>(out, specs, size, [&](iterator it) {
-      if (s != sign::none) *it++ = detail::getsign<Char>(s);
+    return write_padded<align::right>(out, specs, size, [&](iterator it) {
+      if (sign) *it++ = detail::sign<Char>(sign);
       it = write_significand(it, significand, significand_size, exp,
                              decimal_point, grouping);
       return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it;
@@ -2416,14 +2749,14 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
   }
   // 1234e-6 -> 0.001234
   int num_zeros = -exp;
-  if (significand_size == 0 && specs.precision >= 0 &&
-      specs.precision < num_zeros) {
-    num_zeros = specs.precision;
+  if (significand_size == 0 && fspecs.precision >= 0 &&
+      fspecs.precision < num_zeros) {
+    num_zeros = fspecs.precision;
   }
-  bool pointy = num_zeros != 0 || significand_size != 0 || specs.alt();
+  bool pointy = num_zeros != 0 || significand_size != 0 || fspecs.showpoint;
   size += 1 + (pointy ? 1 : 0) + to_unsigned(num_zeros);
-  return write_padded<Char, align::right>(out, specs, size, [&](iterator it) {
-    if (s != sign::none) *it++ = detail::getsign<Char>(s);
+  return write_padded<align::right>(out, specs, size, [&](iterator it) {
+    if (sign) *it++ = detail::sign<Char>(sign);
     *it++ = zero;
     if (!pointy) return it;
     *it++ = decimal_point;
@@ -2446,20 +2779,22 @@ template <typename Char> class fallback_digit_grouping {
   }
 };
 
-template <typename Char, typename OutputIt, typename DecimalFP>
+template <typename OutputIt, typename DecimalFP, typename Char>
 FMT_CONSTEXPR20 auto write_float(OutputIt out, const DecimalFP& f,
-                                 const format_specs& specs, sign s,
-                                 locale_ref loc) -> OutputIt {
+                                 const format_specs<Char>& specs,
+                                 float_specs fspecs, locale_ref loc)
+    -> OutputIt {
   if (is_constant_evaluated()) {
-    return do_write_float<Char, OutputIt, DecimalFP,
-                          fallback_digit_grouping<Char>>(out, f, specs, s, loc);
+    return do_write_float<OutputIt, DecimalFP, Char,
+                          fallback_digit_grouping<Char>>(out, f, specs, fspecs,
+                                                         loc);
   } else {
-    return do_write_float<Char>(out, f, specs, s, loc);
+    return do_write_float(out, f, specs, fspecs, loc);
   }
 }
 
 template <typename T> constexpr auto isnan(T value) -> bool {
-  return value != value;  // std::isnan doesn't support __float128.
+  return !(value >= value);  // std::isnan doesn't support __float128.
 }
 
 template <typename T, typename Enable = void>
@@ -2507,48 +2842,52 @@ inline FMT_CONSTEXPR20 void adjust_precision(int& precision, int exp10) {
 
 class bigint {
  private:
-  // A bigint is a number in the form bigit_[N - 1] ... bigit_[0] * 32^exp_.
-  using bigit = uint32_t;  // A big digit.
+  // A bigint is stored as an array of bigits (big digits), with bigit at index
+  // 0 being the least significant one.
+  using bigit = uint32_t;
   using double_bigit = uint64_t;
-  enum { bigit_bits = num_bits<bigit>() };
   enum { bigits_capacity = 32 };
   basic_memory_buffer<bigit, bigits_capacity> bigits_;
   int exp_;
 
-  friend struct formatter<bigint>;
-
-  FMT_CONSTEXPR auto get_bigit(int i) const -> bigit {
-    return i >= exp_ && i < num_bigits() ? bigits_[i - exp_] : 0;
+  FMT_CONSTEXPR20 auto operator[](int index) const -> bigit {
+    return bigits_[to_unsigned(index)];
+  }
+  FMT_CONSTEXPR20 auto operator[](int index) -> bigit& {
+    return bigits_[to_unsigned(index)];
   }
 
-  FMT_CONSTEXPR void subtract_bigits(int index, bigit other, bigit& borrow) {
-    auto result = double_bigit(bigits_[index]) - other - borrow;
-    bigits_[index] = static_cast<bigit>(result);
+  static constexpr const int bigit_bits = num_bits<bigit>();
+
+  friend struct formatter<bigint>;
+
+  FMT_CONSTEXPR20 void subtract_bigits(int index, bigit other, bigit& borrow) {
+    auto result = static_cast<double_bigit>((*this)[index]) - other - borrow;
+    (*this)[index] = static_cast<bigit>(result);
     borrow = static_cast<bigit>(result >> (bigit_bits * 2 - 1));
   }
 
-  FMT_CONSTEXPR void remove_leading_zeros() {
+  FMT_CONSTEXPR20 void remove_leading_zeros() {
     int num_bigits = static_cast<int>(bigits_.size()) - 1;
-    while (num_bigits > 0 && bigits_[num_bigits] == 0) --num_bigits;
+    while (num_bigits > 0 && (*this)[num_bigits] == 0) --num_bigits;
     bigits_.resize(to_unsigned(num_bigits + 1));
   }
 
   // Computes *this -= other assuming aligned bigints and *this >= other.
-  FMT_CONSTEXPR void subtract_aligned(const bigint& other) {
+  FMT_CONSTEXPR20 void subtract_aligned(const bigint& other) {
     FMT_ASSERT(other.exp_ >= exp_, "unaligned bigints");
     FMT_ASSERT(compare(*this, other) >= 0, "");
     bigit borrow = 0;
     int i = other.exp_ - exp_;
     for (size_t j = 0, n = other.bigits_.size(); j != n; ++i, ++j)
       subtract_bigits(i, other.bigits_[j], borrow);
-    if (borrow != 0) subtract_bigits(i, 0, borrow);
-    FMT_ASSERT(borrow == 0, "");
+    while (borrow > 0) subtract_bigits(i, 0, borrow);
     remove_leading_zeros();
   }
 
-  FMT_CONSTEXPR void multiply(uint32_t value) {
-    bigit carry = 0;
+  FMT_CONSTEXPR20 void multiply(uint32_t value) {
     const double_bigit wide_value = value;
+    bigit carry = 0;
     for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
       double_bigit result = bigits_[i] * wide_value + carry;
       bigits_[i] = static_cast<bigit>(result);
@@ -2559,7 +2898,7 @@ class bigint {
 
   template <typename UInt, FMT_ENABLE_IF(std::is_same<UInt, uint64_t>::value ||
                                          std::is_same<UInt, uint128_t>::value)>
-  FMT_CONSTEXPR void multiply(UInt value) {
+  FMT_CONSTEXPR20 void multiply(UInt value) {
     using half_uint =
         conditional_t<std::is_same<UInt, uint128_t>::value, uint64_t, uint32_t>;
     const int shift = num_bits<half_uint>() - bigit_bits;
@@ -2580,7 +2919,7 @@ class bigint {
 
   template <typename UInt, FMT_ENABLE_IF(std::is_same<UInt, uint64_t>::value ||
                                          std::is_same<UInt, uint128_t>::value)>
-  FMT_CONSTEXPR void assign(UInt n) {
+  FMT_CONSTEXPR20 void assign(UInt n) {
     size_t num_bigits = 0;
     do {
       bigits_[num_bigits++] = static_cast<bigit>(n);
@@ -2591,30 +2930,30 @@ class bigint {
   }
 
  public:
-  FMT_CONSTEXPR bigint() : exp_(0) {}
+  FMT_CONSTEXPR20 bigint() : exp_(0) {}
   explicit bigint(uint64_t n) { assign(n); }
 
   bigint(const bigint&) = delete;
   void operator=(const bigint&) = delete;
 
-  FMT_CONSTEXPR void assign(const bigint& other) {
+  FMT_CONSTEXPR20 void assign(const bigint& other) {
     auto size = other.bigits_.size();
     bigits_.resize(size);
     auto data = other.bigits_.data();
-    copy<bigit>(data, data + size, bigits_.data());
+    copy_str<bigit>(data, data + size, bigits_.data());
     exp_ = other.exp_;
   }
 
-  template <typename Int> FMT_CONSTEXPR void operator=(Int n) {
+  template <typename Int> FMT_CONSTEXPR20 void operator=(Int n) {
     FMT_ASSERT(n > 0, "");
     assign(uint64_or_128_t<Int>(n));
   }
 
-  FMT_CONSTEXPR auto num_bigits() const -> int {
+  FMT_CONSTEXPR20 auto num_bigits() const -> int {
     return static_cast<int>(bigits_.size()) + exp_;
   }
 
-  FMT_CONSTEXPR auto operator<<=(int shift) -> bigint& {
+  FMT_NOINLINE FMT_CONSTEXPR20 auto operator<<=(int shift) -> bigint& {
     FMT_ASSERT(shift >= 0, "");
     exp_ += shift / bigit_bits;
     shift %= bigit_bits;
@@ -2629,39 +2968,49 @@ class bigint {
     return *this;
   }
 
-  template <typename Int> FMT_CONSTEXPR auto operator*=(Int value) -> bigint& {
+  template <typename Int>
+  FMT_CONSTEXPR20 auto operator*=(Int value) -> bigint& {
     FMT_ASSERT(value > 0, "");
     multiply(uint32_or_64_or_128_t<Int>(value));
     return *this;
   }
 
-  friend FMT_CONSTEXPR auto compare(const bigint& b1, const bigint& b2) -> int {
-    int num_bigits1 = b1.num_bigits(), num_bigits2 = b2.num_bigits();
-    if (num_bigits1 != num_bigits2) return num_bigits1 > num_bigits2 ? 1 : -1;
-    int i = static_cast<int>(b1.bigits_.size()) - 1;
-    int j = static_cast<int>(b2.bigits_.size()) - 1;
+  friend FMT_CONSTEXPR20 auto compare(const bigint& lhs, const bigint& rhs)
+      -> int {
+    int num_lhs_bigits = lhs.num_bigits(), num_rhs_bigits = rhs.num_bigits();
+    if (num_lhs_bigits != num_rhs_bigits)
+      return num_lhs_bigits > num_rhs_bigits ? 1 : -1;
+    int i = static_cast<int>(lhs.bigits_.size()) - 1;
+    int j = static_cast<int>(rhs.bigits_.size()) - 1;
     int end = i - j;
     if (end < 0) end = 0;
     for (; i >= end; --i, --j) {
-      bigit b1_bigit = b1.bigits_[i], b2_bigit = b2.bigits_[j];
-      if (b1_bigit != b2_bigit) return b1_bigit > b2_bigit ? 1 : -1;
+      bigit lhs_bigit = lhs[i], rhs_bigit = rhs[j];
+      if (lhs_bigit != rhs_bigit) return lhs_bigit > rhs_bigit ? 1 : -1;
     }
     if (i != j) return i > j ? 1 : -1;
     return 0;
   }
 
   // Returns compare(lhs1 + lhs2, rhs).
-  friend FMT_CONSTEXPR auto add_compare(const bigint& lhs1, const bigint& lhs2,
-                                        const bigint& rhs) -> int {
-    int max_lhs_bigits = max_of(lhs1.num_bigits(), lhs2.num_bigits());
+  friend FMT_CONSTEXPR20 auto add_compare(const bigint& lhs1,
+                                          const bigint& lhs2, const bigint& rhs)
+      -> int {
+    auto minimum = [](int a, int b) { return a < b ? a : b; };
+    auto maximum = [](int a, int b) { return a > b ? a : b; };
+    int max_lhs_bigits = maximum(lhs1.num_bigits(), lhs2.num_bigits());
     int num_rhs_bigits = rhs.num_bigits();
     if (max_lhs_bigits + 1 < num_rhs_bigits) return -1;
     if (max_lhs_bigits > num_rhs_bigits) return 1;
+    auto get_bigit = [](const bigint& n, int i) -> bigit {
+      return i >= n.exp_ && i < n.num_bigits() ? n[i - n.exp_] : 0;
+    };
     double_bigit borrow = 0;
-    int min_exp = min_of(min_of(lhs1.exp_, lhs2.exp_), rhs.exp_);
+    int min_exp = minimum(minimum(lhs1.exp_, lhs2.exp_), rhs.exp_);
     for (int i = num_rhs_bigits - 1; i >= min_exp; --i) {
-      double_bigit sum = double_bigit(lhs1.get_bigit(i)) + lhs2.get_bigit(i);
-      bigit rhs_bigit = rhs.get_bigit(i);
+      double_bigit sum =
+          static_cast<double_bigit>(get_bigit(lhs1, i)) + get_bigit(lhs2, i);
+      bigit rhs_bigit = get_bigit(rhs, i);
       if (sum > rhs_bigit + borrow) return 1;
       borrow = rhs_bigit + borrow - sum;
       if (borrow > 1) return -1;
@@ -2674,8 +3023,10 @@ class bigint {
   FMT_CONSTEXPR20 void assign_pow10(int exp) {
     FMT_ASSERT(exp >= 0, "");
     if (exp == 0) return *this = 1;
-    int bitmask = 1 << (num_bits<unsigned>() -
-                        countl_zero(static_cast<uint32_t>(exp)) - 1);
+    // Find the top bit.
+    int bitmask = 1;
+    while (exp >= bitmask) bitmask <<= 1;
+    bitmask >>= 1;
     // pow(10, exp) = pow(5, exp) * pow(2, exp). First compute pow(5, exp) by
     // repeated squaring and multiplication.
     *this = 5;
@@ -2699,17 +3050,17 @@ class bigint {
       // cross-product terms n[i] * n[j] such that i + j == bigit_index.
       for (int i = 0, j = bigit_index; j >= 0; ++i, --j) {
         // Most terms are multiplied twice which can be optimized in the future.
-        sum += double_bigit(n[i]) * n[j];
+        sum += static_cast<double_bigit>(n[i]) * n[j];
       }
-      bigits_[bigit_index] = static_cast<bigit>(sum);
+      (*this)[bigit_index] = static_cast<bigit>(sum);
       sum >>= num_bits<bigit>();  // Compute the carry.
     }
     // Do the same for the top half.
     for (int bigit_index = num_bigits; bigit_index < num_result_bigits;
          ++bigit_index) {
       for (int j = num_bigits - 1, i = bigit_index - j; i < num_bigits;)
-        sum += double_bigit(n[i++]) * n[j--];
-      bigits_[bigit_index] = static_cast<bigit>(sum);
+        sum += static_cast<double_bigit>(n[i++]) * n[j--];
+      (*this)[bigit_index] = static_cast<bigit>(sum);
       sum >>= num_bits<bigit>();
     }
     remove_leading_zeros();
@@ -2718,20 +3069,20 @@ class bigint {
 
   // If this bigint has a bigger exponent than other, adds trailing zero to make
   // exponents equal. This simplifies some operations such as subtraction.
-  FMT_CONSTEXPR void align(const bigint& other) {
+  FMT_CONSTEXPR20 void align(const bigint& other) {
     int exp_difference = exp_ - other.exp_;
     if (exp_difference <= 0) return;
     int num_bigits = static_cast<int>(bigits_.size());
     bigits_.resize(to_unsigned(num_bigits + exp_difference));
     for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j)
       bigits_[j] = bigits_[i];
-    memset(bigits_.data(), 0, to_unsigned(exp_difference) * sizeof(bigit));
+    std::uninitialized_fill_n(bigits_.data(), exp_difference, 0u);
     exp_ -= exp_difference;
   }
 
   // Divides this bignum by divisor, assigning the remainder to this and
   // returning the quotient.
-  FMT_CONSTEXPR auto divmod_assign(const bigint& divisor) -> int {
+  FMT_CONSTEXPR20 auto divmod_assign(const bigint& divisor) -> int {
     FMT_ASSERT(this != &divisor, "");
     if (compare(*this, divisor) < 0) return 0;
     FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, "");
@@ -2850,11 +3201,8 @@ FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
   // Generate the given number of digits.
   exp10 -= num_digits - 1;
   if (num_digits <= 0) {
-    auto digit = '0';
-    if (num_digits == 0) {
-      denominator *= 10;
-      digit = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
-    }
+    denominator *= 10;
+    auto digit = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
     buf.push_back(digit);
     return;
   }
@@ -2891,8 +3239,8 @@ FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
 
 // Formats a floating-point number using the hexfloat format.
 template <typename Float, FMT_ENABLE_IF(!is_double_double<Float>::value)>
-FMT_CONSTEXPR20 void format_hexfloat(Float value, format_specs specs,
-                                     buffer<char>& buf) {
+FMT_CONSTEXPR20 void format_hexfloat(Float value, int precision,
+                                     float_specs specs, buffer<char>& buf) {
   // float is passed as double to reduce the number of instantiations and to
   // simplify implementation.
   static_assert(!std::is_same<Float, float>::value, "");
@@ -2902,25 +3250,26 @@ FMT_CONSTEXPR20 void format_hexfloat(Float value, format_specs specs,
   // Assume Float is in the format [sign][exponent][significand].
   using carrier_uint = typename info::carrier_uint;
 
-  const auto num_float_significand_bits = detail::num_significand_bits<Float>();
+  constexpr auto num_float_significand_bits =
+      detail::num_significand_bits<Float>();
 
   basic_fp<carrier_uint> f(value);
   f.e += num_float_significand_bits;
   if (!has_implicit_bit<Float>()) --f.e;
 
-  const auto num_fraction_bits =
+  constexpr auto num_fraction_bits =
       num_float_significand_bits + (has_implicit_bit<Float>() ? 1 : 0);
-  const auto num_xdigits = (num_fraction_bits + 3) / 4;
+  constexpr auto num_xdigits = (num_fraction_bits + 3) / 4;
 
-  const auto leading_shift = ((num_xdigits - 1) * 4);
+  constexpr auto leading_shift = ((num_xdigits - 1) * 4);
   const auto leading_mask = carrier_uint(0xF) << leading_shift;
   const auto leading_xdigit =
       static_cast<uint32_t>((f.f & leading_mask) >> leading_shift);
   if (leading_xdigit > 1) f.e -= (32 - countl_zero(leading_xdigit) - 1);
 
   int print_xdigits = num_xdigits - 1;
-  if (specs.precision >= 0 && print_xdigits > specs.precision) {
-    const int shift = ((print_xdigits - specs.precision - 1) * 4);
+  if (precision >= 0 && print_xdigits > precision) {
+    const int shift = ((print_xdigits - precision - 1) * 4);
     const auto mask = carrier_uint(0xF) << shift;
     const auto v = static_cast<uint32_t>((f.f & mask) >> shift);
 
@@ -2939,25 +3288,25 @@ FMT_CONSTEXPR20 void format_hexfloat(Float value, format_specs specs,
       }
     }
 
-    print_xdigits = specs.precision;
+    print_xdigits = precision;
   }
 
   char xdigits[num_bits<carrier_uint>() / 4];
   detail::fill_n(xdigits, sizeof(xdigits), '0');
-  format_base2e(4, xdigits, f.f, num_xdigits, specs.upper());
+  format_uint<4>(xdigits, f.f, num_xdigits, specs.upper);
 
   // Remove zero tail
   while (print_xdigits > 0 && xdigits[print_xdigits] == '0') --print_xdigits;
 
   buf.push_back('0');
-  buf.push_back(specs.upper() ? 'X' : 'x');
+  buf.push_back(specs.upper ? 'X' : 'x');
   buf.push_back(xdigits[0]);
-  if (specs.alt() || print_xdigits > 0 || print_xdigits < specs.precision)
+  if (specs.showpoint || print_xdigits > 0 || print_xdigits < precision)
     buf.push_back('.');
   buf.append(xdigits + 1, xdigits + 1 + print_xdigits);
-  for (; print_xdigits < specs.precision; ++print_xdigits) buf.push_back('0');
+  for (; print_xdigits < precision; ++print_xdigits) buf.push_back('0');
 
-  buf.push_back(specs.upper() ? 'P' : 'p');
+  buf.push_back(specs.upper ? 'P' : 'p');
 
   uint32_t abs_e;
   if (f.e < 0) {
@@ -2971,9 +3320,9 @@ FMT_CONSTEXPR20 void format_hexfloat(Float value, format_specs specs,
 }
 
 template <typename Float, FMT_ENABLE_IF(is_double_double<Float>::value)>
-FMT_CONSTEXPR20 void format_hexfloat(Float value, format_specs specs,
-                                     buffer<char>& buf) {
-  format_hexfloat(static_cast<double>(value), specs, buf);
+FMT_CONSTEXPR20 void format_hexfloat(Float value, int precision,
+                                     float_specs specs, buffer<char>& buf) {
+  format_hexfloat(static_cast<double>(value), precision, specs, buf);
 }
 
 constexpr auto fractional_part_rounding_thresholds(int index) -> uint32_t {
@@ -2988,15 +3337,15 @@ constexpr auto fractional_part_rounding_thresholds(int index) -> uint32_t {
 }
 
 template <typename Float>
-FMT_CONSTEXPR20 auto format_float(Float value, int precision,
-                                  const format_specs& specs, bool binary32,
+FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs,
                                   buffer<char>& buf) -> int {
   // float is passed as double to reduce the number of instantiations.
   static_assert(!std::is_same<Float, float>::value, "");
+  FMT_ASSERT(value >= 0, "value is negative");
   auto converted_value = convert_float(value);
 
-  const bool fixed = specs.type() == presentation_type::fixed;
-  if (value == 0) {
+  const bool fixed = specs.format == float_format::fixed;
+  if (value <= 0) {  // <= instead of == to silence a warning.
     if (precision <= 0 || !fixed) {
       buf.push_back('0');
       return 0;
@@ -3021,6 +3370,16 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision,
     exp = static_cast<int>(e);
     if (e > exp) ++exp;  // Compute ceil.
     dragon_flags = dragon::fixup;
+  } else if (precision < 0) {
+    // Use Dragonbox for the shortest format.
+    if (specs.binary32) {
+      auto dec = dragonbox::to_decimal(static_cast<float>(value));
+      write<char>(buffer_appender<char>(buf), dec.significand);
+      return dec.exponent;
+    }
+    auto dec = dragonbox::to_decimal(static_cast<double>(value));
+    write<char>(buffer_appender<char>(buf), dec.significand);
+    return dec.exponent;
   } else {
     // Extract significand bits and exponent bits.
     using info = dragonbox::float_info<double>;
@@ -3119,7 +3478,7 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision,
         uint64_t prod;
         uint32_t digits;
         bool should_round_up;
-        int number_of_digits_to_print = min_of(precision, 9);
+        int number_of_digits_to_print = precision > 9 ? 9 : precision;
 
         // Print a 9-digits subsegment, either the first or the second.
         auto print_subsegment = [&](uint32_t subsegment, char* buffer) {
@@ -3147,7 +3506,7 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision,
             // for details.
             prod = ((subsegment * static_cast<uint64_t>(450359963)) >> 20) + 1;
             digits = static_cast<uint32_t>(prod >> 32);
-            write2digits(buffer, digits);
+            copy2(buffer, digits2(digits));
             number_of_digits_printed += 2;
           }
 
@@ -3155,7 +3514,7 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision,
           while (number_of_digits_printed < number_of_digits_to_print) {
             prod = static_cast<uint32_t>(prod) * static_cast<uint64_t>(100);
             digits = static_cast<uint32_t>(prod >> 32);
-            write2digits(buffer + number_of_digits_printed, digits);
+            copy2(buffer + number_of_digits_printed, digits2(digits));
             number_of_digits_printed += 2;
           }
         };
@@ -3264,8 +3623,9 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision,
   }
   if (use_dragon) {
     auto f = basic_fp<uint128_t>();
-    bool is_predecessor_closer = binary32 ? f.assign(static_cast<float>(value))
-                                          : f.assign(converted_value);
+    bool is_predecessor_closer = specs.binary32
+                                     ? f.assign(static_cast<float>(value))
+                                     : f.assign(converted_value);
     if (is_predecessor_closer) dragon_flags |= dragon::predecessor_closer;
     if (fixed) dragon_flags |= dragon::fixed;
     // Limit precision to the maximum possible number of significant digits in
@@ -3274,7 +3634,7 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision,
     if (precision > max_double_digits) precision = max_double_digits;
     format_dragon(f, dragon_flags, precision, buf, exp);
   }
-  if (!fixed && !specs.alt()) {
+  if (!fixed && !specs.showpoint) {
     // Remove trailing zeros.
     auto num_digits = buf.size();
     while (num_digits > 0 && buf[num_digits - 1] == '0') {
@@ -3285,97 +3645,97 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision,
   }
   return exp;
 }
-
 template <typename Char, typename OutputIt, typename T>
-FMT_CONSTEXPR20 auto write_float(OutputIt out, T value, format_specs specs,
-                                 locale_ref loc) -> OutputIt {
-  // Use signbit because value < 0 is false for NaN.
-  sign s = detail::signbit(value) ? sign::minus : specs.sign();
+FMT_CONSTEXPR20 auto write_float(OutputIt out, T value,
+                                 format_specs<Char> specs, locale_ref loc)
+    -> OutputIt {
+  float_specs fspecs = parse_float_type_spec(specs);
+  fspecs.sign = specs.sign;
+  if (detail::signbit(value)) {  // value < 0 is false for NaN so use signbit.
+    fspecs.sign = sign::minus;
+    value = -value;
+  } else if (fspecs.sign == sign::minus) {
+    fspecs.sign = sign::none;
+  }
 
   if (!detail::isfinite(value))
-    return write_nonfinite<Char>(out, detail::isnan(value), specs, s);
+    return write_nonfinite(out, detail::isnan(value), specs, fspecs);
 
-  if (specs.align() == align::numeric && s != sign::none) {
-    *out++ = detail::getsign<Char>(s);
-    s = sign::none;
+  if (specs.align == align::numeric && fspecs.sign) {
+    auto it = reserve(out, 1);
+    *it++ = detail::sign<Char>(fspecs.sign);
+    out = base_iterator(out, it);
+    fspecs.sign = sign::none;
     if (specs.width != 0) --specs.width;
   }
 
-  int precision = specs.precision;
-  if (precision < 0) {
-    if (specs.type() != presentation_type::none) {
-      precision = 6;
-    } else if (is_fast_float<T>::value && !is_constant_evaluated()) {
-      // Use Dragonbox for the shortest format.
-      using floaty = conditional_t<sizeof(T) >= sizeof(double), double, float>;
-      auto dec = dragonbox::to_decimal(static_cast<floaty>(value));
-      return write_float<Char>(out, dec, specs, s, loc);
-    }
-  }
-
   memory_buffer buffer;
-  if (specs.type() == presentation_type::hexfloat) {
-    if (s != sign::none) buffer.push_back(detail::getsign<char>(s));
-    format_hexfloat(convert_float(value), specs, buffer);
-    return write_bytes<Char, align::right>(out, {buffer.data(), buffer.size()},
-                                           specs);
+  if (fspecs.format == float_format::hex) {
+    if (fspecs.sign) buffer.push_back(detail::sign<char>(fspecs.sign));
+    format_hexfloat(convert_float(value), specs.precision, fspecs, buffer);
+    return write_bytes<align::right>(out, {buffer.data(), buffer.size()},
+                                     specs);
   }
-
-  if (specs.type() == presentation_type::exp) {
+  int precision = specs.precision >= 0 || specs.type == presentation_type::none
+                      ? specs.precision
+                      : 6;
+  if (fspecs.format == float_format::exp) {
     if (precision == max_value<int>())
-      report_error("number is too big");
+      throw_format_error("number is too big");
     else
       ++precision;
-    if (specs.precision != 0) specs.set_alt();
-  } else if (specs.type() == presentation_type::fixed) {
-    if (specs.precision != 0) specs.set_alt();
-  } else if (precision == 0) {
+  } else if (fspecs.format != float_format::fixed && precision == 0) {
     precision = 1;
   }
-  int exp = format_float(convert_float(value), precision, specs,
-                         std::is_same<T, float>(), buffer);
-
-  specs.precision = precision;
+  if (const_check(std::is_same<T, float>())) fspecs.binary32 = true;
+  int exp = format_float(convert_float(value), precision, fspecs, buffer);
+  fspecs.precision = precision;
   auto f = big_decimal_fp{buffer.data(), static_cast<int>(buffer.size()), exp};
-  return write_float<Char>(out, f, specs, s, loc);
+  return write_float(out, f, specs, fspecs, loc);
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_floating_point<T>::value)>
-FMT_CONSTEXPR20 auto write(OutputIt out, T value, format_specs specs,
+FMT_CONSTEXPR20 auto write(OutputIt out, T value, format_specs<Char> specs,
                            locale_ref loc = {}) -> OutputIt {
-  return specs.localized() && write_loc(out, value, specs, loc)
+  if (const_check(!is_supported_floating_point(value))) return out;
+  return specs.localized && write_loc(out, value, specs, loc)
              ? out
-             : write_float<Char>(out, value, specs, loc);
+             : write_float(out, value, specs, loc);
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_fast_float<T>::value)>
 FMT_CONSTEXPR20 auto write(OutputIt out, T value) -> OutputIt {
-  if (is_constant_evaluated()) return write<Char>(out, value, format_specs());
+  if (is_constant_evaluated()) return write(out, value, format_specs<Char>());
+  if (const_check(!is_supported_floating_point(value))) return out;
 
-  auto s = detail::signbit(value) ? sign::minus : sign::none;
+  auto fspecs = float_specs();
+  if (detail::signbit(value)) {
+    fspecs.sign = sign::minus;
+    value = -value;
+  }
 
-  constexpr auto specs = format_specs();
-  using floaty = conditional_t<sizeof(T) >= sizeof(double), double, float>;
+  constexpr auto specs = format_specs<Char>();
+  using floaty = conditional_t<std::is_same<T, long double>::value, double, T>;
   using floaty_uint = typename dragonbox::float_info<floaty>::carrier_uint;
   floaty_uint mask = exponent_mask<floaty>();
   if ((bit_cast<floaty_uint>(value) & mask) == mask)
-    return write_nonfinite<Char>(out, std::isnan(value), specs, s);
+    return write_nonfinite(out, std::isnan(value), specs, fspecs);
 
   auto dec = dragonbox::to_decimal(static_cast<floaty>(value));
-  return write_float<Char>(out, dec, specs, s, {});
+  return write_float(out, dec, specs, fspecs, {});
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_floating_point<T>::value &&
                         !is_fast_float<T>::value)>
 inline auto write(OutputIt out, T value) -> OutputIt {
-  return write<Char>(out, value, format_specs());
+  return write(out, value, format_specs<Char>());
 }
 
 template <typename Char, typename OutputIt>
-auto write(OutputIt out, monostate, format_specs = {}, locale_ref = {})
+auto write(OutputIt out, monostate, format_specs<Char> = {}, locale_ref = {})
     -> OutputIt {
   FMT_ASSERT(false, "");
   return out;
@@ -3384,11 +3744,13 @@ auto write(OutputIt out, monostate, format_specs = {}, locale_ref = {})
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR auto write(OutputIt out, basic_string_view<Char> value)
     -> OutputIt {
-  return copy_noinline<Char>(value.begin(), value.end(), out);
+  auto it = reserve(out, value.size());
+  it = copy_str_noinline<Char>(value.begin(), value.end(), it);
+  return base_iterator(out, it);
 }
 
 template <typename Char, typename OutputIt, typename T,
-          FMT_ENABLE_IF(has_to_string_view<T>::value)>
+          FMT_ENABLE_IF(is_string<T>::value)>
 constexpr auto write(OutputIt out, const T& value) -> OutputIt {
   return write<Char>(out, to_string_view(value));
 }
@@ -3396,8 +3758,10 @@ constexpr auto write(OutputIt out, const T& value) -> OutputIt {
 // FMT_ENABLE_IF() condition separated to workaround an MSVC bug.
 template <
     typename Char, typename OutputIt, typename T,
-    bool check = std::is_enum<T>::value && !std::is_same<T, Char>::value &&
-                 mapped_type_constant<T, Char>::value != type::custom_type,
+    bool check =
+        std::is_enum<T>::value && !std::is_same<T, Char>::value &&
+        mapped_type_constant<T, basic_format_context<OutputIt, Char>>::value !=
+            type::custom_type,
     FMT_ENABLE_IF(check)>
 FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
   return write<Char>(out, static_cast<underlying_t<T>>(value));
@@ -3405,12 +3769,13 @@ FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(std::is_same<T, bool>::value)>
-FMT_CONSTEXPR auto write(OutputIt out, T value, const format_specs& specs = {},
-                         locale_ref = {}) -> OutputIt {
-  return specs.type() != presentation_type::none &&
-                 specs.type() != presentation_type::string
-             ? write<Char>(out, value ? 1 : 0, specs, {})
-             : write_bytes<Char>(out, value ? "true" : "false", specs);
+FMT_CONSTEXPR auto write(OutputIt out, T value,
+                         const format_specs<Char>& specs = {}, locale_ref = {})
+    -> OutputIt {
+  return specs.type != presentation_type::none &&
+                 specs.type != presentation_type::string
+             ? write(out, value ? 1 : 0, specs, {})
+             : write_bytes(out, value ? "true" : "false", specs);
 }
 
 template <typename Char, typename OutputIt>
@@ -3423,148 +3788,168 @@ FMT_CONSTEXPR auto write(OutputIt out, Char value) -> OutputIt {
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR20 auto write(OutputIt out, const Char* value) -> OutputIt {
   if (value) return write(out, basic_string_view<Char>(value));
-  report_error("string pointer is null");
+  throw_format_error("string pointer is null");
   return out;
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(std::is_same<T, void>::value)>
-auto write(OutputIt out, const T* value, const format_specs& specs = {},
+auto write(OutputIt out, const T* value, const format_specs<Char>& specs = {},
            locale_ref = {}) -> OutputIt {
   return write_ptr<Char>(out, bit_cast<uintptr_t>(value), &specs);
 }
 
+// A write overload that handles implicit conversions.
 template <typename Char, typename OutputIt, typename T,
-          FMT_ENABLE_IF(mapped_type_constant<T, Char>::value ==
-                            type::custom_type &&
-                        !std::is_fundamental<T>::value)>
-FMT_CONSTEXPR auto write(OutputIt out, const T& value) -> OutputIt {
-  auto f = formatter<T, Char>();
-  auto parse_ctx = parse_context<Char>({});
-  f.parse(parse_ctx);
-  auto ctx = basic_format_context<OutputIt, Char>(out, {}, {});
-  return f.format(value, ctx);
+          typename Context = basic_format_context<OutputIt, Char>>
+FMT_CONSTEXPR auto write(OutputIt out, const T& value) -> enable_if_t<
+    std::is_class<T>::value && !is_string<T>::value &&
+        !is_floating_point<T>::value && !std::is_same<T, Char>::value &&
+        !std::is_same<T, remove_cvref_t<decltype(arg_mapper<Context>().map(
+                             value))>>::value,
+    OutputIt> {
+  return write<Char>(out, arg_mapper<Context>().map(value));
 }
 
-template <typename T>
-using is_builtin =
-    bool_constant<std::is_same<T, int>::value || FMT_BUILTIN_TYPES>;
+template <typename Char, typename OutputIt, typename T,
+          typename Context = basic_format_context<OutputIt, Char>>
+FMT_CONSTEXPR auto write(OutputIt out, const T& value)
+    -> enable_if_t<mapped_type_constant<T, Context>::value == type::custom_type,
+                   OutputIt> {
+  auto formatter = typename Context::template formatter_type<T>();
+  auto parse_ctx = typename Context::parse_context_type({});
+  formatter.parse(parse_ctx);
+  auto ctx = Context(out, {}, {});
+  return formatter.format(value, ctx);
+}
 
 // An argument visitor that formats the argument and writes it via the output
 // iterator. It's a class and not a generic lambda for compatibility with C++11.
 template <typename Char> struct default_arg_formatter {
-  using context = buffered_context<Char>;
+  using iterator = buffer_appender<Char>;
+  using context = buffer_context<Char>;
 
-  basic_appender<Char> out;
+  iterator out;
+  basic_format_args<context> args;
+  locale_ref loc;
 
-  void operator()(monostate) { report_error("argument not found"); }
-
-  template <typename T, FMT_ENABLE_IF(is_builtin<T>::value)>
-  void operator()(T value) {
-    write<Char>(out, value);
+  template <typename T> auto operator()(T value) -> iterator {
+    return write<Char>(out, value);
   }
-
-  template <typename T, FMT_ENABLE_IF(!is_builtin<T>::value)>
-  void operator()(T) {
-    FMT_ASSERT(false, "");
-  }
-
-  void operator()(typename basic_format_arg<context>::handle h) {
-    // Use a null locale since the default format must be unlocalized.
-    auto parse_ctx = parse_context<Char>({});
-    auto format_ctx = context(out, {}, {});
+  auto operator()(typename basic_format_arg<context>::handle h) -> iterator {
+    basic_format_parse_context<Char> parse_ctx({});
+    context format_ctx(out, args, loc);
     h.format(parse_ctx, format_ctx);
+    return format_ctx.out();
   }
 };
 
 template <typename Char> struct arg_formatter {
-  basic_appender<Char> out;
-  const format_specs& specs;
-  FMT_NO_UNIQUE_ADDRESS locale_ref locale;
+  using iterator = buffer_appender<Char>;
+  using context = buffer_context<Char>;
 
-  template <typename T, FMT_ENABLE_IF(is_builtin<T>::value)>
-  FMT_CONSTEXPR FMT_INLINE void operator()(T value) {
-    detail::write<Char>(out, value, specs, locale);
+  iterator out;
+  const format_specs<Char>& specs;
+  locale_ref locale;
+
+  template <typename T>
+  FMT_CONSTEXPR FMT_INLINE auto operator()(T value) -> iterator {
+    return detail::write(out, value, specs, locale);
   }
-
-  template <typename T, FMT_ENABLE_IF(!is_builtin<T>::value)>
-  void operator()(T) {
-    FMT_ASSERT(false, "");
-  }
-
-  void operator()(typename basic_format_arg<buffered_context<Char>>::handle) {
+  auto operator()(typename basic_format_arg<context>::handle) -> iterator {
     // User-defined types are handled separately because they require access
     // to the parse context.
+    return out;
   }
 };
 
-struct dynamic_spec_getter {
+struct width_checker {
   template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
   FMT_CONSTEXPR auto operator()(T value) -> unsigned long long {
-    return is_negative(value) ? ~0ull : static_cast<unsigned long long>(value);
+    if (is_negative(value)) throw_format_error("negative width");
+    return static_cast<unsigned long long>(value);
   }
 
   template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
   FMT_CONSTEXPR auto operator()(T) -> unsigned long long {
-    report_error("width/precision is not integer");
+    throw_format_error("width is not integer");
     return 0;
   }
 };
 
-template <typename Context, typename ID>
-FMT_CONSTEXPR auto get_arg(Context& ctx, ID id) -> basic_format_arg<Context> {
-  auto arg = ctx.arg(id);
-  if (!arg) report_error("argument not found");
-  return arg;
-}
+struct precision_checker {
+  template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
+  FMT_CONSTEXPR auto operator()(T value) -> unsigned long long {
+    if (is_negative(value)) throw_format_error("negative precision");
+    return static_cast<unsigned long long>(value);
+  }
 
-template <typename Context>
-FMT_CONSTEXPR int get_dynamic_spec(
-    arg_id_kind kind, const arg_ref<typename Context::char_type>& ref,
-    Context& ctx) {
-  FMT_ASSERT(kind != arg_id_kind::none, "");
-  auto arg =
-      kind == arg_id_kind::index ? ctx.arg(ref.index) : ctx.arg(ref.name);
-  if (!arg) report_error("argument not found");
-  unsigned long long value = arg.visit(dynamic_spec_getter());
+  template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
+  FMT_CONSTEXPR auto operator()(T) -> unsigned long long {
+    throw_format_error("precision is not integer");
+    return 0;
+  }
+};
+
+template <typename Handler, typename FormatArg>
+FMT_CONSTEXPR auto get_dynamic_spec(FormatArg arg) -> int {
+  unsigned long long value = arg.visit(Handler());
   if (value > to_unsigned(max_value<int>()))
-    report_error("width/precision is out of range");
+    throw_format_error("number is too big");
   return static_cast<int>(value);
 }
 
-template <typename Context>
-FMT_CONSTEXPR void handle_dynamic_spec(
-    arg_id_kind kind, int& value,
-    const arg_ref<typename Context::char_type>& ref, Context& ctx) {
-  if (kind != arg_id_kind::none) value = get_dynamic_spec(kind, ref, ctx);
+template <typename Context, typename ID>
+FMT_CONSTEXPR auto get_arg(Context& ctx, ID id) -> decltype(ctx.arg(id)) {
+  auto arg = ctx.arg(id);
+  if (!arg) ctx.on_error("argument not found");
+  return arg;
 }
 
-#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+template <typename Handler, typename Context>
+FMT_CONSTEXPR void handle_dynamic_spec(int& value,
+                                       arg_ref<typename Context::char_type> ref,
+                                       Context& ctx) {
+  switch (ref.kind) {
+  case arg_id_kind::none:
+    break;
+  case arg_id_kind::index:
+    value = detail::get_dynamic_spec<Handler>(get_arg(ctx, ref.val.index));
+    break;
+  case arg_id_kind::name:
+    value = detail::get_dynamic_spec<Handler>(get_arg(ctx, ref.val.name));
+    break;
+  }
+}
+
+#if FMT_USE_USER_DEFINED_LITERALS
+#  if FMT_USE_NONTYPE_TEMPLATE_ARGS
 template <typename T, typename Char, size_t N,
-          fmt::detail::fixed_string<Char, N> Str>
-struct static_named_arg : view {
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct statically_named_arg : view {
   static constexpr auto name = Str.data;
 
   const T& value;
-  static_named_arg(const T& v) : value(v) {}
+  statically_named_arg(const T& v) : value(v) {}
 };
 
 template <typename T, typename Char, size_t N,
-          fmt::detail::fixed_string<Char, N> Str>
-struct is_named_arg<static_named_arg<T, Char, N, Str>> : std::true_type {};
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct is_named_arg<statically_named_arg<T, Char, N, Str>> : std::true_type {};
 
 template <typename T, typename Char, size_t N,
-          fmt::detail::fixed_string<Char, N> Str>
-struct is_static_named_arg<static_named_arg<T, Char, N, Str>> : std::true_type {
-};
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct is_statically_named_arg<statically_named_arg<T, Char, N, Str>>
+    : std::true_type {};
 
-template <typename Char, size_t N, fmt::detail::fixed_string<Char, N> Str>
+template <typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
 struct udl_arg {
   template <typename T> auto operator=(T&& value) const {
-    return static_named_arg<T, Char, N, Str>(std::forward<T>(value));
+    return statically_named_arg<T, Char, N, Str>(std::forward<T>(value));
   }
 };
-#else
+#  else
 template <typename Char> struct udl_arg {
   const Char* str;
 
@@ -3572,198 +3957,149 @@ template <typename Char> struct udl_arg {
     return {str, std::forward<T>(value)};
   }
 };
-#endif  // FMT_USE_NONTYPE_TEMPLATE_ARGS
+#  endif
+#endif  // FMT_USE_USER_DEFINED_LITERALS
 
-template <typename Char> struct format_handler {
-  parse_context<Char> parse_ctx;
-  buffered_context<Char> ctx;
-
-  void on_text(const Char* begin, const Char* end) {
-    copy_noinline<Char>(begin, end, ctx.out());
-  }
-
-  FMT_CONSTEXPR auto on_arg_id() -> int { return parse_ctx.next_arg_id(); }
-  FMT_CONSTEXPR auto on_arg_id(int id) -> int {
-    parse_ctx.check_arg_id(id);
-    return id;
-  }
-  FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
-    parse_ctx.check_arg_id(id);
-    int arg_id = ctx.arg_id(id);
-    if (arg_id < 0) report_error("argument not found");
-    return arg_id;
-  }
-
-  FMT_INLINE void on_replacement_field(int id, const Char*) {
-    ctx.arg(id).visit(default_arg_formatter<Char>{ctx.out()});
-  }
-
-  auto on_format_specs(int id, const Char* begin, const Char* end)
-      -> const Char* {
-    auto arg = get_arg(ctx, id);
-    // Not using a visitor for custom types gives better codegen.
-    if (arg.format_custom(begin, parse_ctx, ctx)) return parse_ctx.begin();
-
-    auto specs = dynamic_format_specs<Char>();
-    begin = parse_format_specs(begin, end, specs, parse_ctx, arg.type());
-    if (specs.dynamic()) {
-      handle_dynamic_spec(specs.dynamic_width(), specs.width, specs.width_ref,
-                          ctx);
-      handle_dynamic_spec(specs.dynamic_precision(), specs.precision,
-                          specs.precision_ref, ctx);
-    }
-
-    arg.visit(arg_formatter<Char>{ctx.out(), specs, ctx.locale()});
-    return begin;
-  }
-
-  FMT_NORETURN void on_error(const char* message) { report_error(message); }
-};
+template <typename Locale, typename Char>
+auto vformat(const Locale& loc, basic_string_view<Char> fmt,
+             basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> std::basic_string<Char> {
+  auto buf = basic_memory_buffer<Char>();
+  detail::vformat_to(buf, fmt, args, detail::locale_ref(loc));
+  return {buf.data(), buf.size()};
+}
 
 using format_func = void (*)(detail::buffer<char>&, int, const char*);
-FMT_API void do_report_error(format_func func, int error_code,
-                             const char* message) noexcept;
 
 FMT_API void format_error_code(buffer<char>& out, int error_code,
                                string_view message) noexcept;
 
-template <typename T, typename Char, type TYPE>
-template <typename FormatContext>
-FMT_CONSTEXPR auto native_formatter<T, Char, TYPE>::format(
-    const T& val, FormatContext& ctx) const -> decltype(ctx.out()) {
-  if (!specs_.dynamic())
-    return write<Char>(ctx.out(), val, specs_, ctx.locale());
-  auto specs = format_specs(specs_);
-  handle_dynamic_spec(specs.dynamic_width(), specs.width, specs_.width_ref,
-                      ctx);
-  handle_dynamic_spec(specs.dynamic_precision(), specs.precision,
-                      specs_.precision_ref, ctx);
-  return write<Char>(ctx.out(), val, specs, ctx.locale());
-}
-
-// DEPRECATED! https://github.com/fmtlib/fmt/issues/4292.
-template <typename T, typename Enable = void>
-struct is_locale : std::false_type {};
-template <typename T>
-struct is_locale<T, void_t<decltype(T::classic())>> : std::true_type {};
-
-// DEPRECATED!
-template <typename Char = char> struct vformat_args {
-  using type = basic_format_args<buffered_context<Char>>;
-};
-template <> struct vformat_args<char> {
-  using type = format_args;
-};
-
-template <typename Char>
-void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
-                typename vformat_args<Char>::type args, locale_ref loc = {}) {
-  auto out = basic_appender<Char>(buf);
-  parse_format_string(
-      fmt, format_handler<Char>{parse_context<Char>(fmt), {out, args, loc}});
-}
+FMT_API void report_error(format_func func, int error_code,
+                          const char* message) noexcept;
 }  // namespace detail
 
-FMT_BEGIN_EXPORT
+FMT_API auto vsystem_error(int error_code, string_view format_str,
+                           format_args args) -> std::system_error;
 
-// A generic formatting context with custom output iterator and character
-// (code unit) support. Char is the format string code unit type which can be
-// different from OutputIt::value_type.
-template <typename OutputIt, typename Char> class generic_context {
+/**
+  \rst
+  Constructs :class:`std::system_error` with a message formatted with
+  ``fmt::format(fmt, args...)``.
+  *error_code* is a system error code as given by ``errno``.
+
+  **Example**::
+
+    // This throws std::system_error with the description
+    //   cannot open file 'madeup': No such file or directory
+    // or similar (system message may vary).
+    const char* filename = "madeup";
+    std::FILE* file = std::fopen(filename, "r");
+    if (!file)
+      throw fmt::system_error(errno, "cannot open file '{}'", filename);
+  \endrst
+ */
+template <typename... T>
+auto system_error(int error_code, format_string<T...> fmt, T&&... args)
+    -> std::system_error {
+  return vsystem_error(error_code, fmt, fmt::make_format_args(args...));
+}
+
+/**
+  \rst
+  Formats an error message for an error returned by an operating system or a
+  language runtime, for example a file opening error, and writes it to *out*.
+  The format is the same as the one used by ``std::system_error(ec, message)``
+  where ``ec`` is ``std::error_code(error_code, std::generic_category()})``.
+  It is implementation-defined but normally looks like:
+
+  .. parsed-literal::
+     *<message>*: *<system-message>*
+
+  where *<message>* is the passed message and *<system-message>* is the system
+  message corresponding to the error code.
+  *error_code* is a system error code as given by ``errno``.
+  \endrst
+ */
+FMT_API void format_system_error(detail::buffer<char>& out, int error_code,
+                                 const char* message) noexcept;
+
+// Reports a system error without throwing an exception.
+// Can be used to report errors from destructors.
+FMT_API void report_system_error(int error_code, const char* message) noexcept;
+
+/** Fast integer formatter. */
+class format_int {
  private:
-  OutputIt out_;
-  basic_format_args<generic_context> args_;
-  detail::locale_ref loc_;
+  // Buffer should be large enough to hold all digits (digits10 + 1),
+  // a sign and a null character.
+  enum { buffer_size = std::numeric_limits<unsigned long long>::digits10 + 3 };
+  mutable char buffer_[buffer_size];
+  char* str_;
+
+  template <typename UInt> auto format_unsigned(UInt value) -> char* {
+    auto n = static_cast<detail::uint32_or_64_or_128_t<UInt>>(value);
+    return detail::format_decimal(buffer_, n, buffer_size - 1).begin;
+  }
+
+  template <typename Int> auto format_signed(Int value) -> char* {
+    auto abs_value = static_cast<detail::uint32_or_64_or_128_t<Int>>(value);
+    bool negative = value < 0;
+    if (negative) abs_value = 0 - abs_value;
+    auto begin = format_unsigned(abs_value);
+    if (negative) *--begin = '-';
+    return begin;
+  }
 
  public:
-  using char_type = Char;
-  using iterator = OutputIt;
-  using parse_context_type FMT_DEPRECATED = parse_context<Char>;
-  template <typename T>
-  using formatter_type FMT_DEPRECATED = formatter<T, Char>;
-  enum { builtin_types = FMT_BUILTIN_TYPES };
+  explicit format_int(int value) : str_(format_signed(value)) {}
+  explicit format_int(long value) : str_(format_signed(value)) {}
+  explicit format_int(long long value) : str_(format_signed(value)) {}
+  explicit format_int(unsigned value) : str_(format_unsigned(value)) {}
+  explicit format_int(unsigned long value) : str_(format_unsigned(value)) {}
+  explicit format_int(unsigned long long value)
+      : str_(format_unsigned(value)) {}
 
-  constexpr generic_context(OutputIt out,
-                            basic_format_args<generic_context> args,
-                            detail::locale_ref loc = {})
-      : out_(out), args_(args), loc_(loc) {}
-  generic_context(generic_context&&) = default;
-  generic_context(const generic_context&) = delete;
-  void operator=(const generic_context&) = delete;
-
-  constexpr auto arg(int id) const -> basic_format_arg<generic_context> {
-    return args_.get(id);
-  }
-  auto arg(basic_string_view<Char> name) const
-      -> basic_format_arg<generic_context> {
-    return args_.get(name);
-  }
-  constexpr auto arg_id(basic_string_view<Char> name) const -> int {
-    return args_.get_id(name);
+  /** Returns the number of characters written to the output buffer. */
+  auto size() const -> size_t {
+    return detail::to_unsigned(buffer_ - str_ + buffer_size - 1);
   }
 
-  constexpr auto out() const -> iterator { return out_; }
+  /**
+    Returns a pointer to the output buffer content. No terminating null
+    character is appended.
+   */
+  auto data() const -> const char* { return str_; }
 
-  void advance_to(iterator it) {
-    if (!detail::is_back_insert_iterator<iterator>()) out_ = it;
+  /**
+    Returns a pointer to the output buffer content with terminating null
+    character appended.
+   */
+  auto c_str() const -> const char* {
+    buffer_[buffer_size - 1] = '\0';
+    return str_;
   }
 
-  constexpr auto locale() const -> detail::locale_ref { return loc_; }
+  /**
+    \rst
+    Returns the content of the output buffer as an ``std::string``.
+    \endrst
+   */
+  auto str() const -> std::string { return std::string(str_, size()); }
 };
 
-class loc_value {
- private:
-  basic_format_arg<context> value_;
-
- public:
-  template <typename T, FMT_ENABLE_IF(!detail::is_float128<T>::value)>
-  loc_value(T value) : value_(value) {}
-
-  template <typename T, FMT_ENABLE_IF(detail::is_float128<T>::value)>
-  loc_value(T) {}
-
-  template <typename Visitor> auto visit(Visitor&& vis) -> decltype(vis(0)) {
-    return value_.visit(vis);
+template <typename T, typename Char>
+struct formatter<T, Char, enable_if_t<detail::has_format_as<T>::value>>
+    : formatter<detail::format_as_t<T>, Char> {
+  template <typename FormatContext>
+  auto format(const T& value, FormatContext& ctx) const -> decltype(ctx.out()) {
+    using base = formatter<detail::format_as_t<T>, Char>;
+    return base::format(format_as(value), ctx);
   }
 };
 
-// A locale facet that formats values in UTF-8.
-// It is parameterized on the locale to avoid the heavy <locale> include.
-template <typename Locale> class format_facet : public Locale::facet {
- private:
-  std::string separator_;
-  std::string grouping_;
-  std::string decimal_point_;
-
- protected:
-  virtual auto do_put(appender out, loc_value val,
-                      const format_specs& specs) const -> bool;
-
- public:
-  static FMT_API typename Locale::id id;
-
-  explicit format_facet(Locale& loc);
-  explicit format_facet(string_view sep = "", std::string grouping = "\3",
-                        std::string decimal_point = ".")
-      : separator_(sep.data(), sep.size()),
-        grouping_(grouping),
-        decimal_point_(decimal_point) {}
-
-  auto put(appender out, loc_value val, const format_specs& specs) const
-      -> bool {
-    return do_put(out, val, specs);
-  }
-};
-
-#define FMT_FORMAT_AS(Type, Base)                                   \
-  template <typename Char>                                          \
-  struct formatter<Type, Char> : formatter<Base, Char> {            \
-    template <typename FormatContext>                               \
-    FMT_CONSTEXPR auto format(Type value, FormatContext& ctx) const \
-        -> decltype(ctx.out()) {                                    \
-      return formatter<Base, Char>::format(value, ctx);             \
-    }                                                               \
-  }
+#define FMT_FORMAT_AS(Type, Base) \
+  template <typename Char>        \
+  struct formatter<Type, Char> : formatter<Base, Char> {}
 
 FMT_FORMAT_AS(signed char, int);
 FMT_FORMAT_AS(unsigned char, unsigned);
@@ -3772,58 +4108,44 @@ FMT_FORMAT_AS(unsigned short, unsigned);
 FMT_FORMAT_AS(long, detail::long_type);
 FMT_FORMAT_AS(unsigned long, detail::ulong_type);
 FMT_FORMAT_AS(Char*, const Char*);
-FMT_FORMAT_AS(detail::std_string_view<Char>, basic_string_view<Char>);
+FMT_FORMAT_AS(std::basic_string<Char>, basic_string_view<Char>);
 FMT_FORMAT_AS(std::nullptr_t, const void*);
+FMT_FORMAT_AS(detail::std_string_view<Char>, basic_string_view<Char>);
 FMT_FORMAT_AS(void*, const void*);
 
 template <typename Char, size_t N>
 struct formatter<Char[N], Char> : formatter<basic_string_view<Char>, Char> {};
 
-template <typename Char, typename Traits, typename Allocator>
-class formatter<std::basic_string<Char, Traits, Allocator>, Char>
-    : public formatter<basic_string_view<Char>, Char> {};
-
-template <int N, typename Char>
-struct formatter<detail::bitint<N>, Char> : formatter<long long, Char> {};
-template <int N, typename Char>
-struct formatter<detail::ubitint<N>, Char>
-    : formatter<unsigned long long, Char> {};
-
-template <typename Char>
-struct formatter<detail::float128, Char>
-    : detail::native_formatter<detail::float128, Char,
-                               detail::type::float_type> {};
-
-template <typename T, typename Char>
-struct formatter<T, Char, void_t<detail::format_as_result<T>>>
-    : formatter<detail::format_as_result<T>, Char> {
-  template <typename FormatContext>
-  FMT_CONSTEXPR auto format(const T& value, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    auto&& val = format_as(value);  // Make an lvalue reference for format.
-    return formatter<detail::format_as_result<T>, Char>::format(val, ctx);
-  }
-};
-
 /**
- * Converts `p` to `const void*` for pointer formatting.
- *
- * **Example**:
- *
- *     auto s = fmt::format("{}", fmt::ptr(p));
+  \rst
+  Converts ``p`` to ``const void*`` for pointer formatting.
+
+  **Example**::
+
+    auto s = fmt::format("{}", fmt::ptr(p));
+  \endrst
  */
 template <typename T> auto ptr(T p) -> const void* {
   static_assert(std::is_pointer<T>::value, "");
   return detail::bit_cast<const void*>(p);
 }
+template <typename T, typename Deleter>
+auto ptr(const std::unique_ptr<T, Deleter>& p) -> const void* {
+  return p.get();
+}
+template <typename T> auto ptr(const std::shared_ptr<T>& p) -> const void* {
+  return p.get();
+}
 
 /**
- * Converts `e` to the underlying type.
- *
- * **Example**:
- *
- *     enum class color { red, green, blue };
- *     auto s = fmt::format("{}", fmt::underlying(color::red));  // s == "0"
+  \rst
+  Converts ``e`` to the underlying type.
+
+  **Example**::
+
+    enum class color { red, green, blue };
+    auto s = fmt::format("{}", fmt::underlying(color::red));
+  \endrst
  */
 template <typename Enum>
 constexpr auto underlying(Enum e) noexcept -> underlying_t<Enum> {
@@ -3837,22 +4159,13 @@ constexpr auto format_as(Enum e) noexcept -> underlying_t<Enum> {
 }
 }  // namespace enums
 
-#ifdef __cpp_lib_byte
-template <> struct formatter<std::byte> : formatter<unsigned> {
-  static auto format_as(std::byte b) -> unsigned char {
-    return static_cast<unsigned char>(b);
-  }
-  template <typename Context>
-  auto format(std::byte b, Context& ctx) const -> decltype(ctx.out()) {
-    return formatter<unsigned>::format(format_as(b), ctx);
-  }
-};
-#endif
+class bytes {
+ private:
+  string_view data_;
+  friend struct formatter<bytes>;
 
-struct bytes {
-  string_view data;
-
-  inline explicit bytes(string_view s) : data(s) {}
+ public:
+  explicit bytes(string_view data) : data_(data) {}
 };
 
 template <> struct formatter<bytes> {
@@ -3860,19 +4173,19 @@ template <> struct formatter<bytes> {
   detail::dynamic_format_specs<> specs_;
 
  public:
-  FMT_CONSTEXPR auto parse(parse_context<>& ctx) -> const char* {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const char* {
     return parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx,
                               detail::type::string_type);
   }
 
   template <typename FormatContext>
-  auto format(bytes b, FormatContext& ctx) const -> decltype(ctx.out()) {
-    auto specs = specs_;
-    detail::handle_dynamic_spec(specs.dynamic_width(), specs.width,
-                                specs.width_ref, ctx);
-    detail::handle_dynamic_spec(specs.dynamic_precision(), specs.precision,
-                                specs.precision_ref, ctx);
-    return detail::write_bytes<char>(ctx.out(), b.data, specs);
+  auto format(bytes b, FormatContext& ctx) -> decltype(ctx.out()) {
+    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
+                                                       specs_.width_ref, ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(
+        specs_.precision, specs_.precision_ref, ctx);
+    return detail::write_bytes(ctx.out(), b.data_, specs_);
   }
 };
 
@@ -3882,13 +4195,15 @@ template <typename T> struct group_digits_view {
 };
 
 /**
- * Returns a view that formats an integer value using ',' as a
- * locale-independent thousands separator.
- *
- * **Example**:
- *
- *     fmt::print("{}", fmt::group_digits(12345));
- *     // Output: "12,345"
+  \rst
+  Returns a view that formats an integer value using ',' as a locale-independent
+  thousands separator.
+
+  **Example**::
+
+    fmt::print("{}", fmt::group_digits(12345));
+    // Output: "12,345"
+  \endrst
  */
 template <typename T> auto group_digits(T value) -> group_digits_view<T> {
   return {value};
@@ -3899,255 +4214,253 @@ template <typename T> struct formatter<group_digits_view<T>> : formatter<T> {
   detail::dynamic_format_specs<> specs_;
 
  public:
-  FMT_CONSTEXPR auto parse(parse_context<>& ctx) -> const char* {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const char* {
     return parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx,
                               detail::type::int_type);
   }
 
   template <typename FormatContext>
-  auto format(group_digits_view<T> view, FormatContext& ctx) const
+  auto format(group_digits_view<T> t, FormatContext& ctx)
       -> decltype(ctx.out()) {
-    auto specs = specs_;
-    detail::handle_dynamic_spec(specs.dynamic_width(), specs.width,
-                                specs.width_ref, ctx);
-    detail::handle_dynamic_spec(specs.dynamic_precision(), specs.precision,
-                                specs.precision_ref, ctx);
-    auto arg = detail::make_write_int_arg(view.value, specs.sign());
+    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
+                                                       specs_.width_ref, ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(
+        specs_.precision, specs_.precision_ref, ctx);
     return detail::write_int(
-        ctx.out(), static_cast<detail::uint64_or_128_t<T>>(arg.abs_value),
-        arg.prefix, specs, detail::digit_grouping<char>("\3", ","));
+        ctx.out(), static_cast<detail::uint64_or_128_t<T>>(t.value), 0, specs_,
+        detail::digit_grouping<char>("\3", ","));
   }
 };
 
-template <typename T, typename Char> struct nested_view {
-  const formatter<T, Char>* fmt;
+template <typename T> struct nested_view {
+  const formatter<T>* fmt;
   const T* value;
 };
 
-template <typename T, typename Char>
-struct formatter<nested_view<T, Char>, Char> {
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
+template <typename T> struct formatter<nested_view<T>> {
+  FMT_CONSTEXPR auto parse(format_parse_context& ctx) -> const char* {
     return ctx.begin();
   }
-  template <typename FormatContext>
-  auto format(nested_view<T, Char> view, FormatContext& ctx) const
+  auto format(nested_view<T> view, format_context& ctx) const
       -> decltype(ctx.out()) {
     return view.fmt->format(*view.value, ctx);
   }
 };
 
-template <typename T, typename Char = char> struct nested_formatter {
+template <typename T> struct nested_formatter {
  private:
-  basic_specs specs_;
   int width_;
-  formatter<T, Char> formatter_;
+  detail::fill_t<char> fill_;
+  align_t align_ : 4;
+  formatter<T> formatter_;
 
  public:
-  constexpr nested_formatter() : width_(0) {}
+  constexpr nested_formatter() : width_(0), align_(align_t::none) {}
 
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    auto it = ctx.begin(), end = ctx.end();
-    if (it == end) return it;
-    auto specs = format_specs();
-    it = detail::parse_align(it, end, specs);
-    specs_ = specs;
-    Char c = *it;
-    auto width_ref = detail::arg_ref<Char>();
-    if ((c >= '0' && c <= '9') || c == '{') {
-      it = detail::parse_width(it, end, specs, width_ref, ctx);
-      width_ = specs.width;
-    }
+  FMT_CONSTEXPR auto parse(format_parse_context& ctx) -> const char* {
+    auto specs = detail::dynamic_format_specs<char>();
+    auto it = parse_format_specs(ctx.begin(), ctx.end(), specs, ctx,
+                                 detail::type::none_type);
+    width_ = specs.width;
+    fill_ = specs.fill;
+    align_ = specs.align;
     ctx.advance_to(it);
     return formatter_.parse(ctx);
   }
 
-  template <typename FormatContext, typename F>
-  auto write_padded(FormatContext& ctx, F write) const -> decltype(ctx.out()) {
+  template <typename F>
+  auto write_padded(format_context& ctx, F write) const -> decltype(ctx.out()) {
     if (width_ == 0) return write(ctx.out());
-    auto buf = basic_memory_buffer<Char>();
-    write(basic_appender<Char>(buf));
-    auto specs = format_specs();
+    auto buf = memory_buffer();
+    write(std::back_inserter(buf));
+    auto specs = format_specs<>();
     specs.width = width_;
-    specs.copy_fill_from(specs_);
-    specs.set_align(specs_.align());
-    return detail::write<Char>(
-        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs);
+    specs.fill = fill_;
+    specs.align = align_;
+    return detail::write(ctx.out(), string_view(buf.data(), buf.size()), specs);
   }
 
-  auto nested(const T& value) const -> nested_view<T, Char> {
-    return nested_view<T, Char>{&formatter_, &value};
+  auto nested(const T& value) const -> nested_view<T> {
+    return nested_view<T>{&formatter_, &value};
   }
 };
 
-inline namespace literals {
-#if FMT_USE_NONTYPE_TEMPLATE_ARGS
-template <detail::fixed_string S> constexpr auto operator""_a() {
-  using char_t = remove_cvref_t<decltype(*S.data)>;
-  return detail::udl_arg<char_t, sizeof(S.data) / sizeof(char_t), S>();
-}
-#else
 /**
- * User-defined literal equivalent of `fmt::arg`.
- *
- * **Example**:
- *
- *     using namespace fmt::literals;
- *     fmt::print("The answer is {answer}.", "answer"_a=42);
+  \rst
+  Converts *value* to ``std::string`` using the default format for type *T*.
+
+  **Example**::
+
+    #include <fmt/format.h>
+
+    std::string answer = fmt::to_string(42);
+  \endrst
  */
+template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value &&
+                                    !detail::has_format_as<T>::value)>
+inline auto to_string(const T& value) -> std::string {
+  auto buffer = memory_buffer();
+  detail::write<char>(appender(buffer), value);
+  return {buffer.data(), buffer.size()};
+}
+
+template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
+FMT_NODISCARD inline auto to_string(T value) -> std::string {
+  // The buffer should be large enough to store the number including the sign
+  // or "false" for bool.
+  constexpr int max_size = detail::digits10<T>() + 2;
+  char buffer[max_size > 5 ? static_cast<unsigned>(max_size) : 5];
+  char* begin = buffer;
+  return std::string(begin, detail::write<char>(begin, value));
+}
+
+template <typename Char, size_t SIZE>
+FMT_NODISCARD auto to_string(const basic_memory_buffer<Char, SIZE>& buf)
+    -> std::basic_string<Char> {
+  auto size = buf.size();
+  detail::assume(size < std::basic_string<Char>().max_size());
+  return std::basic_string<Char>(buf.data(), size);
+}
+
+template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value &&
+                                    detail::has_format_as<T>::value)>
+inline auto to_string(const T& value) -> std::string {
+  return to_string(format_as(value));
+}
+
+FMT_END_EXPORT
+
+namespace detail {
+
+template <typename Char>
+void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
+                typename vformat_args<Char>::type args, locale_ref loc) {
+  auto out = buffer_appender<Char>(buf);
+  if (fmt.size() == 2 && equal2(fmt.data(), "{}")) {
+    auto arg = args.get(0);
+    if (!arg) throw_format_error("argument not found");
+    arg.visit(default_arg_formatter<Char>{out, args, loc});
+    return;
+  }
+
+  struct format_handler {
+    basic_format_parse_context<Char> parse_context;
+    buffer_context<Char> context;
+
+    format_handler(buffer_appender<Char> p_out, basic_string_view<Char> str,
+                   basic_format_args<buffer_context<Char>> p_args,
+                   locale_ref p_loc)
+        : parse_context(str), context(p_out, p_args, p_loc) {}
+
+    void on_text(const Char* begin, const Char* end) {
+      auto text = basic_string_view<Char>(begin, to_unsigned(end - begin));
+      context.advance_to(write<Char>(context.out(), text));
+    }
+
+    FMT_CONSTEXPR auto on_arg_id() -> int {
+      return parse_context.next_arg_id();
+    }
+    FMT_CONSTEXPR auto on_arg_id(int id) -> int {
+      return parse_context.check_arg_id(id), id;
+    }
+    FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
+      int arg_id = context.arg_id(id);
+      if (arg_id < 0) throw_format_error("argument not found");
+      return arg_id;
+    }
+
+    FMT_INLINE void on_replacement_field(int id, const Char*) {
+      auto arg = get_arg(context, id);
+      context.advance_to(arg.visit(default_arg_formatter<Char>{
+          context.out(), context.args(), context.locale()}));
+    }
+
+    auto on_format_specs(int id, const Char* begin, const Char* end)
+        -> const Char* {
+      auto arg = get_arg(context, id);
+      // Not using a visitor for custom types gives better codegen.
+      if (arg.format_custom(begin, parse_context, context))
+        return parse_context.begin();
+      auto specs = detail::dynamic_format_specs<Char>();
+      begin = parse_format_specs(begin, end, specs, parse_context, arg.type());
+      detail::handle_dynamic_spec<detail::width_checker>(
+          specs.width, specs.width_ref, context);
+      detail::handle_dynamic_spec<detail::precision_checker>(
+          specs.precision, specs.precision_ref, context);
+      if (begin == end || *begin != '}')
+        throw_format_error("missing '}' in format string");
+      context.advance_to(arg.visit(
+          arg_formatter<Char>{context.out(), specs, context.locale()}));
+      return begin;
+    }
+
+    void on_error(const char* message) { throw_format_error(message); }
+  };
+  detail::parse_format_string<false>(fmt, format_handler(out, fmt, args, loc));
+}
+
+FMT_BEGIN_EXPORT
+
+#ifndef FMT_HEADER_ONLY
+extern template FMT_API void vformat_to(buffer<char>&, string_view,
+                                        typename vformat_args<>::type,
+                                        locale_ref);
+extern template FMT_API auto thousands_sep_impl<char>(locale_ref)
+    -> thousands_sep_result<char>;
+extern template FMT_API auto thousands_sep_impl<wchar_t>(locale_ref)
+    -> thousands_sep_result<wchar_t>;
+extern template FMT_API auto decimal_point_impl(locale_ref) -> char;
+extern template FMT_API auto decimal_point_impl(locale_ref) -> wchar_t;
+#endif  // FMT_HEADER_ONLY
+
+}  // namespace detail
+
+#if FMT_USE_USER_DEFINED_LITERALS
+inline namespace literals {
+/**
+  \rst
+  User-defined literal equivalent of :func:`fmt::arg`.
+
+  **Example**::
+
+    using namespace fmt::literals;
+    fmt::print("Elapsed time: {s:.2f} seconds", "s"_a=1.23);
+  \endrst
+ */
+#  if FMT_USE_NONTYPE_TEMPLATE_ARGS
+template <detail_exported::fixed_string Str> constexpr auto operator""_a() {
+  using char_t = remove_cvref_t<decltype(Str.data[0])>;
+  return detail::udl_arg<char_t, sizeof(Str.data) / sizeof(char_t), Str>();
+}
+#  else
 constexpr auto operator""_a(const char* s, size_t) -> detail::udl_arg<char> {
   return {s};
 }
-#endif  // FMT_USE_NONTYPE_TEMPLATE_ARGS
+#  endif
 }  // namespace literals
-
-/// A fast integer formatter.
-class format_int {
- private:
-  // Buffer should be large enough to hold all digits (digits10 + 1),
-  // a sign and a null character.
-  enum { buffer_size = std::numeric_limits<unsigned long long>::digits10 + 3 };
-  mutable char buffer_[buffer_size];
-  char* str_;
-
-  template <typename UInt>
-  FMT_CONSTEXPR20 auto format_unsigned(UInt value) -> char* {
-    auto n = static_cast<detail::uint32_or_64_or_128_t<UInt>>(value);
-    return detail::do_format_decimal(buffer_, n, buffer_size - 1);
-  }
-
-  template <typename Int>
-  FMT_CONSTEXPR20 auto format_signed(Int value) -> char* {
-    auto abs_value = static_cast<detail::uint32_or_64_or_128_t<Int>>(value);
-    bool negative = value < 0;
-    if (negative) abs_value = 0 - abs_value;
-    auto begin = format_unsigned(abs_value);
-    if (negative) *--begin = '-';
-    return begin;
-  }
-
- public:
-  FMT_CONSTEXPR20 explicit format_int(int value) : str_(format_signed(value)) {}
-  FMT_CONSTEXPR20 explicit format_int(long value)
-      : str_(format_signed(value)) {}
-  FMT_CONSTEXPR20 explicit format_int(long long value)
-      : str_(format_signed(value)) {}
-  FMT_CONSTEXPR20 explicit format_int(unsigned value)
-      : str_(format_unsigned(value)) {}
-  FMT_CONSTEXPR20 explicit format_int(unsigned long value)
-      : str_(format_unsigned(value)) {}
-  FMT_CONSTEXPR20 explicit format_int(unsigned long long value)
-      : str_(format_unsigned(value)) {}
-
-  /// Returns the number of characters written to the output buffer.
-  FMT_CONSTEXPR20 auto size() const -> size_t {
-    return detail::to_unsigned(buffer_ - str_ + buffer_size - 1);
-  }
-
-  /// Returns a pointer to the output buffer content. No terminating null
-  /// character is appended.
-  FMT_CONSTEXPR20 auto data() const -> const char* { return str_; }
-
-  /// Returns a pointer to the output buffer content with terminating null
-  /// character appended.
-  FMT_CONSTEXPR20 auto c_str() const -> const char* {
-    buffer_[buffer_size - 1] = '\0';
-    return str_;
-  }
-
-  /// Returns the content of the output buffer as an `std::string`.
-  inline auto str() const -> std::string { return {str_, size()}; }
-};
-
-#define FMT_STRING_IMPL(s, base)                                              \
-  [] {                                                                        \
-    /* Use the hidden visibility as a workaround for a GCC bug (#1973). */    \
-    /* Use a macro-like name to avoid shadowing warnings. */                  \
-    struct FMT_VISIBILITY("hidden") FMT_COMPILE_STRING : base {               \
-      using char_type = fmt::remove_cvref_t<decltype(s[0])>;                  \
-      constexpr explicit operator fmt::basic_string_view<char_type>() const { \
-        return fmt::detail::compile_string_to_view<char_type>(s);             \
-      }                                                                       \
-    };                                                                        \
-    using FMT_STRING_VIEW =                                                   \
-        fmt::basic_string_view<typename FMT_COMPILE_STRING::char_type>;       \
-    fmt::detail::ignore_unused(FMT_STRING_VIEW(FMT_COMPILE_STRING()));        \
-    return FMT_COMPILE_STRING();                                              \
-  }()
-
-/**
- * Constructs a legacy compile-time format string from a string literal `s`.
- *
- * **Example**:
- *
- *     // A compile-time error because 'd' is an invalid specifier for strings.
- *     std::string s = fmt::format(FMT_STRING("{:d}"), "foo");
- */
-#define FMT_STRING(s) FMT_STRING_IMPL(s, fmt::detail::compile_string)
-
-FMT_API auto vsystem_error(int error_code, string_view fmt, format_args args)
-    -> std::system_error;
-
-/**
- * Constructs `std::system_error` with a message formatted with
- * `fmt::format(fmt, args...)`.
- * `error_code` is a system error code as given by `errno`.
- *
- * **Example**:
- *
- *     // This throws std::system_error with the description
- *     //   cannot open file 'madeup': No such file or directory
- *     // or similar (system message may vary).
- *     const char* filename = "madeup";
- *     FILE* file = fopen(filename, "r");
- *     if (!file)
- *       throw fmt::system_error(errno, "cannot open file '{}'", filename);
- */
-template <typename... T>
-auto system_error(int error_code, format_string<T...> fmt, T&&... args)
-    -> std::system_error {
-  return vsystem_error(error_code, fmt.str, vargs<T...>{{args...}});
-}
-
-/**
- * Formats an error message for an error returned by an operating system or a
- * language runtime, for example a file opening error, and writes it to `out`.
- * The format is the same as the one used by `std::system_error(ec, message)`
- * where `ec` is `std::error_code(error_code, std::generic_category())`.
- * It is implementation-defined but normally looks like:
- *
- *     <message>: <system-message>
- *
- * where `<message>` is the passed message and `<system-message>` is the system
- * message corresponding to the error code.
- * `error_code` is a system error code as given by `errno`.
- */
-FMT_API void format_system_error(detail::buffer<char>& out, int error_code,
-                                 const char* message) noexcept;
-
-// Reports a system error without throwing an exception.
-// Can be used to report errors from destructors.
-FMT_API void report_system_error(int error_code, const char* message) noexcept;
+#endif  // FMT_USE_USER_DEFINED_LITERALS
 
 template <typename Locale, FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
 inline auto vformat(const Locale& loc, string_view fmt, format_args args)
     -> std::string {
-  auto buf = memory_buffer();
-  detail::vformat_to(buf, fmt, args, detail::locale_ref(loc));
-  return {buf.data(), buf.size()};
+  return detail::vformat(loc, fmt, args);
 }
 
 template <typename Locale, typename... T,
           FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
-FMT_INLINE auto format(const Locale& loc, format_string<T...> fmt, T&&... args)
+inline auto format(const Locale& loc, format_string<T...> fmt, T&&... args)
     -> std::string {
-  return vformat(loc, fmt.str, vargs<T...>{{args...}});
+  return fmt::vformat(loc, string_view(fmt), fmt::make_format_args(args...));
 }
 
 template <typename OutputIt, typename Locale,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value&&
+                            detail::is_locale<Locale>::value)>
 auto vformat_to(OutputIt out, const Locale& loc, string_view fmt,
                 format_args args) -> OutputIt {
-  auto&& buf = detail::get_buffer<char>(out);
+  using detail::get_buffer;
+  auto&& buf = get_buffer<char>(out);
   detail::vformat_to(buf, fmt, args, detail::locale_ref(loc));
   return detail::get_iterator(buf, out);
 }
@@ -4157,7 +4470,7 @@ template <typename OutputIt, typename Locale, typename... T,
                             detail::is_locale<Locale>::value)>
 FMT_INLINE auto format_to(OutputIt out, const Locale& loc,
                           format_string<T...> fmt, T&&... args) -> OutputIt {
-  return fmt::vformat_to(out, loc, fmt.str, vargs<T...>{{args...}});
+  return vformat_to(out, loc, fmt, fmt::make_format_args(args...));
 }
 
 template <typename Locale, typename... T,
@@ -4166,67 +4479,40 @@ FMT_NODISCARD FMT_INLINE auto formatted_size(const Locale& loc,
                                              format_string<T...> fmt,
                                              T&&... args) -> size_t {
   auto buf = detail::counting_buffer<>();
-  detail::vformat_to(buf, fmt.str, vargs<T...>{{args...}},
-                     detail::locale_ref(loc));
+  detail::vformat_to<char>(buf, fmt, fmt::make_format_args(args...),
+                           detail::locale_ref(loc));
   return buf.count();
 }
 
-FMT_API auto vformat(string_view fmt, format_args args) -> std::string;
-
-/**
- * Formats `args` according to specifications in `fmt` and returns the result
- * as a string.
- *
- * **Example**:
- *
- *     #include <fmt/format.h>
- *     std::string message = fmt::format("The answer is {}.", 42);
- */
-template <typename... T>
-FMT_NODISCARD FMT_INLINE auto format(format_string<T...> fmt, T&&... args)
-    -> std::string {
-  return vformat(fmt.str, vargs<T...>{{args...}});
-}
-
-/**
- * Converts `value` to `std::string` using the default format for type `T`.
- *
- * **Example**:
- *
- *     std::string answer = fmt::to_string(42);
- */
-template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-FMT_NODISCARD auto to_string(T value) -> std::string {
-  // The buffer should be large enough to store the number including the sign
-  // or "false" for bool.
-  char buffer[max_of(detail::digits10<T>() + 2, 5)];
-  return {buffer, detail::write<char>(buffer, value)};
-}
-
-template <typename T, FMT_ENABLE_IF(detail::use_format_as<T>::value)>
-FMT_NODISCARD auto to_string(const T& value) -> std::string {
-  return to_string(format_as(value));
-}
-
-template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value &&
-                                    !detail::use_format_as<T>::value)>
-FMT_NODISCARD auto to_string(const T& value) -> std::string {
-  auto buffer = memory_buffer();
-  detail::write<char>(appender(buffer), value);
-  return {buffer.data(), buffer.size()};
-}
-
 FMT_END_EXPORT
+
+template <typename T, typename Char>
+template <typename FormatContext>
+FMT_CONSTEXPR FMT_INLINE auto
+formatter<T, Char,
+          enable_if_t<detail::type_constant<T, Char>::value !=
+                      detail::type::custom_type>>::format(const T& val,
+                                                          FormatContext& ctx)
+    const -> decltype(ctx.out()) {
+  if (specs_.width_ref.kind == detail::arg_id_kind::none &&
+      specs_.precision_ref.kind == detail::arg_id_kind::none) {
+    return detail::write<Char>(ctx.out(), val, specs_, ctx.locale());
+  }
+  auto specs = specs_;
+  detail::handle_dynamic_spec<detail::width_checker>(specs.width,
+                                                     specs.width_ref, ctx);
+  detail::handle_dynamic_spec<detail::precision_checker>(
+      specs.precision, specs.precision_ref, ctx);
+  return detail::write<Char>(ctx.out(), val, specs, ctx.locale());
+}
+
 FMT_END_NAMESPACE
 
 #ifdef FMT_HEADER_ONLY
 #  define FMT_FUNC inline
 #  include "format-inl.h"
-#endif
-
-// Restore _LIBCPP_REMOVE_TRANSITIVE_INCLUDES.
-#ifdef FMT_REMOVE_TRANSITIVE_INCLUDES
-#  undef _LIBCPP_REMOVE_TRANSITIVE_INCLUDES
+#else
+#  define FMT_FUNC
 #endif
 
 #endif  // FMT_FORMAT_H_
diff --git a/src/fmt/os.h b/src/fmt/os.h
index b2cc5e4b85..6009ccc112 100644
--- a/src/fmt/os.h
+++ b/src/fmt/os.h
@@ -8,18 +8,18 @@
 #ifndef FMT_OS_H_
 #define FMT_OS_H_
 
+#include <cerrno>
+#include <cstddef>
+#include <cstdio>
+#include <system_error>  // std::system_error
+
 #include "format.h"
 
-#ifndef FMT_MODULE
-#  include <cerrno>
-#  include <cstddef>
-#  include <cstdio>
-#  include <system_error>  // std::system_error
-
+#if defined __APPLE__ || defined(__FreeBSD__)
 #  if FMT_HAS_INCLUDE(<xlocale.h>)
-#    include <xlocale.h>  // LC_NUMERIC_MASK on macOS
+#    include <xlocale.h>  // for LC_NUMERIC_MASK on OS X
 #  endif
-#endif  // FMT_MODULE
+#endif
 
 #ifndef FMT_USE_FCNTL
 // UWP doesn't provide _pipe.
@@ -77,33 +77,46 @@ FMT_BEGIN_NAMESPACE
 FMT_BEGIN_EXPORT
 
 /**
- * A reference to a null-terminated string. It can be constructed from a C
- * string or `std::string`.
- *
- * You can use one of the following type aliases for common character types:
- *
- * +---------------+-----------------------------+
- * | Type          | Definition                  |
- * +===============+=============================+
- * | cstring_view  | basic_cstring_view<char>    |
- * +---------------+-----------------------------+
- * | wcstring_view | basic_cstring_view<wchar_t> |
- * +---------------+-----------------------------+
- *
- * This class is most useful as a parameter type for functions that wrap C APIs.
+  \rst
+  A reference to a null-terminated string. It can be constructed from a C
+  string or ``std::string``.
+
+  You can use one of the following type aliases for common character types:
+
+  +---------------+-----------------------------+
+  | Type          | Definition                  |
+  +===============+=============================+
+  | cstring_view  | basic_cstring_view<char>    |
+  +---------------+-----------------------------+
+  | wcstring_view | basic_cstring_view<wchar_t> |
+  +---------------+-----------------------------+
+
+  This class is most useful as a parameter type to allow passing
+  different types of strings to a function, for example::
+
+    template <typename... Args>
+    std::string format(cstring_view format_str, const Args & ... args);
+
+    format("{}", 42);
+    format(std::string("{}"), 42);
+  \endrst
  */
 template <typename Char> class basic_cstring_view {
  private:
   const Char* data_;
 
  public:
-  /// Constructs a string reference object from a C string.
+  /** Constructs a string reference object from a C string. */
   basic_cstring_view(const Char* s) : data_(s) {}
 
-  /// Constructs a string reference from an `std::string` object.
+  /**
+    \rst
+    Constructs a string reference from an ``std::string`` object.
+    \endrst
+   */
   basic_cstring_view(const std::basic_string<Char>& s) : data_(s.c_str()) {}
 
-  /// Returns the pointer to a C string.
+  /** Returns the pointer to a C string. */
   auto c_str() const -> const Char* { return data_; }
 };
 
@@ -118,38 +131,41 @@ FMT_API void format_windows_error(buffer<char>& out, int error_code,
                                   const char* message) noexcept;
 }
 
-FMT_API std::system_error vwindows_error(int error_code, string_view fmt,
+FMT_API std::system_error vwindows_error(int error_code, string_view format_str,
                                          format_args args);
 
 /**
- * Constructs a `std::system_error` object with the description of the form
- *
- *     <message>: <system-message>
- *
- * where `<message>` is the formatted message and `<system-message>` is the
- * system message corresponding to the error code.
- * `error_code` is a Windows error code as given by `GetLastError`.
- * If `error_code` is not a valid error code such as -1, the system message
- * will look like "error -1".
- *
- * **Example**:
- *
- *     // This throws a system_error with the description
- *     //   cannot open file 'madeup': The system cannot find the file
- * specified.
- *     // or similar (system message may vary).
- *     const char *filename = "madeup";
- *     LPOFSTRUCT of = LPOFSTRUCT();
- *     HFILE file = OpenFile(filename, &of, OF_READ);
- *     if (file == HFILE_ERROR) {
- *       throw fmt::windows_error(GetLastError(),
- *                                "cannot open file '{}'", filename);
- *     }
- */
-template <typename... T>
-auto windows_error(int error_code, string_view message, const T&... args)
-    -> std::system_error {
-  return vwindows_error(error_code, message, vargs<T...>{{args...}});
+ \rst
+ Constructs a :class:`std::system_error` object with the description
+ of the form
+
+ .. parsed-literal::
+   *<message>*: *<system-message>*
+
+ where *<message>* is the formatted message and *<system-message>* is the
+ system message corresponding to the error code.
+ *error_code* is a Windows error code as given by ``GetLastError``.
+ If *error_code* is not a valid error code such as -1, the system message
+ will look like "error -1".
+
+ **Example**::
+
+   // This throws a system_error with the description
+   //   cannot open file 'madeup': The system cannot find the file specified.
+   // or similar (system message may vary).
+   const char *filename = "madeup";
+   LPOFSTRUCT of = LPOFSTRUCT();
+   HFILE file = OpenFile(filename, &of, OF_READ);
+   if (file == HFILE_ERROR) {
+     throw fmt::windows_error(GetLastError(),
+                              "cannot open file '{}'", filename);
+   }
+ \endrst
+*/
+template <typename... Args>
+std::system_error windows_error(int error_code, string_view message,
+                                const Args&... args) {
+  return vwindows_error(error_code, message, fmt::make_format_args(args...));
 }
 
 // Reports a Windows error without throwing an exception.
@@ -164,8 +180,8 @@ inline auto system_category() noexcept -> const std::error_category& {
 // std::system is not available on some platforms such as iOS (#2248).
 #ifdef __OSX__
 template <typename S, typename... Args, typename Char = char_t<S>>
-void say(const S& fmt, Args&&... args) {
-  std::system(format("say \"{}\"", format(fmt, args...)).c_str());
+void say(const S& format_str, Args&&... args) {
+  std::system(format("say \"{}\"", format(format_str, args...)).c_str());
 }
 #endif
 
@@ -176,24 +192,24 @@ class buffered_file {
 
   friend class file;
 
-  inline explicit buffered_file(FILE* f) : file_(f) {}
+  explicit buffered_file(FILE* f) : file_(f) {}
 
  public:
   buffered_file(const buffered_file&) = delete;
   void operator=(const buffered_file&) = delete;
 
   // Constructs a buffered_file object which doesn't represent any file.
-  inline buffered_file() noexcept : file_(nullptr) {}
+  buffered_file() noexcept : file_(nullptr) {}
 
   // Destroys the object closing the file it represents if any.
   FMT_API ~buffered_file() noexcept;
 
  public:
-  inline buffered_file(buffered_file&& other) noexcept : file_(other.file_) {
+  buffered_file(buffered_file&& other) noexcept : file_(other.file_) {
     other.file_ = nullptr;
   }
 
-  inline auto operator=(buffered_file&& other) -> buffered_file& {
+  auto operator=(buffered_file&& other) -> buffered_file& {
     close();
     file_ = other.file_;
     other.file_ = nullptr;
@@ -207,15 +223,17 @@ class buffered_file {
   FMT_API void close();
 
   // Returns the pointer to a FILE object representing this file.
-  inline auto get() const noexcept -> FILE* { return file_; }
+  auto get() const noexcept -> FILE* { return file_; }
 
   FMT_API auto descriptor() const -> int;
 
-  template <typename... T>
-  inline void print(string_view fmt, const T&... args) {
-    fmt::vargs<T...> vargs = {{args...}};
-    detail::is_locking<T...>() ? fmt::vprint_buffered(file_, fmt, vargs)
-                               : fmt::vprint(file_, fmt, vargs);
+  void vprint(string_view format_str, format_args args) {
+    fmt::vprint(file_, format_str, args);
+  }
+
+  template <typename... Args>
+  inline void print(string_view format_str, const Args&... args) {
+    vprint(format_str, fmt::make_format_args(args...));
   }
 };
 
@@ -248,7 +266,7 @@ class FMT_API file {
   };
 
   // Constructs a file object which doesn't represent any file.
-  inline file() noexcept : fd_(-1) {}
+  file() noexcept : fd_(-1) {}
 
   // Opens a file and constructs a file object representing this file.
   file(cstring_view path, int oflag);
@@ -257,10 +275,10 @@ class FMT_API file {
   file(const file&) = delete;
   void operator=(const file&) = delete;
 
-  inline file(file&& other) noexcept : fd_(other.fd_) { other.fd_ = -1; }
+  file(file&& other) noexcept : fd_(other.fd_) { other.fd_ = -1; }
 
   // Move assignment is not noexcept because close may throw.
-  inline auto operator=(file&& other) -> file& {
+  auto operator=(file&& other) -> file& {
     close();
     fd_ = other.fd_;
     other.fd_ = -1;
@@ -271,7 +289,7 @@ class FMT_API file {
   ~file() noexcept;
 
   // Returns the file descriptor.
-  inline auto descriptor() const noexcept -> int { return fd_; }
+  auto descriptor() const noexcept -> int { return fd_; }
 
   // Closes the file.
   void close();
@@ -324,9 +342,9 @@ auto getpagesize() -> long;
 namespace detail {
 
 struct buffer_size {
-  constexpr buffer_size() = default;
+  buffer_size() = default;
   size_t value = 0;
-  FMT_CONSTEXPR auto operator=(size_t val) const -> buffer_size {
+  auto operator=(size_t val) const -> buffer_size {
     auto bs = buffer_size();
     bs.value = val;
     return bs;
@@ -337,7 +355,7 @@ struct ostream_params {
   int oflag = file::WRONLY | file::CREATE | file::TRUNC;
   size_t buffer_size = BUFSIZ > 32768 ? BUFSIZ : 32768;
 
-  constexpr ostream_params() {}
+  ostream_params() {}
 
   template <typename... T>
   ostream_params(T... params, int new_oflag) : ostream_params(params...) {
@@ -358,62 +376,80 @@ struct ostream_params {
 #  endif
 };
 
-}  // namespace detail
-
-FMT_INLINE_VARIABLE constexpr auto buffer_size = detail::buffer_size();
-
-/// A fast buffered output stream for writing from a single thread. Writing from
-/// multiple threads without external synchronization may result in a data race.
-class FMT_API ostream : private detail::buffer<char> {
+class file_buffer final : public buffer<char> {
  private:
   file file_;
 
-  ostream(cstring_view path, const detail::ostream_params& params);
-
-  static void grow(buffer<char>& buf, size_t);
+  FMT_API static void grow(buffer<char>& buf, size_t);
 
  public:
-  ostream(ostream&& other) noexcept;
-  ~ostream();
+  FMT_API file_buffer(cstring_view path, const ostream_params& params);
+  FMT_API file_buffer(file_buffer&& other);
+  FMT_API ~file_buffer();
 
-  operator writer() {
-    detail::buffer<char>& buf = *this;
-    return buf;
-  }
-
-  inline void flush() {
+  void flush() {
     if (size() == 0) return;
     file_.write(data(), size() * sizeof(data()[0]));
     clear();
   }
 
-  template <typename... T>
-  friend auto output_file(cstring_view path, T... params) -> ostream;
-
-  inline void close() {
+  void close() {
     flush();
     file_.close();
   }
+};
 
-  /// Formats `args` according to specifications in `fmt` and writes the
-  /// output to the file.
+}  // namespace detail
+
+// Added {} below to work around default constructor error known to
+// occur in Xcode versions 7.2.1 and 8.2.1.
+constexpr detail::buffer_size buffer_size{};
+
+/** A fast output stream which is not thread-safe. */
+class FMT_API ostream {
+ private:
+  FMT_MSC_WARNING(suppress : 4251)
+  detail::file_buffer buffer_;
+
+  ostream(cstring_view path, const detail::ostream_params& params)
+      : buffer_(path, params) {}
+
+ public:
+  ostream(ostream&& other) : buffer_(std::move(other.buffer_)) {}
+
+  ~ostream();
+
+  void flush() { buffer_.flush(); }
+
+  template <typename... T>
+  friend auto output_file(cstring_view path, T... params) -> ostream;
+
+  void close() { buffer_.close(); }
+
+  /**
+    Formats ``args`` according to specifications in ``fmt`` and writes the
+    output to the file.
+   */
   template <typename... T> void print(format_string<T...> fmt, T&&... args) {
-    vformat_to(appender(*this), fmt.str, vargs<T...>{{args...}});
+    vformat_to(std::back_inserter(buffer_), fmt,
+               fmt::make_format_args(args...));
   }
 };
 
 /**
- * Opens a file for writing. Supported parameters passed in `params`:
- *
- * - `<integer>`: Flags passed to [open](
- *   https://pubs.opengroup.org/onlinepubs/007904875/functions/open.html)
- *   (`file::WRONLY | file::CREATE | file::TRUNC` by default)
- * - `buffer_size=<integer>`: Output buffer size
- *
- * **Example**:
- *
- *     auto out = fmt::output_file("guide.txt");
- *     out.print("Don't {}", "Panic");
+  \rst
+  Opens a file for writing. Supported parameters passed in *params*:
+
+  * ``<integer>``: Flags passed to `open
+    <https://pubs.opengroup.org/onlinepubs/007904875/functions/open.html>`_
+    (``file::WRONLY | file::CREATE | file::TRUNC`` by default)
+  * ``buffer_size=<integer>``: Output buffer size
+
+  **Example**::
+
+    auto out = fmt::output_file("guide.txt");
+    out.print("Don't {}", "Panic");
+  \endrst
  */
 template <typename... T>
 inline auto output_file(cstring_view path, T... params) -> ostream {
diff --git a/src/fmt/ostream.h b/src/fmt/ostream.h
index 5d893c9216..26fb3b5ac0 100644
--- a/src/fmt/ostream.h
+++ b/src/fmt/ostream.h
@@ -8,9 +8,7 @@
 #ifndef FMT_OSTREAM_H_
 #define FMT_OSTREAM_H_
 
-#ifndef FMT_MODULE
-#  include <fstream>  // std::filebuf
-#endif
+#include <fstream>  // std::filebuf
 
 #ifdef _WIN32
 #  ifdef __GLIBCXX__
@@ -20,19 +18,42 @@
 #  include <io.h>
 #endif
 
-#include "chrono.h"  // formatbuf
-
-#ifdef _MSVC_STL_UPDATE
-#  define FMT_MSVC_STL_UPDATE _MSVC_STL_UPDATE
-#elif defined(_MSC_VER) && _MSC_VER < 1912  // VS 15.5
-#  define FMT_MSVC_STL_UPDATE _MSVC_LANG
-#else
-#  define FMT_MSVC_STL_UPDATE 0
-#endif
+#include "format.h"
 
 FMT_BEGIN_NAMESPACE
 namespace detail {
 
+template <typename Streambuf> class formatbuf : public Streambuf {
+ private:
+  using char_type = typename Streambuf::char_type;
+  using streamsize = decltype(std::declval<Streambuf>().sputn(nullptr, 0));
+  using int_type = typename Streambuf::int_type;
+  using traits_type = typename Streambuf::traits_type;
+
+  buffer<char_type>& buffer_;
+
+ public:
+  explicit formatbuf(buffer<char_type>& buf) : buffer_(buf) {}
+
+ protected:
+  // The put area is always empty. This makes the implementation simpler and has
+  // the advantage that the streambuf and the buffer are always in sync and
+  // sputc never writes into uninitialized memory. A disadvantage is that each
+  // call to sputc always results in a (virtual) call to overflow. There is no
+  // disadvantage here for sputn since this always results in a call to xsputn.
+
+  auto overflow(int_type ch) -> int_type override {
+    if (!traits_type::eq_int_type(ch, traits_type::eof()))
+      buffer_.push_back(static_cast<char_type>(ch));
+    return ch;
+  }
+
+  auto xsputn(const char_type* s, streamsize count) -> streamsize override {
+    buffer_.append(s, s + count);
+    return count;
+  }
+};
+
 // Generate a unique explicit instantion in every translation unit using a tag
 // type in an anonymous namespace.
 namespace {
@@ -43,18 +64,53 @@ class file_access {
   friend auto get_file(BufType& obj) -> FILE* { return obj.*FileMemberPtr; }
 };
 
-#if FMT_MSVC_STL_UPDATE
+#if FMT_MSC_VERSION
 template class file_access<file_access_tag, std::filebuf,
                            &std::filebuf::_Myfile>;
 auto get_file(std::filebuf&) -> FILE*;
 #endif
 
+inline auto write_ostream_unicode(std::ostream& os, fmt::string_view data)
+    -> bool {
+  FILE* f = nullptr;
+#if FMT_MSC_VERSION
+  if (auto* buf = dynamic_cast<std::filebuf*>(os.rdbuf()))
+    f = get_file(*buf);
+  else
+    return false;
+#elif defined(_WIN32) && defined(__GLIBCXX__)
+  auto* rdbuf = os.rdbuf();
+  if (auto* sfbuf = dynamic_cast<__gnu_cxx::stdio_sync_filebuf<char>*>(rdbuf))
+    f = sfbuf->file();
+  else if (auto* fbuf = dynamic_cast<__gnu_cxx::stdio_filebuf<char>*>(rdbuf))
+    f = fbuf->file();
+  else
+    return false;
+#else
+  ignore_unused(os, data, f);
+#endif
+#ifdef _WIN32
+  if (f) {
+    int fd = _fileno(f);
+    if (_isatty(fd)) {
+      os.flush();
+      return write_console(fd, data);
+    }
+  }
+#endif
+  return false;
+}
+inline auto write_ostream_unicode(std::wostream&,
+                                  fmt::basic_string_view<wchar_t>) -> bool {
+  return false;
+}
+
 // Write the content of buf to os.
 // It is a separate function rather than a part of vprint to simplify testing.
 template <typename Char>
 void write_buffer(std::basic_ostream<Char>& os, buffer<Char>& buf) {
   const Char* buf_data = buf.data();
-  using unsigned_streamsize = make_unsigned_t<std::streamsize>;
+  using unsigned_streamsize = std::make_unsigned<std::streamsize>::type;
   unsigned_streamsize size = buf.size();
   unsigned_streamsize max_size = to_unsigned(max_value<std::streamsize>());
   do {
@@ -65,9 +121,21 @@ void write_buffer(std::basic_ostream<Char>& os, buffer<Char>& buf) {
   } while (size != 0);
 }
 
+template <typename Char, typename T>
+void format_value(buffer<Char>& buf, const T& value) {
+  auto&& format_buf = formatbuf<std::basic_streambuf<Char>>(buf);
+  auto&& output = std::basic_ostream<Char>(&format_buf);
+#if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
+  output.imbue(std::locale::classic());  // The default is always unlocalized.
+#endif
+  output << value;
+  output.exceptions(std::ios_base::failbit | std::ios_base::badbit);
+}
+
 template <typename T> struct streamed_view {
   const T& value;
 };
+
 }  // namespace detail
 
 // Formats an object of type T that has an overloaded ostream operator<<.
@@ -75,14 +143,11 @@ template <typename Char>
 struct basic_ostream_formatter : formatter<basic_string_view<Char>, Char> {
   void set_debug_format() = delete;
 
-  template <typename T, typename Context>
-  auto format(const T& value, Context& ctx) const -> decltype(ctx.out()) {
+  template <typename T, typename OutputIt>
+  auto format(const T& value, basic_format_context<OutputIt, Char>& ctx) const
+      -> OutputIt {
     auto buffer = basic_memory_buffer<Char>();
-    auto&& formatbuf = detail::formatbuf<std::basic_streambuf<Char>>(buffer);
-    auto&& output = std::basic_ostream<Char>(&formatbuf);
-    output.imbue(std::locale::classic());  // The default is always unlocalized.
-    output << value;
-    output.exceptions(std::ios_base::failbit | std::ios_base::badbit);
+    detail::format_value(buffer, value);
     return formatter<basic_string_view<Char>, Char>::format(
         {buffer.data(), buffer.size()}, ctx);
   }
@@ -93,67 +158,73 @@ using ostream_formatter = basic_ostream_formatter<char>;
 template <typename T, typename Char>
 struct formatter<detail::streamed_view<T>, Char>
     : basic_ostream_formatter<Char> {
-  template <typename Context>
-  auto format(detail::streamed_view<T> view, Context& ctx) const
-      -> decltype(ctx.out()) {
+  template <typename OutputIt>
+  auto format(detail::streamed_view<T> view,
+              basic_format_context<OutputIt, Char>& ctx) const -> OutputIt {
     return basic_ostream_formatter<Char>::format(view.value, ctx);
   }
 };
 
 /**
- * Returns a view that formats `value` via an ostream `operator<<`.
- *
- * **Example**:
- *
- *     fmt::print("Current thread id: {}\n",
- *                fmt::streamed(std::this_thread::get_id()));
+  \rst
+  Returns a view that formats `value` via an ostream ``operator<<``.
+
+  **Example**::
+
+    fmt::print("Current thread id: {}\n",
+               fmt::streamed(std::this_thread::get_id()));
+  \endrst
  */
 template <typename T>
 constexpr auto streamed(const T& value) -> detail::streamed_view<T> {
   return {value};
 }
 
-inline void vprint(std::ostream& os, string_view fmt, format_args args) {
+namespace detail {
+
+inline void vprint_directly(std::ostream& os, string_view format_str,
+                            format_args args) {
   auto buffer = memory_buffer();
-  detail::vformat_to(buffer, fmt, args);
-  FILE* f = nullptr;
-#if FMT_MSVC_STL_UPDATE && FMT_USE_RTTI
-  if (auto* buf = dynamic_cast<std::filebuf*>(os.rdbuf()))
-    f = detail::get_file(*buf);
-#elif defined(_WIN32) && defined(__GLIBCXX__) && FMT_USE_RTTI
-  auto* rdbuf = os.rdbuf();
-  if (auto* sfbuf = dynamic_cast<__gnu_cxx::stdio_sync_filebuf<char>*>(rdbuf))
-    f = sfbuf->file();
-  else if (auto* fbuf = dynamic_cast<__gnu_cxx::stdio_filebuf<char>*>(rdbuf))
-    f = fbuf->file();
-#endif
-#ifdef _WIN32
-  if (f) {
-    int fd = _fileno(f);
-    if (_isatty(fd)) {
-      os.flush();
-      if (detail::write_console(fd, {buffer.data(), buffer.size()})) return;
-    }
-  }
-#endif
-  detail::ignore_unused(f);
+  detail::vformat_to(buffer, format_str, args);
+  detail::write_buffer(os, buffer);
+}
+
+}  // namespace detail
+
+FMT_EXPORT template <typename Char>
+void vprint(std::basic_ostream<Char>& os,
+            basic_string_view<type_identity_t<Char>> format_str,
+            basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  auto buffer = basic_memory_buffer<Char>();
+  detail::vformat_to(buffer, format_str, args);
+  if (detail::write_ostream_unicode(os, {buffer.data(), buffer.size()})) return;
   detail::write_buffer(os, buffer);
 }
 
 /**
- * Prints formatted data to the stream `os`.
- *
- * **Example**:
- *
- *     fmt::print(cerr, "Don't {}!", "panic");
+  \rst
+  Prints formatted data to the stream *os*.
+
+  **Example**::
+
+    fmt::print(cerr, "Don't {}!", "panic");
+  \endrst
  */
 FMT_EXPORT template <typename... T>
 void print(std::ostream& os, format_string<T...> fmt, T&&... args) {
-  fmt::vargs<T...> vargs = {{args...}};
-  if (detail::use_utf8) return vprint(os, fmt.str, vargs);
-  auto buffer = memory_buffer();
-  detail::vformat_to(buffer, fmt.str, vargs);
-  detail::write_buffer(os, buffer);
+  const auto& vargs = fmt::make_format_args(args...);
+  if (detail::is_utf8())
+    vprint(os, fmt, vargs);
+  else
+    detail::vprint_directly(os, fmt, vargs);
+}
+
+FMT_EXPORT
+template <typename... Args>
+void print(std::wostream& os,
+           basic_format_string<wchar_t, type_identity_t<Args>...> fmt,
+           Args&&... args) {
+  vprint(os, fmt, fmt::make_format_args<buffer_context<wchar_t>>(args...));
 }
 
 FMT_EXPORT template <typename... T>
@@ -161,6 +232,14 @@ void println(std::ostream& os, format_string<T...> fmt, T&&... args) {
   fmt::print(os, "{}\n", fmt::format(fmt, std::forward<T>(args)...));
 }
 
+FMT_EXPORT
+template <typename... Args>
+void println(std::wostream& os,
+             basic_format_string<wchar_t, type_identity_t<Args>...> fmt,
+             Args&&... args) {
+  print(os, L"{}\n", fmt::format(fmt, std::forward<Args>(args)...));
+}
+
 FMT_END_NAMESPACE
 
 #endif  // FMT_OSTREAM_H_
diff --git a/src/fmt/printf.h b/src/fmt/printf.h
index e726840185..35445abce2 100644
--- a/src/fmt/printf.h
+++ b/src/fmt/printf.h
@@ -8,10 +8,8 @@
 #ifndef FMT_PRINTF_H_
 #define FMT_PRINTF_H_
 
-#ifndef FMT_MODULE
-#  include <algorithm>  // std::max
-#  include <limits>     // std::numeric_limits
-#endif
+#include <algorithm>  // std::max
+#include <limits>     // std::numeric_limits
 
 #include "format.h"
 
@@ -24,7 +22,7 @@ template <typename T> struct printf_formatter {
 
 template <typename Char> class basic_printf_context {
  private:
-  basic_appender<Char> out_;
+  detail::buffer_appender<Char> out_;
   basic_format_args<basic_printf_context> args_;
 
   static_assert(std::is_same<Char, char>::value ||
@@ -33,53 +31,41 @@ template <typename Char> class basic_printf_context {
 
  public:
   using char_type = Char;
-  using parse_context_type = parse_context<Char>;
+  using parse_context_type = basic_format_parse_context<Char>;
   template <typename T> using formatter_type = printf_formatter<T>;
-  enum { builtin_types = 1 };
 
-  /// Constructs a `printf_context` object. References to the arguments are
-  /// stored in the context object so make sure they have appropriate lifetimes.
-  basic_printf_context(basic_appender<Char> out,
+  /**
+    \rst
+    Constructs a ``printf_context`` object. References to the arguments are
+    stored in the context object so make sure they have appropriate lifetimes.
+    \endrst
+   */
+  basic_printf_context(detail::buffer_appender<Char> out,
                        basic_format_args<basic_printf_context> args)
       : out_(out), args_(args) {}
 
-  auto out() -> basic_appender<Char> { return out_; }
-  void advance_to(basic_appender<Char>) {}
+  auto out() -> detail::buffer_appender<Char> { return out_; }
+  void advance_to(detail::buffer_appender<Char>) {}
 
   auto locale() -> detail::locale_ref { return {}; }
 
   auto arg(int id) const -> basic_format_arg<basic_printf_context> {
     return args_.get(id);
   }
+
+  void on_error(const char* message) { throw_format_error(message); }
 };
 
 namespace detail {
 
-// Return the result via the out param to workaround gcc bug 77539.
-template <bool IS_CONSTEXPR, typename T, typename Ptr = const T*>
-FMT_CONSTEXPR auto find(Ptr first, Ptr last, T value, Ptr& out) -> bool {
-  for (out = first; out != last; ++out) {
-    if (*out == value) return true;
-  }
-  return false;
-}
-
-template <>
-inline auto find<false, char>(const char* first, const char* last, char value,
-                              const char*& out) -> bool {
-  out =
-      static_cast<const char*>(memchr(first, value, to_unsigned(last - first)));
-  return out != nullptr;
-}
-
 // Checks if a value fits in int - used to avoid warnings about comparing
 // signed and unsigned integers.
 template <bool IsSigned> struct int_checker {
   template <typename T> static auto fits_in_int(T value) -> bool {
-    unsigned max = to_unsigned(max_value<int>());
+    unsigned max = max_value<int>();
     return value <= max;
   }
-  inline static auto fits_in_int(bool) -> bool { return true; }
+  static auto fits_in_int(bool) -> bool { return true; }
 };
 
 template <> struct int_checker<true> {
@@ -87,20 +73,20 @@ template <> struct int_checker<true> {
     return value >= (std::numeric_limits<int>::min)() &&
            value <= max_value<int>();
   }
-  inline static auto fits_in_int(int) -> bool { return true; }
+  static auto fits_in_int(int) -> bool { return true; }
 };
 
 struct printf_precision_handler {
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
   auto operator()(T value) -> int {
     if (!int_checker<std::numeric_limits<T>::is_signed>::fits_in_int(value))
-      report_error("number is too big");
+      throw_format_error("number is too big");
     return (std::max)(static_cast<int>(value), 0);
   }
 
   template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
   auto operator()(T) -> int {
-    report_error("precision is not integer");
+    throw_format_error("precision is not integer");
     return 0;
   }
 };
@@ -145,19 +131,25 @@ template <typename T, typename Context> class arg_converter {
     using target_type = conditional_t<std::is_same<T, void>::value, U, T>;
     if (const_check(sizeof(target_type) <= sizeof(int))) {
       // Extra casts are used to silence warnings.
-      using unsigned_type = typename make_unsigned_or_bool<target_type>::type;
-      if (is_signed)
-        arg_ = static_cast<int>(static_cast<target_type>(value));
-      else
-        arg_ = static_cast<unsigned>(static_cast<unsigned_type>(value));
+      if (is_signed) {
+        auto n = static_cast<int>(static_cast<target_type>(value));
+        arg_ = detail::make_arg<Context>(n);
+      } else {
+        using unsigned_type = typename make_unsigned_or_bool<target_type>::type;
+        auto n = static_cast<unsigned>(static_cast<unsigned_type>(value));
+        arg_ = detail::make_arg<Context>(n);
+      }
     } else {
-      // glibc's printf doesn't sign extend arguments of smaller types:
-      //   std::printf("%lld", -42);  // prints "4294967254"
-      // but we don't have to do the same because it's a UB.
-      if (is_signed)
-        arg_ = static_cast<long long>(value);
-      else
-        arg_ = static_cast<typename make_unsigned_or_bool<U>::type>(value);
+      if (is_signed) {
+        // glibc's printf doesn't sign extend arguments of smaller types:
+        //   std::printf("%lld", -42);  // prints "4294967254"
+        // but we don't have to do the same because it's a UB.
+        auto n = static_cast<long long>(value);
+        arg_ = detail::make_arg<Context>(n);
+      } else {
+        auto n = static_cast<typename make_unsigned_or_bool<U>::type>(value);
+        arg_ = detail::make_arg<Context>(n);
+      }
     }
   }
 
@@ -184,7 +176,8 @@ template <typename Context> class char_converter {
 
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
   void operator()(T value) {
-    arg_ = static_cast<typename Context::char_type>(value);
+    auto c = static_cast<typename Context::char_type>(value);
+    arg_ = detail::make_arg<Context>(c);
   }
 
   template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
@@ -200,28 +193,28 @@ template <typename Char> struct get_cstring {
 
 // Checks if an argument is a valid printf width specifier and sets
 // left alignment if it is negative.
-class printf_width_handler {
+template <typename Char> class printf_width_handler {
  private:
-  format_specs& specs_;
+  format_specs<Char>& specs_;
 
  public:
-  inline explicit printf_width_handler(format_specs& specs) : specs_(specs) {}
+  explicit printf_width_handler(format_specs<Char>& specs) : specs_(specs) {}
 
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
   auto operator()(T value) -> unsigned {
     auto width = static_cast<uint32_or_64_or_128_t<T>>(value);
     if (detail::is_negative(value)) {
-      specs_.set_align(align::left);
+      specs_.align = align::left;
       width = 0 - width;
     }
-    unsigned int_max = to_unsigned(max_value<int>());
-    if (width > int_max) report_error("number is too big");
+    unsigned int_max = max_value<int>();
+    if (width > int_max) throw_format_error("number is too big");
     return static_cast<unsigned>(width);
   }
 
   template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
   auto operator()(T) -> unsigned {
-    report_error("width is not integer");
+    throw_format_error("width is not integer");
     return 0;
   }
 };
@@ -229,12 +222,12 @@ class printf_width_handler {
 // Workaround for a bug with the XL compiler when initializing
 // printf_arg_formatter's base class.
 template <typename Char>
-auto make_arg_formatter(basic_appender<Char> iter, format_specs& s)
+auto make_arg_formatter(buffer_appender<Char> iter, format_specs<Char>& s)
     -> arg_formatter<Char> {
   return {iter, s, locale_ref()};
 }
 
-// The `printf` argument formatter.
+// The ``printf`` argument formatter.
 template <typename Char>
 class printf_arg_formatter : public arg_formatter<Char> {
  private:
@@ -245,96 +238,105 @@ class printf_arg_formatter : public arg_formatter<Char> {
 
   void write_null_pointer(bool is_string = false) {
     auto s = this->specs;
-    s.set_type(presentation_type::none);
-    write_bytes<Char>(this->out, is_string ? "(null)" : "(nil)", s);
-  }
-
-  template <typename T> void write(T value) {
-    detail::write<Char>(this->out, value, this->specs, this->locale);
+    s.type = presentation_type::none;
+    write_bytes(this->out, is_string ? "(null)" : "(nil)", s);
   }
 
  public:
-  printf_arg_formatter(basic_appender<Char> iter, format_specs& s,
+  printf_arg_formatter(buffer_appender<Char> iter, format_specs<Char>& s,
                        context_type& ctx)
       : base(make_arg_formatter(iter, s)), context_(ctx) {}
 
-  void operator()(monostate value) { write(value); }
+  void operator()(monostate value) { base::operator()(value); }
 
   template <typename T, FMT_ENABLE_IF(detail::is_integral<T>::value)>
   void operator()(T value) {
     // MSVC2013 fails to compile separate overloads for bool and Char so use
     // std::is_same instead.
     if (!std::is_same<T, Char>::value) {
-      write(value);
+      base::operator()(value);
       return;
     }
-    format_specs s = this->specs;
-    if (s.type() != presentation_type::none &&
-        s.type() != presentation_type::chr) {
+    format_specs<Char> fmt_specs = this->specs;
+    if (fmt_specs.type != presentation_type::none &&
+        fmt_specs.type != presentation_type::chr) {
       return (*this)(static_cast<int>(value));
     }
-    s.set_sign(sign::none);
-    s.clear_alt();
-    s.set_fill(' ');  // Ignore '0' flag for char types.
+    fmt_specs.sign = sign::none;
+    fmt_specs.alt = false;
+    fmt_specs.fill[0] = ' ';  // Ignore '0' flag for char types.
     // align::numeric needs to be overwritten here since the '0' flag is
     // ignored for non-numeric types
-    if (s.align() == align::none || s.align() == align::numeric)
-      s.set_align(align::right);
-    detail::write<Char>(this->out, static_cast<Char>(value), s);
+    if (fmt_specs.align == align::none || fmt_specs.align == align::numeric)
+      fmt_specs.align = align::right;
+    write<Char>(this->out, static_cast<Char>(value), fmt_specs);
   }
 
   template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
   void operator()(T value) {
-    write(value);
+    base::operator()(value);
   }
 
+  /** Formats a null-terminated C string. */
   void operator()(const char* value) {
     if (value)
-      write(value);
+      base::operator()(value);
     else
-      write_null_pointer(this->specs.type() != presentation_type::pointer);
+      write_null_pointer(this->specs.type != presentation_type::pointer);
   }
 
+  /** Formats a null-terminated wide C string. */
   void operator()(const wchar_t* value) {
     if (value)
-      write(value);
+      base::operator()(value);
     else
-      write_null_pointer(this->specs.type() != presentation_type::pointer);
+      write_null_pointer(this->specs.type != presentation_type::pointer);
   }
 
-  void operator()(basic_string_view<Char> value) { write(value); }
+  void operator()(basic_string_view<Char> value) { base::operator()(value); }
 
+  /** Formats a pointer. */
   void operator()(const void* value) {
     if (value)
-      write(value);
+      base::operator()(value);
     else
       write_null_pointer();
   }
 
+  /** Formats an argument of a custom (user-defined) type. */
   void operator()(typename basic_format_arg<context_type>::handle handle) {
-    auto parse_ctx = parse_context<Char>({});
+    auto parse_ctx = basic_format_parse_context<Char>({});
     handle.format(parse_ctx, context_);
   }
 };
 
 template <typename Char>
-void parse_flags(format_specs& specs, const Char*& it, const Char* end) {
+void parse_flags(format_specs<Char>& specs, const Char*& it, const Char* end) {
   for (; it != end; ++it) {
     switch (*it) {
-    case '-': specs.set_align(align::left); break;
-    case '+': specs.set_sign(sign::plus); break;
-    case '0': specs.set_fill('0'); break;
-    case ' ':
-      if (specs.sign() != sign::plus) specs.set_sign(sign::space);
+    case '-':
+      specs.align = align::left;
       break;
-    case '#': specs.set_alt(); break;
-    default:  return;
+    case '+':
+      specs.sign = sign::plus;
+      break;
+    case '0':
+      specs.fill[0] = '0';
+      break;
+    case ' ':
+      if (specs.sign != sign::plus) specs.sign = sign::space;
+      break;
+    case '#':
+      specs.alt = true;
+      break;
+    default:
+      return;
     }
   }
 }
 
 template <typename Char, typename GetArg>
-auto parse_header(const Char*& it, const Char* end, format_specs& specs,
+auto parse_header(const Char*& it, const Char* end, format_specs<Char>& specs,
                   GetArg get_arg) -> int {
   int arg_index = -1;
   Char c = *it;
@@ -346,11 +348,11 @@ auto parse_header(const Char*& it, const Char* end, format_specs& specs,
       ++it;
       arg_index = value != -1 ? value : max_value<int>();
     } else {
-      if (c == '0') specs.set_fill('0');
+      if (c == '0') specs.fill[0] = '0';
       if (value != 0) {
         // Nonzero value means that we parsed width and don't need to
         // parse it or flags again, so return now.
-        if (value == -1) report_error("number is too big");
+        if (value == -1) throw_format_error("number is too big");
         specs.width = value;
         return arg_index;
       }
@@ -361,47 +363,63 @@ auto parse_header(const Char*& it, const Char* end, format_specs& specs,
   if (it != end) {
     if (*it >= '0' && *it <= '9') {
       specs.width = parse_nonnegative_int(it, end, -1);
-      if (specs.width == -1) report_error("number is too big");
+      if (specs.width == -1) throw_format_error("number is too big");
     } else if (*it == '*') {
       ++it;
       specs.width = static_cast<int>(
-          get_arg(-1).visit(detail::printf_width_handler(specs)));
+          get_arg(-1).visit(detail::printf_width_handler<Char>(specs)));
     }
   }
   return arg_index;
 }
 
-inline auto parse_printf_presentation_type(char c, type t, bool& upper)
+inline auto parse_printf_presentation_type(char c, type t)
     -> presentation_type {
   using pt = presentation_type;
   constexpr auto integral_set = sint_set | uint_set | bool_set | char_set;
   switch (c) {
-  case 'd': return in(t, integral_set) ? pt::dec : pt::none;
-  case 'o': return in(t, integral_set) ? pt::oct : pt::none;
-  case 'X': upper = true; FMT_FALLTHROUGH;
-  case 'x': return in(t, integral_set) ? pt::hex : pt::none;
-  case 'E': upper = true; FMT_FALLTHROUGH;
-  case 'e': return in(t, float_set) ? pt::exp : pt::none;
-  case 'F': upper = true; FMT_FALLTHROUGH;
-  case 'f': return in(t, float_set) ? pt::fixed : pt::none;
-  case 'G': upper = true; FMT_FALLTHROUGH;
-  case 'g': return in(t, float_set) ? pt::general : pt::none;
-  case 'A': upper = true; FMT_FALLTHROUGH;
-  case 'a': return in(t, float_set) ? pt::hexfloat : pt::none;
-  case 'c': return in(t, integral_set) ? pt::chr : pt::none;
-  case 's': return in(t, string_set | cstring_set) ? pt::string : pt::none;
-  case 'p': return in(t, pointer_set | cstring_set) ? pt::pointer : pt::none;
-  default:  return pt::none;
+  case 'd':
+    return in(t, integral_set) ? pt::dec : pt::none;
+  case 'o':
+    return in(t, integral_set) ? pt::oct : pt::none;
+  case 'x':
+    return in(t, integral_set) ? pt::hex_lower : pt::none;
+  case 'X':
+    return in(t, integral_set) ? pt::hex_upper : pt::none;
+  case 'a':
+    return in(t, float_set) ? pt::hexfloat_lower : pt::none;
+  case 'A':
+    return in(t, float_set) ? pt::hexfloat_upper : pt::none;
+  case 'e':
+    return in(t, float_set) ? pt::exp_lower : pt::none;
+  case 'E':
+    return in(t, float_set) ? pt::exp_upper : pt::none;
+  case 'f':
+    return in(t, float_set) ? pt::fixed_lower : pt::none;
+  case 'F':
+    return in(t, float_set) ? pt::fixed_upper : pt::none;
+  case 'g':
+    return in(t, float_set) ? pt::general_lower : pt::none;
+  case 'G':
+    return in(t, float_set) ? pt::general_upper : pt::none;
+  case 'c':
+    return in(t, integral_set) ? pt::chr : pt::none;
+  case 's':
+    return in(t, string_set | cstring_set) ? pt::string : pt::none;
+  case 'p':
+    return in(t, pointer_set | cstring_set) ? pt::pointer : pt::none;
+  default:
+    return pt::none;
   }
 }
 
 template <typename Char, typename Context>
 void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
              basic_format_args<Context> args) {
-  using iterator = basic_appender<Char>;
+  using iterator = buffer_appender<Char>;
   auto out = iterator(buf);
   auto context = basic_printf_context<Char>(out, args);
-  auto parse_ctx = parse_context<Char>(format);
+  auto parse_ctx = basic_format_parse_context<Char>(format);
 
   // Returns the argument with specified index or, if arg_index is -1, the next
   // argument.
@@ -429,12 +447,12 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
     }
     write(out, basic_string_view<Char>(start, to_unsigned(it - 1 - start)));
 
-    auto specs = format_specs();
-    specs.set_align(align::right);
+    auto specs = format_specs<Char>();
+    specs.align = align::right;
 
     // Parse argument index, flags and width.
     int arg_index = parse_header(it, end, specs, get_arg);
-    if (arg_index == 0) report_error("argument not found");
+    if (arg_index == 0) throw_format_error("argument not found");
 
     // Parse precision.
     if (it != end && *it == '.') {
@@ -454,9 +472,9 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
     auto arg = get_arg(arg_index);
     // For d, i, o, u, x, and X conversion specifiers, if a precision is
     // specified, the '0' flag is ignored
-    if (specs.precision >= 0 && is_integral_type(arg.type())) {
+    if (specs.precision >= 0 && arg.is_integral()) {
       // Ignore '0' for non-numeric types or if '-' present.
-      specs.set_fill(' ');
+      specs.fill[0] = ' ';
     }
     if (specs.precision >= 0 && arg.type() == type::cstring_type) {
       auto str = arg.visit(get_cstring<Char>());
@@ -464,16 +482,15 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
       auto nul = std::find(str, str_end, Char());
       auto sv = basic_string_view<Char>(
           str, to_unsigned(nul != str_end ? nul - str : specs.precision));
-      arg = sv;
+      arg = make_arg<basic_printf_context<Char>>(sv);
     }
-    if (specs.alt() && arg.visit(is_zero_int())) specs.clear_alt();
-    if (specs.fill_unit<Char>() == '0') {
-      if (is_arithmetic_type(arg.type()) && specs.align() != align::left) {
-        specs.set_align(align::numeric);
-      } else {
-        // Ignore '0' flag for non-numeric types or if '-' flag is also present.
-        specs.set_fill(' ');
-      }
+    if (specs.alt && arg.visit(is_zero_int())) specs.alt = false;
+    if (specs.fill[0] == '0') {
+      if (arg.is_arithmetic() && specs.align != align::left)
+        specs.align = align::numeric;
+      else
+        specs.fill[0] = ' ';  // Ignore '0' flag for non-numeric types or if '-'
+                              // flag is also present.
     }
 
     // Parse length and convert the argument to the required type.
@@ -498,34 +515,42 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
         convert_arg<long>(arg, t);
       }
       break;
-    case 'j': convert_arg<intmax_t>(arg, t); break;
-    case 'z': convert_arg<size_t>(arg, t); break;
-    case 't': convert_arg<std::ptrdiff_t>(arg, t); break;
+    case 'j':
+      convert_arg<intmax_t>(arg, t);
+      break;
+    case 'z':
+      convert_arg<size_t>(arg, t);
+      break;
+    case 't':
+      convert_arg<std::ptrdiff_t>(arg, t);
+      break;
     case 'L':
       // printf produces garbage when 'L' is omitted for long double, no
       // need to do the same.
       break;
-    default: --it; convert_arg<void>(arg, c);
+    default:
+      --it;
+      convert_arg<void>(arg, c);
     }
 
     // Parse type.
-    if (it == end) report_error("invalid format string");
+    if (it == end) throw_format_error("invalid format string");
     char type = static_cast<char>(*it++);
-    if (is_integral_type(arg.type())) {
+    if (arg.is_integral()) {
       // Normalize type.
       switch (type) {
       case 'i':
-      case 'u': type = 'd'; break;
+      case 'u':
+        type = 'd';
+        break;
       case 'c':
         arg.visit(char_converter<basic_printf_context<Char>>(arg));
         break;
       }
     }
-    bool upper = false;
-    specs.set_type(parse_printf_presentation_type(type, arg.type(), upper));
-    if (specs.type() == presentation_type::none)
-      report_error("invalid format specifier");
-    if (upper) specs.set_upper();
+    specs.type = parse_printf_presentation_type(type, arg.type());
+    if (specs.type == presentation_type::none)
+      throw_format_error("invalid format specifier");
 
     start = it;
 
@@ -542,44 +567,56 @@ using wprintf_context = basic_printf_context<wchar_t>;
 using printf_args = basic_format_args<printf_context>;
 using wprintf_args = basic_format_args<wprintf_context>;
 
-/// Constructs an `format_arg_store` object that contains references to
-/// arguments and can be implicitly converted to `printf_args`.
-template <typename Char = char, typename... T>
-inline auto make_printf_args(T&... args)
-    -> decltype(fmt::make_format_args<basic_printf_context<Char>>(args...)) {
-  return fmt::make_format_args<basic_printf_context<Char>>(args...);
+/**
+  \rst
+  Constructs an `~fmt::format_arg_store` object that contains references to
+  arguments and can be implicitly converted to `~fmt::printf_args`.
+  \endrst
+ */
+template <typename... T>
+inline auto make_printf_args(const T&... args)
+    -> format_arg_store<printf_context, T...> {
+  return {args...};
 }
 
-template <typename Char> struct vprintf_args {
-  using type = basic_format_args<basic_printf_context<Char>>;
-};
+// DEPRECATED!
+template <typename... T>
+inline auto make_wprintf_args(const T&... args)
+    -> format_arg_store<wprintf_context, T...> {
+  return {args...};
+}
 
 template <typename Char>
-inline auto vsprintf(basic_string_view<Char> fmt,
-                     typename vprintf_args<Char>::type args)
+inline auto vsprintf(
+    basic_string_view<Char> fmt,
+    basic_format_args<basic_printf_context<type_identity_t<Char>>> args)
     -> std::basic_string<Char> {
   auto buf = basic_memory_buffer<Char>();
   detail::vprintf(buf, fmt, args);
-  return {buf.data(), buf.size()};
+  return to_string(buf);
 }
 
 /**
- * Formats `args` according to specifications in `fmt` and returns the result
- * as as string.
- *
- * **Example**:
- *
- *     std::string message = fmt::sprintf("The answer is %d", 42);
- */
-template <typename S, typename... T, typename Char = detail::char_t<S>>
+  \rst
+  Formats arguments and returns the result as a string.
+
+  **Example**::
+
+    std::string message = fmt::sprintf("The answer is %d", 42);
+  \endrst
+*/
+template <typename S, typename... T,
+          typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
 inline auto sprintf(const S& fmt, const T&... args) -> std::basic_string<Char> {
   return vsprintf(detail::to_string_view(fmt),
                   fmt::make_format_args<basic_printf_context<Char>>(args...));
 }
 
 template <typename Char>
-inline auto vfprintf(std::FILE* f, basic_string_view<Char> fmt,
-                     typename vprintf_args<Char>::type args) -> int {
+inline auto vfprintf(
+    std::FILE* f, basic_string_view<Char> fmt,
+    basic_format_args<basic_printf_context<type_identity_t<Char>>> args)
+    -> int {
   auto buf = basic_memory_buffer<Char>();
   detail::vprintf(buf, fmt, args);
   size_t size = buf.size();
@@ -589,33 +626,36 @@ inline auto vfprintf(std::FILE* f, basic_string_view<Char> fmt,
 }
 
 /**
- * Formats `args` according to specifications in `fmt` and writes the output
- * to `f`.
- *
- * **Example**:
- *
- *     fmt::fprintf(stderr, "Don't %s!", "panic");
+  \rst
+  Prints formatted data to the file *f*.
+
+  **Example**::
+
+    fmt::fprintf(stderr, "Don't %s!", "panic");
+  \endrst
  */
-template <typename S, typename... T, typename Char = detail::char_t<S>>
+template <typename S, typename... T, typename Char = char_t<S>>
 inline auto fprintf(std::FILE* f, const S& fmt, const T&... args) -> int {
   return vfprintf(f, detail::to_string_view(fmt),
-                  make_printf_args<Char>(args...));
+                  fmt::make_format_args<basic_printf_context<Char>>(args...));
 }
 
 template <typename Char>
-FMT_DEPRECATED inline auto vprintf(basic_string_view<Char> fmt,
-                                   typename vprintf_args<Char>::type args)
+FMT_DEPRECATED inline auto vprintf(
+    basic_string_view<Char> fmt,
+    basic_format_args<basic_printf_context<type_identity_t<Char>>> args)
     -> int {
   return vfprintf(stdout, fmt, args);
 }
 
 /**
- * Formats `args` according to specifications in `fmt` and writes the output
- * to `stdout`.
- *
- * **Example**:
- *
- *   fmt::printf("Elapsed time: %.2f seconds", 1.23);
+  \rst
+  Prints formatted data to ``stdout``.
+
+  **Example**::
+
+    fmt::printf("Elapsed time: %.2f seconds", 1.23);
+  \endrst
  */
 template <typename... T>
 inline auto printf(string_view fmt, const T&... args) -> int {
@@ -624,7 +664,7 @@ inline auto printf(string_view fmt, const T&... args) -> int {
 template <typename... T>
 FMT_DEPRECATED inline auto printf(basic_string_view<wchar_t> fmt,
                                   const T&... args) -> int {
-  return vfprintf(stdout, fmt, make_printf_args<wchar_t>(args...));
+  return vfprintf(stdout, fmt, make_wprintf_args(args...));
 }
 
 FMT_END_EXPORT
diff --git a/src/fmt/ranges.h b/src/fmt/ranges.h
index 118d24fe81..a9cd60e594 100644
--- a/src/fmt/ranges.h
+++ b/src/fmt/ranges.h
@@ -8,31 +8,67 @@
 #ifndef FMT_RANGES_H_
 #define FMT_RANGES_H_
 
-#ifndef FMT_MODULE
-#  include <initializer_list>
-#  include <iterator>
-#  include <string>
-#  include <tuple>
-#  include <type_traits>
-#  include <utility>
-#endif
+#include <initializer_list>
+#include <tuple>
+#include <type_traits>
 
 #include "format.h"
 
 FMT_BEGIN_NAMESPACE
 
-FMT_EXPORT
-enum class range_format { disabled, map, set, sequence, string, debug_string };
-
 namespace detail {
 
+template <typename Range, typename OutputIt>
+auto copy(const Range& range, OutputIt out) -> OutputIt {
+  for (auto it = range.begin(), end = range.end(); it != end; ++it)
+    *out++ = *it;
+  return out;
+}
+
+template <typename OutputIt>
+auto copy(const char* str, OutputIt out) -> OutputIt {
+  while (*str) *out++ = *str++;
+  return out;
+}
+
+template <typename OutputIt> auto copy(char ch, OutputIt out) -> OutputIt {
+  *out++ = ch;
+  return out;
+}
+
+template <typename OutputIt> auto copy(wchar_t ch, OutputIt out) -> OutputIt {
+  *out++ = ch;
+  return out;
+}
+
+// Returns true if T has a std::string-like interface, like std::string_view.
+template <typename T> class is_std_string_like {
+  template <typename U>
+  static auto check(U* p)
+      -> decltype((void)p->find('a'), p->length(), (void)p->data(), int());
+  template <typename> static void check(...);
+
+ public:
+  static constexpr const bool value =
+      is_string<T>::value ||
+      std::is_convertible<T, std_string_view<char>>::value ||
+      !std::is_void<decltype(check<T>(nullptr))>::value;
+};
+
+template <typename Char>
+struct is_std_string_like<fmt::basic_string_view<Char>> : std::true_type {};
+
 template <typename T> class is_map {
   template <typename U> static auto check(U*) -> typename U::mapped_type;
   template <typename> static void check(...);
 
  public:
+#ifdef FMT_FORMAT_MAP_AS_LIST  // DEPRECATED!
+  static constexpr const bool value = false;
+#else
   static constexpr const bool value =
       !std::is_void<decltype(check<T>(nullptr))>::value;
+#endif
 };
 
 template <typename T> class is_set {
@@ -40,10 +76,26 @@ template <typename T> class is_set {
   template <typename> static void check(...);
 
  public:
+#ifdef FMT_FORMAT_SET_AS_LIST  // DEPRECATED!
+  static constexpr const bool value = false;
+#else
   static constexpr const bool value =
       !std::is_void<decltype(check<T>(nullptr))>::value && !is_map<T>::value;
+#endif
 };
 
+template <typename... Ts> struct conditional_helper {};
+
+template <typename T, typename _ = void> struct is_range_ : std::false_type {};
+
+#if !FMT_MSC_VERSION || FMT_MSC_VERSION > 1800
+
+#  define FMT_DECLTYPE_RETURN(val)  \
+    ->decltype(val) { return val; } \
+    static_assert(                  \
+        true, "")  // This makes it so that a semicolon is required after the
+                   // macro, which helps clang-format handle the formatting.
+
 // C array overload
 template <typename T, std::size_t N>
 auto range_begin(const T (&arr)[N]) -> const T* {
@@ -58,21 +110,17 @@ template <typename T, typename Enable = void>
 struct has_member_fn_begin_end_t : std::false_type {};
 
 template <typename T>
-struct has_member_fn_begin_end_t<T, void_t<decltype(*std::declval<T>().begin()),
+struct has_member_fn_begin_end_t<T, void_t<decltype(std::declval<T>().begin()),
                                            decltype(std::declval<T>().end())>>
     : std::true_type {};
 
-// Member function overloads.
+// Member function overload
 template <typename T>
-auto range_begin(T&& rng) -> decltype(static_cast<T&&>(rng).begin()) {
-  return static_cast<T&&>(rng).begin();
-}
+auto range_begin(T&& rng) FMT_DECLTYPE_RETURN(static_cast<T&&>(rng).begin());
 template <typename T>
-auto range_end(T&& rng) -> decltype(static_cast<T&&>(rng).end()) {
-  return static_cast<T&&>(rng).end();
-}
+auto range_end(T&& rng) FMT_DECLTYPE_RETURN(static_cast<T&&>(rng).end());
 
-// ADL overloads. Only participate in overload resolution if member functions
+// ADL overload. Only participates in overload resolution if member functions
 // are not found.
 template <typename T>
 auto range_begin(T&& rng)
@@ -93,30 +141,31 @@ struct has_mutable_begin_end : std::false_type {};
 
 template <typename T>
 struct has_const_begin_end<
-    T, void_t<decltype(*detail::range_begin(
-                  std::declval<const remove_cvref_t<T>&>())),
-              decltype(detail::range_end(
-                  std::declval<const remove_cvref_t<T>&>()))>>
+    T,
+    void_t<
+        decltype(detail::range_begin(std::declval<const remove_cvref_t<T>&>())),
+        decltype(detail::range_end(std::declval<const remove_cvref_t<T>&>()))>>
     : std::true_type {};
 
 template <typename T>
 struct has_mutable_begin_end<
-    T, void_t<decltype(*detail::range_begin(std::declval<T&>())),
-              decltype(detail::range_end(std::declval<T&>())),
+    T, void_t<decltype(detail::range_begin(std::declval<T>())),
+              decltype(detail::range_end(std::declval<T>())),
               // the extra int here is because older versions of MSVC don't
               // SFINAE properly unless there are distinct types
               int>> : std::true_type {};
 
-template <typename T, typename _ = void> struct is_range_ : std::false_type {};
 template <typename T>
 struct is_range_<T, void>
     : std::integral_constant<bool, (has_const_begin_end<T>::value ||
                                     has_mutable_begin_end<T>::value)> {};
+#  undef FMT_DECLTYPE_RETURN
+#endif
 
 // tuple_size and tuple_element check.
 template <typename T> class is_tuple_like_ {
-  template <typename U, typename V = typename std::remove_cv<U>::type>
-  static auto check(U* p) -> decltype(std::tuple_size<V>::value, 0);
+  template <typename U>
+  static auto check(U* p) -> decltype(std::tuple_size<U>::value, int());
   template <typename> static void check(...);
 
  public:
@@ -157,13 +206,12 @@ class is_tuple_formattable_ {
   static constexpr const bool value = false;
 };
 template <typename T, typename C> class is_tuple_formattable_<T, C, true> {
-  template <size_t... Is>
-  static auto all_true(index_sequence<Is...>,
-                       integer_sequence<bool, (Is >= 0)...>) -> std::true_type;
-  static auto all_true(...) -> std::false_type;
-
-  template <size_t... Is>
-  static auto check(index_sequence<Is...>) -> decltype(all_true(
+  template <std::size_t... Is>
+  static auto check2(index_sequence<Is...>,
+                     integer_sequence<bool, (Is == Is)...>) -> std::true_type;
+  static auto check2(...) -> std::false_type;
+  template <std::size_t... Is>
+  static auto check(index_sequence<Is...>) -> decltype(check2(
       index_sequence<Is...>{},
       integer_sequence<bool,
                        (is_formattable<typename std::tuple_element<Is, T>::type,
@@ -244,32 +292,21 @@ FMT_CONSTEXPR auto maybe_set_debug_format(Formatter& f, bool set)
 template <typename Formatter>
 FMT_CONSTEXPR void maybe_set_debug_format(Formatter&, ...) {}
 
-template <typename T>
-struct range_format_kind_
-    : std::integral_constant<range_format,
-                             std::is_same<uncvref_type<T>, T>::value
-                                 ? range_format::disabled
-                             : is_map<T>::value ? range_format::map
-                             : is_set<T>::value ? range_format::set
-                                                : range_format::sequence> {};
-
-template <range_format K>
-using range_format_constant = std::integral_constant<range_format, K>;
-
 // These are not generic lambdas for compatibility with C++11.
-template <typename Char> struct parse_empty_specs {
+template <typename ParseContext> struct parse_empty_specs {
   template <typename Formatter> FMT_CONSTEXPR void operator()(Formatter& f) {
     f.parse(ctx);
     detail::maybe_set_debug_format(f, true);
   }
-  parse_context<Char>& ctx;
+  ParseContext& ctx;
 };
 template <typename FormatContext> struct format_tuple_element {
   using char_type = typename FormatContext::char_type;
 
   template <typename T>
   void operator()(const formatter<T, char_type>& f, const T& v) {
-    if (i > 0) ctx.advance_to(detail::copy<char_type>(separator, ctx.out()));
+    if (i > 0)
+      ctx.advance_to(detail::copy_str<char_type>(separator, ctx.out()));
     ctx.advance_to(f.format(v, ctx));
     ++i;
   }
@@ -318,48 +355,66 @@ struct formatter<Tuple, Char,
     closing_bracket_ = close;
   }
 
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     auto it = ctx.begin();
-    auto end = ctx.end();
-    if (it != end && detail::to_ascii(*it) == 'n') {
-      ++it;
-      set_brackets({}, {});
-      set_separator({});
-    }
-    if (it != end && *it != '}') report_error("invalid format specifier");
-    ctx.advance_to(it);
-    detail::for_each(formatters_, detail::parse_empty_specs<Char>{ctx});
+    if (it != ctx.end() && *it != '}')
+      FMT_THROW(format_error("invalid format specifier"));
+    detail::for_each(formatters_, detail::parse_empty_specs<ParseContext>{ctx});
     return it;
   }
 
   template <typename FormatContext>
   auto format(const Tuple& value, FormatContext& ctx) const
       -> decltype(ctx.out()) {
-    ctx.advance_to(detail::copy<Char>(opening_bracket_, ctx.out()));
+    ctx.advance_to(detail::copy_str<Char>(opening_bracket_, ctx.out()));
     detail::for_each2(
         formatters_, value,
         detail::format_tuple_element<FormatContext>{0, ctx, separator_});
-    return detail::copy<Char>(closing_bracket_, ctx.out());
+    return detail::copy_str<Char>(closing_bracket_, ctx.out());
   }
 };
 
 template <typename T, typename Char> struct is_range {
   static constexpr const bool value =
-      detail::is_range_<T>::value && !detail::has_to_string_view<T>::value;
+      detail::is_range_<T>::value && !detail::is_std_string_like<T>::value &&
+      !std::is_convertible<T, std::basic_string<Char>>::value &&
+      !std::is_convertible<T, detail::std_string_view<Char>>::value;
 };
 
 namespace detail {
+template <typename Context> struct range_mapper {
+  using mapper = arg_mapper<Context>;
+
+  template <typename T,
+            FMT_ENABLE_IF(has_formatter<remove_cvref_t<T>, Context>::value)>
+  static auto map(T&& value) -> T&& {
+    return static_cast<T&&>(value);
+  }
+  template <typename T,
+            FMT_ENABLE_IF(!has_formatter<remove_cvref_t<T>, Context>::value)>
+  static auto map(T&& value)
+      -> decltype(mapper().map(static_cast<T&&>(value))) {
+    return mapper().map(static_cast<T&&>(value));
+  }
+};
 
 template <typename Char, typename Element>
-using range_formatter_type = formatter<remove_cvref_t<Element>, Char>;
+using range_formatter_type =
+    formatter<remove_cvref_t<decltype(range_mapper<buffer_context<Char>>{}.map(
+                  std::declval<Element>()))>,
+              Char>;
 
 template <typename R>
 using maybe_const_range =
     conditional_t<has_const_begin_end<R>::value, const R, R>;
 
+// Workaround a bug in MSVC 2015 and earlier.
+#if !FMT_MSC_VERSION || FMT_MSC_VERSION >= 1910
 template <typename R, typename Char>
 struct is_formattable_delayed
     : is_formattable<uncvref_type<maybe_const_range<R>>, Char> {};
+#endif
 }  // namespace detail
 
 template <typename...> struct conjunction : std::true_type {};
@@ -383,24 +438,6 @@ struct range_formatter<
       detail::string_literal<Char, '['>{};
   basic_string_view<Char> closing_bracket_ =
       detail::string_literal<Char, ']'>{};
-  bool is_debug = false;
-
-  template <typename Output, typename It, typename Sentinel, typename U = T,
-            FMT_ENABLE_IF(std::is_same<U, Char>::value)>
-  auto write_debug_string(Output& out, It it, Sentinel end) const -> Output {
-    auto buf = basic_memory_buffer<Char>();
-    for (; it != end; ++it) buf.push_back(*it);
-    auto specs = format_specs();
-    specs.set_type(presentation_type::debug);
-    return detail::write<Char>(
-        out, basic_string_view<Char>(buf.data(), buf.size()), specs);
-  }
-
-  template <typename Output, typename It, typename Sentinel, typename U = T,
-            FMT_ENABLE_IF(!std::is_same<U, Char>::value)>
-  auto write_debug_string(Output& out, It, Sentinel) const -> Output {
-    return out;
-  }
 
  public:
   FMT_CONSTEXPR range_formatter() {}
@@ -419,40 +456,21 @@ struct range_formatter<
     closing_bracket_ = close;
   }
 
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     auto it = ctx.begin();
     auto end = ctx.end();
-    detail::maybe_set_debug_format(underlying_, true);
-    if (it == end) return underlying_.parse(ctx);
 
-    switch (detail::to_ascii(*it)) {
-    case 'n':
+    if (it != end && *it == 'n') {
       set_brackets({}, {});
       ++it;
-      break;
-    case '?':
-      is_debug = true;
-      set_brackets({}, {});
-      ++it;
-      if (it == end || *it != 's') report_error("invalid format specifier");
-      FMT_FALLTHROUGH;
-    case 's':
-      if (!std::is_same<T, Char>::value)
-        report_error("invalid format specifier");
-      if (!is_debug) {
-        set_brackets(detail::string_literal<Char, '"'>{},
-                     detail::string_literal<Char, '"'>{});
-        set_separator({});
-        detail::maybe_set_debug_format(underlying_, false);
-      }
-      ++it;
-      return it;
     }
 
     if (it != end && *it != '}') {
-      if (*it != ':') report_error("invalid format specifier");
-      detail::maybe_set_debug_format(underlying_, false);
+      if (*it != ':') FMT_THROW(format_error("invalid format specifier"));
       ++it;
+    } else {
+      detail::maybe_set_debug_format(underlying_, true);
     }
 
     ctx.advance_to(it);
@@ -461,26 +479,80 @@ struct range_formatter<
 
   template <typename R, typename FormatContext>
   auto format(R&& range, FormatContext& ctx) const -> decltype(ctx.out()) {
+    detail::range_mapper<buffer_context<Char>> mapper;
     auto out = ctx.out();
+    out = detail::copy_str<Char>(opening_bracket_, out);
+    int i = 0;
     auto it = detail::range_begin(range);
     auto end = detail::range_end(range);
-    if (is_debug) return write_debug_string(out, std::move(it), end);
-
-    out = detail::copy<Char>(opening_bracket_, out);
-    int i = 0;
     for (; it != end; ++it) {
-      if (i > 0) out = detail::copy<Char>(separator_, out);
+      if (i > 0) out = detail::copy_str<Char>(separator_, out);
       ctx.advance_to(out);
-      auto&& item = *it;  // Need an lvalue
-      out = underlying_.format(item, ctx);
+      auto&& item = *it;
+      out = underlying_.format(mapper.map(item), ctx);
       ++i;
     }
-    out = detail::copy<Char>(closing_bracket_, out);
+    out = detail::copy_str<Char>(closing_bracket_, out);
     return out;
   }
 };
 
-FMT_EXPORT
+enum class range_format { disabled, map, set, sequence, string, debug_string };
+
+namespace detail {
+template <typename T>
+struct range_format_kind_
+    : std::integral_constant<range_format,
+                             std::is_same<uncvref_type<T>, T>::value
+                                 ? range_format::disabled
+                             : is_map<T>::value ? range_format::map
+                             : is_set<T>::value ? range_format::set
+                                                : range_format::sequence> {};
+
+template <range_format K, typename R, typename Char, typename Enable = void>
+struct range_default_formatter;
+
+template <range_format K>
+using range_format_constant = std::integral_constant<range_format, K>;
+
+template <range_format K, typename R, typename Char>
+struct range_default_formatter<
+    K, R, Char,
+    enable_if_t<(K == range_format::sequence || K == range_format::map ||
+                 K == range_format::set)>> {
+  using range_type = detail::maybe_const_range<R>;
+  range_formatter<detail::uncvref_type<range_type>, Char> underlying_;
+
+  FMT_CONSTEXPR range_default_formatter() { init(range_format_constant<K>()); }
+
+  FMT_CONSTEXPR void init(range_format_constant<range_format::set>) {
+    underlying_.set_brackets(detail::string_literal<Char, '{'>{},
+                             detail::string_literal<Char, '}'>{});
+  }
+
+  FMT_CONSTEXPR void init(range_format_constant<range_format::map>) {
+    underlying_.set_brackets(detail::string_literal<Char, '{'>{},
+                             detail::string_literal<Char, '}'>{});
+    underlying_.underlying().set_brackets({}, {});
+    underlying_.underlying().set_separator(
+        detail::string_literal<Char, ':', ' '>{});
+  }
+
+  FMT_CONSTEXPR void init(range_format_constant<range_format::sequence>) {}
+
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return underlying_.parse(ctx);
+  }
+
+  template <typename FormatContext>
+  auto format(range_type& range, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return underlying_.format(range, ctx);
+  }
+};
+}  // namespace detail
+
 template <typename T, typename Char, typename Enable = void>
 struct range_format_kind
     : conditional_t<
@@ -490,131 +562,16 @@ struct range_format_kind
 template <typename R, typename Char>
 struct formatter<
     R, Char,
-    enable_if_t<conjunction<
-        bool_constant<
-            range_format_kind<R, Char>::value != range_format::disabled &&
-            range_format_kind<R, Char>::value != range_format::map &&
-            range_format_kind<R, Char>::value != range_format::string &&
-            range_format_kind<R, Char>::value != range_format::debug_string>,
-        detail::is_formattable_delayed<R, Char>>::value>> {
- private:
-  using range_type = detail::maybe_const_range<R>;
-  range_formatter<detail::uncvref_type<range_type>, Char> range_formatter_;
-
- public:
-  using nonlocking = void;
-
-  FMT_CONSTEXPR formatter() {
-    if (detail::const_check(range_format_kind<R, Char>::value !=
-                            range_format::set))
-      return;
-    range_formatter_.set_brackets(detail::string_literal<Char, '{'>{},
-                                  detail::string_literal<Char, '}'>{});
-  }
-
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    return range_formatter_.parse(ctx);
-  }
-
-  template <typename FormatContext>
-  auto format(range_type& range, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    return range_formatter_.format(range, ctx);
-  }
-};
-
-// A map formatter.
-template <typename R, typename Char>
-struct formatter<
-    R, Char,
-    enable_if_t<range_format_kind<R, Char>::value == range_format::map>> {
- private:
-  using map_type = detail::maybe_const_range<R>;
-  using element_type = detail::uncvref_type<map_type>;
-
-  decltype(detail::tuple::get_formatters<element_type, Char>(
-      detail::tuple_index_sequence<element_type>())) formatters_;
-  bool no_delimiters_ = false;
-
- public:
-  FMT_CONSTEXPR formatter() {}
-
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    auto it = ctx.begin();
-    auto end = ctx.end();
-    if (it != end) {
-      if (detail::to_ascii(*it) == 'n') {
-        no_delimiters_ = true;
-        ++it;
-      }
-      if (it != end && *it != '}') {
-        if (*it != ':') report_error("invalid format specifier");
-        ++it;
-      }
-      ctx.advance_to(it);
-    }
-    detail::for_each(formatters_, detail::parse_empty_specs<Char>{ctx});
-    return it;
-  }
-
-  template <typename FormatContext>
-  auto format(map_type& map, FormatContext& ctx) const -> decltype(ctx.out()) {
-    auto out = ctx.out();
-    basic_string_view<Char> open = detail::string_literal<Char, '{'>{};
-    if (!no_delimiters_) out = detail::copy<Char>(open, out);
-    int i = 0;
-    basic_string_view<Char> sep = detail::string_literal<Char, ',', ' '>{};
-    for (auto&& value : map) {
-      if (i > 0) out = detail::copy<Char>(sep, out);
-      ctx.advance_to(out);
-      detail::for_each2(formatters_, value,
-                        detail::format_tuple_element<FormatContext>{
-                            0, ctx, detail::string_literal<Char, ':', ' '>{}});
-      ++i;
-    }
-    basic_string_view<Char> close = detail::string_literal<Char, '}'>{};
-    if (!no_delimiters_) out = detail::copy<Char>(close, out);
-    return out;
-  }
-};
-
-// A (debug_)string formatter.
-template <typename R, typename Char>
-struct formatter<
-    R, Char,
-    enable_if_t<range_format_kind<R, Char>::value == range_format::string ||
-                range_format_kind<R, Char>::value ==
-                    range_format::debug_string>> {
- private:
-  using range_type = detail::maybe_const_range<R>;
-  using string_type =
-      conditional_t<std::is_constructible<
-                        detail::std_string_view<Char>,
-                        decltype(detail::range_begin(std::declval<R>())),
-                        decltype(detail::range_end(std::declval<R>()))>::value,
-                    detail::std_string_view<Char>, std::basic_string<Char>>;
-
-  formatter<string_type, Char> underlying_;
-
- public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    return underlying_.parse(ctx);
-  }
-
-  template <typename FormatContext>
-  auto format(range_type& range, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    auto out = ctx.out();
-    if (detail::const_check(range_format_kind<R, Char>::value ==
-                            range_format::debug_string))
-      *out++ = '"';
-    out = underlying_.format(
-        string_type{detail::range_begin(range), detail::range_end(range)}, ctx);
-    if (detail::const_check(range_format_kind<R, Char>::value ==
-                            range_format::debug_string))
-      *out++ = '"';
-    return out;
-  }
+    enable_if_t<conjunction<bool_constant<range_format_kind<R, Char>::value !=
+                                          range_format::disabled>
+// Workaround a bug in MSVC 2015 and earlier.
+#if !FMT_MSC_VERSION || FMT_MSC_VERSION >= 1910
+                            ,
+                            detail::is_formattable_delayed<R, Char>
+#endif
+                            >::value>>
+    : detail::range_default_formatter<range_format_kind<R, Char>::value, R,
+                                      Char> {
 };
 
 template <typename It, typename Sentinel, typename Char = char>
@@ -624,7 +581,7 @@ struct join_view : detail::view {
   basic_string_view<Char> sep;
 
   join_view(It b, Sentinel e, basic_string_view<Char> s)
-      : begin(std::move(b)), end(e), sep(s) {}
+      : begin(b), end(e), sep(s) {}
 };
 
 template <typename It, typename Sentinel, typename Char>
@@ -638,41 +595,67 @@ struct formatter<join_view<It, Sentinel, Char>, Char> {
 #endif
   formatter<remove_cvref_t<value_type>, Char> value_formatter_;
 
-  using view = conditional_t<std::is_copy_constructible<It>::value,
-                             const join_view<It, Sentinel, Char>,
-                             join_view<It, Sentinel, Char>>;
-
  public:
-  using nonlocking = void;
-
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const Char* {
     return value_formatter_.parse(ctx);
   }
 
   template <typename FormatContext>
-  auto format(view& value, FormatContext& ctx) const -> decltype(ctx.out()) {
-    using iter =
-        conditional_t<std::is_copy_constructible<view>::value, It, It&>;
-    iter it = value.begin;
+  auto format(const join_view<It, Sentinel, Char>& value,
+              FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto it = value.begin;
     auto out = ctx.out();
-    if (it == value.end) return out;
-    out = value_formatter_.format(*it, ctx);
-    ++it;
-    while (it != value.end) {
-      out = detail::copy<Char>(value.sep.begin(), value.sep.end(), out);
-      ctx.advance_to(out);
+    if (it != value.end) {
       out = value_formatter_.format(*it, ctx);
       ++it;
+      while (it != value.end) {
+        out = detail::copy_str<Char>(value.sep.begin(), value.sep.end(), out);
+        ctx.advance_to(out);
+        out = value_formatter_.format(*it, ctx);
+        ++it;
+      }
     }
     return out;
   }
 };
 
-template <typename Char, typename Tuple> struct tuple_join_view : detail::view {
-  const Tuple& tuple;
+/**
+  Returns a view that formats the iterator range `[begin, end)` with elements
+  separated by `sep`.
+ */
+template <typename It, typename Sentinel>
+auto join(It begin, Sentinel end, string_view sep) -> join_view<It, Sentinel> {
+  return {begin, end, sep};
+}
+
+/**
+  \rst
+  Returns a view that formats `range` with elements separated by `sep`.
+
+  **Example**::
+
+    std::vector<int> v = {1, 2, 3};
+    fmt::print("{}", fmt::join(v, ", "));
+    // Output: "1, 2, 3"
+
+  ``fmt::join`` applies passed format specifiers to the range elements::
+
+    fmt::print("{:02}", fmt::join(v, ", "));
+    // Output: "01, 02, 03"
+  \endrst
+ */
+template <typename Range>
+auto join(Range&& range, string_view sep)
+    -> join_view<detail::iterator_t<Range>, detail::sentinel_t<Range>> {
+  return join(std::begin(range), std::end(range), sep);
+}
+
+template <typename Char, typename... T> struct tuple_join_view : detail::view {
+  const std::tuple<T...>& tuple;
   basic_string_view<Char> sep;
 
-  tuple_join_view(const Tuple& t, basic_string_view<Char> s)
+  tuple_join_view(const std::tuple<T...>& t, basic_string_view<Char> s)
       : tuple(t), sep{s} {}
 };
 
@@ -683,64 +666,65 @@ template <typename Char, typename Tuple> struct tuple_join_view : detail::view {
 #  define FMT_TUPLE_JOIN_SPECIFIERS 0
 #endif
 
-template <typename Char, typename Tuple>
-struct formatter<tuple_join_view<Char, Tuple>, Char,
-                 enable_if_t<is_tuple_like<Tuple>::value>> {
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    return do_parse(ctx, std::tuple_size<Tuple>());
+template <typename Char, typename... T>
+struct formatter<tuple_join_view<Char, T...>, Char> {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return do_parse(ctx, std::integral_constant<size_t, sizeof...(T)>());
   }
 
   template <typename FormatContext>
-  auto format(const tuple_join_view<Char, Tuple>& value,
+  auto format(const tuple_join_view<Char, T...>& value,
               FormatContext& ctx) const -> typename FormatContext::iterator {
-    return do_format(value, ctx, std::tuple_size<Tuple>());
+    return do_format(value, ctx,
+                     std::integral_constant<size_t, sizeof...(T)>());
   }
 
  private:
-  decltype(detail::tuple::get_formatters<Tuple, Char>(
-      detail::tuple_index_sequence<Tuple>())) formatters_;
+  std::tuple<formatter<typename std::decay<T>::type, Char>...> formatters_;
 
-  FMT_CONSTEXPR auto do_parse(parse_context<Char>& ctx,
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto do_parse(ParseContext& ctx,
                               std::integral_constant<size_t, 0>)
-      -> const Char* {
+      -> decltype(ctx.begin()) {
     return ctx.begin();
   }
 
-  template <size_t N>
-  FMT_CONSTEXPR auto do_parse(parse_context<Char>& ctx,
+  template <typename ParseContext, size_t N>
+  FMT_CONSTEXPR auto do_parse(ParseContext& ctx,
                               std::integral_constant<size_t, N>)
-      -> const Char* {
+      -> decltype(ctx.begin()) {
     auto end = ctx.begin();
 #if FMT_TUPLE_JOIN_SPECIFIERS
-    end = std::get<std::tuple_size<Tuple>::value - N>(formatters_).parse(ctx);
+    end = std::get<sizeof...(T) - N>(formatters_).parse(ctx);
     if (N > 1) {
       auto end1 = do_parse(ctx, std::integral_constant<size_t, N - 1>());
       if (end != end1)
-        report_error("incompatible format specs for tuple elements");
+        FMT_THROW(format_error("incompatible format specs for tuple elements"));
     }
 #endif
     return end;
   }
 
   template <typename FormatContext>
-  auto do_format(const tuple_join_view<Char, Tuple>&, FormatContext& ctx,
+  auto do_format(const tuple_join_view<Char, T...>&, FormatContext& ctx,
                  std::integral_constant<size_t, 0>) const ->
       typename FormatContext::iterator {
     return ctx.out();
   }
 
   template <typename FormatContext, size_t N>
-  auto do_format(const tuple_join_view<Char, Tuple>& value, FormatContext& ctx,
+  auto do_format(const tuple_join_view<Char, T...>& value, FormatContext& ctx,
                  std::integral_constant<size_t, N>) const ->
       typename FormatContext::iterator {
-    using std::get;
-    auto out =
-        std::get<std::tuple_size<Tuple>::value - N>(formatters_)
-            .format(get<std::tuple_size<Tuple>::value - N>(value.tuple), ctx);
-    if (N <= 1) return out;
-    out = detail::copy<Char>(value.sep, out);
-    ctx.advance_to(out);
-    return do_format(value, ctx, std::integral_constant<size_t, N - 1>());
+    auto out = std::get<sizeof...(T) - N>(formatters_)
+                   .format(std::get<sizeof...(T) - N>(value.tuple), ctx);
+    if (N > 1) {
+      out = std::copy(value.sep.begin(), value.sep.end(), out);
+      ctx.advance_to(out);
+      return do_format(value, ctx, std::integral_constant<size_t, N - 1>());
+    }
+    return out;
   }
 };
 
@@ -784,57 +768,33 @@ struct formatter<
 
 FMT_BEGIN_EXPORT
 
-/// Returns a view that formats the iterator range `[begin, end)` with elements
-/// separated by `sep`.
-template <typename It, typename Sentinel>
-auto join(It begin, Sentinel end, string_view sep) -> join_view<It, Sentinel> {
-  return {std::move(begin), end, sep};
-}
-
 /**
- * Returns a view that formats `range` with elements separated by `sep`.
- *
- * **Example**:
- *
- *     auto v = std::vector<int>{1, 2, 3};
- *     fmt::print("{}", fmt::join(v, ", "));
- *     // Output: 1, 2, 3
- *
- * `fmt::join` applies passed format specifiers to the range elements:
- *
- *     fmt::print("{:02}", fmt::join(v, ", "));
- *     // Output: 01, 02, 03
- */
-template <typename Range, FMT_ENABLE_IF(!is_tuple_like<Range>::value)>
-auto join(Range&& r, string_view sep)
-    -> join_view<decltype(detail::range_begin(r)),
-                 decltype(detail::range_end(r))> {
-  return {detail::range_begin(r), detail::range_end(r), sep};
-}
+  \rst
+  Returns an object that formats `tuple` with elements separated by `sep`.
 
-/**
- * Returns an object that formats `std::tuple` with elements separated by `sep`.
- *
- * **Example**:
- *
- *     auto t = std::tuple<int, char>{1, 'a'};
- *     fmt::print("{}", fmt::join(t, ", "));
- *     // Output: 1, a
+  **Example**::
+
+    std::tuple<int, char> t = {1, 'a'};
+    fmt::print("{}", fmt::join(t, ", "));
+    // Output: "1, a"
+  \endrst
  */
-template <typename Tuple, FMT_ENABLE_IF(is_tuple_like<Tuple>::value)>
-FMT_CONSTEXPR auto join(const Tuple& tuple, string_view sep)
-    -> tuple_join_view<char, Tuple> {
+template <typename... T>
+FMT_CONSTEXPR auto join(const std::tuple<T...>& tuple, string_view sep)
+    -> tuple_join_view<char, T...> {
   return {tuple, sep};
 }
 
 /**
- * Returns an object that formats `std::initializer_list` with elements
- * separated by `sep`.
- *
- * **Example**:
- *
- *     fmt::print("{}", fmt::join({1, 2, 3}, ", "));
- *     // Output: "1, 2, 3"
+  \rst
+  Returns an object that formats `initializer_list` with elements separated by
+  `sep`.
+
+  **Example**::
+
+    fmt::print("{}", fmt::join({1, 2, 3}, ", "));
+    // Output: "1, 2, 3"
+  \endrst
  */
 template <typename T>
 auto join(std::initializer_list<T> list, string_view sep)
diff --git a/src/fmt/std.h b/src/fmt/std.h
index 54eb2c2a73..7cff115920 100644
--- a/src/fmt/std.h
+++ b/src/fmt/std.h
@@ -8,49 +8,39 @@
 #ifndef FMT_STD_H_
 #define FMT_STD_H_
 
+#include <atomic>
+#include <bitset>
+#include <cstdlib>
+#include <exception>
+#include <memory>
+#include <thread>
+#include <type_traits>
+#include <typeinfo>
+#include <utility>
+#include <vector>
+
 #include "format.h"
 #include "ostream.h"
 
-#ifndef FMT_MODULE
-#  include <atomic>
-#  include <bitset>
-#  include <complex>
-#  include <cstdlib>
-#  include <exception>
-#  include <functional>
-#  include <memory>
-#  include <thread>
-#  include <type_traits>
-#  include <typeinfo>
-#  include <utility>
-#  include <vector>
-
-// Check FMT_CPLUSPLUS to suppress a bogus warning in MSVC.
-#  if FMT_CPLUSPLUS >= 201703L
-#    if FMT_HAS_INCLUDE(<filesystem>) && \
-        (!defined(FMT_CPP_LIB_FILESYSTEM) || FMT_CPP_LIB_FILESYSTEM != 0)
-#      include <filesystem>
-#    endif
-#    if FMT_HAS_INCLUDE(<variant>)
-#      include <variant>
-#    endif
-#    if FMT_HAS_INCLUDE(<optional>)
-#      include <optional>
-#    endif
-#  endif
-// Use > instead of >= in the version check because <source_location> may be
-// available after C++17 but before C++20 is marked as implemented.
-#  if FMT_CPLUSPLUS > 201703L && FMT_HAS_INCLUDE(<source_location>)
-#    include <source_location>
-#  endif
-#  if FMT_CPLUSPLUS > 202002L && FMT_HAS_INCLUDE(<expected>)
-#    include <expected>
-#  endif
-#endif  // FMT_MODULE
-
 #if FMT_HAS_INCLUDE(<version>)
 #  include <version>
 #endif
+// Checking FMT_CPLUSPLUS for warning suppression in MSVC.
+#if FMT_CPLUSPLUS >= 201703L
+#  if FMT_HAS_INCLUDE(<filesystem>)
+#    include <filesystem>
+#  endif
+#  if FMT_HAS_INCLUDE(<variant>)
+#    include <variant>
+#  endif
+#  if FMT_HAS_INCLUDE(<optional>)
+#    include <optional>
+#  endif
+#endif
+
+#if FMT_CPLUSPLUS > 201703L && FMT_HAS_INCLUDE(<source_location>)
+#  include <source_location>
+#endif
 
 // GCC 4 does not support FMT_HAS_INCLUDE.
 #if FMT_HAS_INCLUDE(<cxxabi.h>) || defined(__GLIBCXX__)
@@ -62,6 +52,17 @@
 #  endif
 #endif
 
+// Check if typeid is available.
+#ifndef FMT_USE_TYPEID
+// __RTTI is for EDG compilers. In MSVC typeid is available without RTTI.
+#  if defined(__GXX_RTTI) || FMT_HAS_FEATURE(cxx_rtti) || FMT_MSC_VERSION || \
+      defined(__INTEL_RTTI__) || defined(__RTTI)
+#    define FMT_USE_TYPEID 1
+#  else
+#    define FMT_USE_TYPEID 0
+#  endif
+#endif
+
 // For older Xcode versions, __cpp_lib_xxx flags are inaccurately defined.
 #ifndef FMT_CPP_LIB_FILESYSTEM
 #  ifdef __cpp_lib_filesystem
@@ -116,7 +117,7 @@ void write_escaped_path(basic_memory_buffer<Char>& quoted,
 FMT_EXPORT
 template <typename Char> struct formatter<std::filesystem::path, Char> {
  private:
-  format_specs specs_;
+  format_specs<Char> specs_;
   detail::arg_ref<Char> width_ref_;
   bool debug_ = false;
   char path_type_ = 0;
@@ -124,33 +125,33 @@ template <typename Char> struct formatter<std::filesystem::path, Char> {
  public:
   FMT_CONSTEXPR void set_debug_format(bool set = true) { debug_ = set; }
 
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) {
+  template <typename ParseContext> FMT_CONSTEXPR auto parse(ParseContext& ctx) {
     auto it = ctx.begin(), end = ctx.end();
     if (it == end) return it;
 
     it = detail::parse_align(it, end, specs_);
     if (it == end) return it;
 
-    Char c = *it;
-    if ((c >= '0' && c <= '9') || c == '{')
-      it = detail::parse_width(it, end, specs_, width_ref_, ctx);
+    it = detail::parse_dynamic_spec(it, end, specs_.width, width_ref_, ctx);
     if (it != end && *it == '?') {
       debug_ = true;
       ++it;
     }
-    if (it != end && (*it == 'g')) path_type_ = detail::to_ascii(*it++);
+    if (it != end && (*it == 'g')) path_type_ = *it++;
     return it;
   }
 
   template <typename FormatContext>
   auto format(const std::filesystem::path& p, FormatContext& ctx) const {
     auto specs = specs_;
-    auto path_string =
-        !path_type_ ? p.native()
-                    : p.generic_string<std::filesystem::path::value_type>();
+#  ifdef _WIN32
+    auto path_string = !path_type_ ? p.native() : p.generic_wstring();
+#  else
+    auto path_string = !path_type_ ? p.native() : p.generic_string();
+#  endif
 
-    detail::handle_dynamic_spec(specs.dynamic_width(), specs.width, width_ref_,
-                                ctx);
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref_,
+                                                       ctx);
     if (!debug_) {
       auto s = detail::get_path_string<Char>(p, path_string);
       return detail::write(ctx.out(), basic_string_view<Char>(s), specs);
@@ -162,30 +163,13 @@ template <typename Char> struct formatter<std::filesystem::path, Char> {
                          specs);
   }
 };
-
-class path : public std::filesystem::path {
- public:
-  auto display_string() const -> std::string {
-    const std::filesystem::path& base = *this;
-    return fmt::format(FMT_STRING("{}"), base);
-  }
-  auto system_string() const -> std::string { return string(); }
-
-  auto generic_display_string() const -> std::string {
-    const std::filesystem::path& base = *this;
-    return fmt::format(FMT_STRING("{:g}"), base);
-  }
-  auto generic_system_string() const -> std::string { return generic_string(); }
-};
-
 FMT_END_NAMESPACE
 #endif  // FMT_CPP_LIB_FILESYSTEM
 
 FMT_BEGIN_NAMESPACE
 FMT_EXPORT
 template <std::size_t N, typename Char>
-struct formatter<std::bitset<N>, Char>
-    : nested_formatter<basic_string_view<Char>, Char> {
+struct formatter<std::bitset<N>, Char> : nested_formatter<string_view> {
  private:
   // Functor because C++11 doesn't support generic lambdas.
   struct writer {
@@ -205,7 +189,7 @@ struct formatter<std::bitset<N>, Char>
   template <typename FormatContext>
   auto format(const std::bitset<N>& bs, FormatContext& ctx) const
       -> decltype(ctx.out()) {
-    return this->write_padded(ctx, writer{bs});
+    return write_padded(ctx, writer{bs});
   }
 };
 
@@ -238,7 +222,7 @@ struct formatter<std::optional<T>, Char,
   FMT_CONSTEXPR static void maybe_set_debug_format(U&, ...) {}
 
  public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) {
+  template <typename ParseContext> FMT_CONSTEXPR auto parse(ParseContext& ctx) {
     maybe_set_debug_format(underlying_, true);
     return underlying_.parse(ctx);
   }
@@ -258,62 +242,13 @@ struct formatter<std::optional<T>, Char,
 FMT_END_NAMESPACE
 #endif  // __cpp_lib_optional
 
-#if defined(__cpp_lib_expected) || FMT_CPP_LIB_VARIANT
-
-FMT_BEGIN_NAMESPACE
-namespace detail {
-
-template <typename Char, typename OutputIt, typename T>
-auto write_escaped_alternative(OutputIt out, const T& v) -> OutputIt {
-  if constexpr (has_to_string_view<T>::value)
-    return write_escaped_string<Char>(out, detail::to_string_view(v));
-  if constexpr (std::is_same_v<T, Char>) return write_escaped_char(out, v);
-  return write<Char>(out, v);
-}
-
-}  // namespace detail
-
-FMT_END_NAMESPACE
-#endif
-
-#ifdef __cpp_lib_expected
-FMT_BEGIN_NAMESPACE
-
-FMT_EXPORT
-template <typename T, typename E, typename Char>
-struct formatter<std::expected<T, E>, Char,
-                 std::enable_if_t<(std::is_void<T>::value ||
-                                   is_formattable<T, Char>::value) &&
-                                  is_formattable<E, Char>::value>> {
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    return ctx.begin();
-  }
-
-  template <typename FormatContext>
-  auto format(const std::expected<T, E>& value, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    auto out = ctx.out();
-
-    if (value.has_value()) {
-      out = detail::write<Char>(out, "expected(");
-      if constexpr (!std::is_void<T>::value)
-        out = detail::write_escaped_alternative<Char>(out, *value);
-    } else {
-      out = detail::write<Char>(out, "unexpected(");
-      out = detail::write_escaped_alternative<Char>(out, value.error());
-    }
-    *out++ = ')';
-    return out;
-  }
-};
-FMT_END_NAMESPACE
-#endif  // __cpp_lib_expected
-
 #ifdef __cpp_lib_source_location
 FMT_BEGIN_NAMESPACE
 FMT_EXPORT
 template <> struct formatter<std::source_location> {
-  FMT_CONSTEXPR auto parse(parse_context<>& ctx) { return ctx.begin(); }
+  template <typename ParseContext> FMT_CONSTEXPR auto parse(ParseContext& ctx) {
+    return ctx.begin();
+  }
 
   template <typename FormatContext>
   auto format(const std::source_location& loc, FormatContext& ctx) const
@@ -356,6 +291,16 @@ template <typename T, typename C> class is_variant_formattable_ {
       decltype(check(variant_index_sequence<T>{}))::value;
 };
 
+template <typename Char, typename OutputIt, typename T>
+auto write_variant_alternative(OutputIt out, const T& v) -> OutputIt {
+  if constexpr (is_string<T>::value)
+    return write_escaped_string<Char>(out, detail::to_string_view(v));
+  else if constexpr (std::is_same_v<T, Char>)
+    return write_escaped_char(out, v);
+  else
+    return write<Char>(out, v);
+}
+
 }  // namespace detail
 
 template <typename T> struct is_variant_like {
@@ -369,7 +314,8 @@ template <typename T, typename C> struct is_variant_formattable {
 
 FMT_EXPORT
 template <typename Char> struct formatter<std::monostate, Char> {
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     return ctx.begin();
   }
 
@@ -386,7 +332,8 @@ struct formatter<
     Variant, Char,
     std::enable_if_t<std::conjunction_v<
         is_variant_like<Variant>, is_variant_formattable<Variant, Char>>>> {
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     return ctx.begin();
   }
 
@@ -399,7 +346,7 @@ struct formatter<
     FMT_TRY {
       std::visit(
           [&](const auto& v) {
-            out = detail::write_escaped_alternative<Char>(out, v);
+            out = detail::write_variant_alternative<Char>(out, v);
           },
           value);
     }
@@ -415,127 +362,22 @@ FMT_END_NAMESPACE
 
 FMT_BEGIN_NAMESPACE
 FMT_EXPORT
-template <> struct formatter<std::error_code> {
- private:
-  format_specs specs_;
-  detail::arg_ref<char> width_ref_;
-
- public:
-  FMT_CONSTEXPR auto parse(parse_context<>& ctx) -> const char* {
-    auto it = ctx.begin(), end = ctx.end();
-    if (it == end) return it;
-
-    it = detail::parse_align(it, end, specs_);
-    if (it == end) return it;
-
-    char c = *it;
-    if ((c >= '0' && c <= '9') || c == '{')
-      it = detail::parse_width(it, end, specs_, width_ref_, ctx);
-    return it;
-  }
-
-  template <typename FormatContext>
-  FMT_CONSTEXPR20 auto format(const std::error_code& ec,
-                              FormatContext& ctx) const -> decltype(ctx.out()) {
-    auto specs = specs_;
-    detail::handle_dynamic_spec(specs.dynamic_width(), specs.width, width_ref_,
-                                ctx);
-    memory_buffer buf;
-    buf.append(string_view(ec.category().name()));
-    buf.push_back(':');
-    detail::write<char>(appender(buf), ec.value());
-    return detail::write<char>(ctx.out(), string_view(buf.data(), buf.size()),
-                               specs);
-  }
-};
-
-#if FMT_USE_RTTI
-namespace detail {
-
-template <typename Char, typename OutputIt>
-auto write_demangled_name(OutputIt out, const std::type_info& ti) -> OutputIt {
-#  ifdef FMT_HAS_ABI_CXA_DEMANGLE
-  int status = 0;
-  std::size_t size = 0;
-  std::unique_ptr<char, void (*)(void*)> demangled_name_ptr(
-      abi::__cxa_demangle(ti.name(), nullptr, &size, &status), &std::free);
-
-  string_view demangled_name_view;
-  if (demangled_name_ptr) {
-    demangled_name_view = demangled_name_ptr.get();
-
-    // Normalization of stdlib inline namespace names.
-    // libc++ inline namespaces.
-    //  std::__1::*       -> std::*
-    //  std::__1::__fs::* -> std::*
-    // libstdc++ inline namespaces.
-    //  std::__cxx11::*             -> std::*
-    //  std::filesystem::__cxx11::* -> std::filesystem::*
-    if (demangled_name_view.starts_with("std::")) {
-      char* begin = demangled_name_ptr.get();
-      char* to = begin + 5;  // std::
-      for (char *from = to, *end = begin + demangled_name_view.size();
-           from < end;) {
-        // This is safe, because demangled_name is NUL-terminated.
-        if (from[0] == '_' && from[1] == '_') {
-          char* next = from + 1;
-          while (next < end && *next != ':') next++;
-          if (next[0] == ':' && next[1] == ':') {
-            from = next + 2;
-            continue;
-          }
-        }
-        *to++ = *from++;
-      }
-      demangled_name_view = {begin, detail::to_unsigned(to - begin)};
-    }
-  } else {
-    demangled_name_view = string_view(ti.name());
-  }
-  return detail::write_bytes<Char>(out, demangled_name_view);
-#  elif FMT_MSC_VERSION
-  const string_view demangled_name(ti.name());
-  for (std::size_t i = 0; i < demangled_name.size(); ++i) {
-    auto sub = demangled_name;
-    sub.remove_prefix(i);
-    if (sub.starts_with("enum ")) {
-      i += 4;
-      continue;
-    }
-    if (sub.starts_with("class ") || sub.starts_with("union ")) {
-      i += 5;
-      continue;
-    }
-    if (sub.starts_with("struct ")) {
-      i += 6;
-      continue;
-    }
-    if (*sub.begin() != ' ') *out++ = *sub.begin();
-  }
-  return out;
-#  else
-  return detail::write_bytes<Char>(out, string_view(ti.name()));
-#  endif
-}
-
-}  // namespace detail
-
-FMT_EXPORT
-template <typename Char>
-struct formatter<std::type_info, Char  // DEPRECATED! Mixing code unit types.
-                 > {
- public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
+template <typename Char> struct formatter<std::error_code, Char> {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     return ctx.begin();
   }
 
-  template <typename Context>
-  auto format(const std::type_info& ti, Context& ctx) const
+  template <typename FormatContext>
+  FMT_CONSTEXPR auto format(const std::error_code& ec, FormatContext& ctx) const
       -> decltype(ctx.out()) {
-    return detail::write_demangled_name<Char>(ctx.out(), ti);
+    auto out = ctx.out();
+    out = detail::write_bytes(out, ec.category().name(), format_specs<Char>());
+    out = detail::write<Char>(out, Char(':'));
+    out = detail::write<Char>(out, ec.value());
+    return out;
   }
 };
-#endif
 
 FMT_EXPORT
 template <typename T, typename Char>
@@ -546,29 +388,81 @@ struct formatter<
   bool with_typename_ = false;
 
  public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
     auto it = ctx.begin();
     auto end = ctx.end();
     if (it == end || *it == '}') return it;
     if (*it == 't') {
       ++it;
-      with_typename_ = FMT_USE_RTTI != 0;
+      with_typename_ = FMT_USE_TYPEID != 0;
     }
     return it;
   }
 
-  template <typename Context>
-  auto format(const std::exception& ex, Context& ctx) const
-      -> decltype(ctx.out()) {
+  template <typename OutputIt>
+  auto format(const std::exception& ex,
+              basic_format_context<OutputIt, Char>& ctx) const -> OutputIt {
+    format_specs<Char> spec;
     auto out = ctx.out();
-#if FMT_USE_RTTI
-    if (with_typename_) {
-      out = detail::write_demangled_name<Char>(out, typeid(ex));
-      *out++ = ':';
-      *out++ = ' ';
+    if (!with_typename_)
+      return detail::write_bytes(out, string_view(ex.what()), spec);
+
+#if FMT_USE_TYPEID
+    const std::type_info& ti = typeid(ex);
+#  ifdef FMT_HAS_ABI_CXA_DEMANGLE
+    int status = 0;
+    std::size_t size = 0;
+    std::unique_ptr<char, void (*)(void*)> demangled_name_ptr(
+        abi::__cxa_demangle(ti.name(), nullptr, &size, &status), &std::free);
+
+    string_view demangled_name_view;
+    if (demangled_name_ptr) {
+      demangled_name_view = demangled_name_ptr.get();
+
+      // Normalization of stdlib inline namespace names.
+      // libc++ inline namespaces.
+      //  std::__1::*       -> std::*
+      //  std::__1::__fs::* -> std::*
+      // libstdc++ inline namespaces.
+      //  std::__cxx11::*             -> std::*
+      //  std::filesystem::__cxx11::* -> std::filesystem::*
+      if (demangled_name_view.starts_with("std::")) {
+        char* begin = demangled_name_ptr.get();
+        char* to = begin + 5;  // std::
+        for (char *from = to, *end = begin + demangled_name_view.size();
+             from < end;) {
+          // This is safe, because demangled_name is NUL-terminated.
+          if (from[0] == '_' && from[1] == '_') {
+            char* next = from + 1;
+            while (next < end && *next != ':') next++;
+            if (next[0] == ':' && next[1] == ':') {
+              from = next + 2;
+              continue;
+            }
+          }
+          *to++ = *from++;
+        }
+        demangled_name_view = {begin, detail::to_unsigned(to - begin)};
+      }
+    } else {
+      demangled_name_view = string_view(ti.name());
     }
+    out = detail::write_bytes(out, demangled_name_view, spec);
+#  elif FMT_MSC_VERSION
+    string_view demangled_name_view(ti.name());
+    if (demangled_name_view.starts_with("class "))
+      demangled_name_view.remove_prefix(6);
+    else if (demangled_name_view.starts_with("struct "))
+      demangled_name_view.remove_prefix(7);
+    out = detail::write_bytes(out, demangled_name_view, spec);
+#  else
+    out = detail::write_bytes(out, string_view(ti.name()), spec);
+#  endif
+    *out++ = ':';
+    *out++ = ' ';
+    return detail::write_bytes(out, string_view(ex.what()), spec);
 #endif
-    return detail::write_bytes<Char>(out, string_view(ex.what()));
   }
 };
 
@@ -615,14 +509,6 @@ struct formatter<BitRef, Char,
   }
 };
 
-template <typename T, typename Deleter>
-auto ptr(const std::unique_ptr<T, Deleter>& p) -> const void* {
-  return p.get();
-}
-template <typename T> auto ptr(const std::shared_ptr<T>& p) -> const void* {
-  return p.get();
-}
-
 FMT_EXPORT
 template <typename T, typename Char>
 struct formatter<std::atomic<T>, Char,
@@ -647,80 +533,5 @@ struct formatter<std::atomic_flag, Char> : formatter<bool, Char> {
 };
 #endif  // __cpp_lib_atomic_flag_test
 
-FMT_EXPORT
-template <typename T, typename Char> struct formatter<std::complex<T>, Char> {
- private:
-  detail::dynamic_format_specs<Char> specs_;
-
-  template <typename FormatContext, typename OutputIt>
-  FMT_CONSTEXPR auto do_format(const std::complex<T>& c,
-                               detail::dynamic_format_specs<Char>& specs,
-                               FormatContext& ctx, OutputIt out) const
-      -> OutputIt {
-    if (c.real() != 0) {
-      *out++ = Char('(');
-      out = detail::write<Char>(out, c.real(), specs, ctx.locale());
-      specs.set_sign(sign::plus);
-      out = detail::write<Char>(out, c.imag(), specs, ctx.locale());
-      if (!detail::isfinite(c.imag())) *out++ = Char(' ');
-      *out++ = Char('i');
-      *out++ = Char(')');
-      return out;
-    }
-    out = detail::write<Char>(out, c.imag(), specs, ctx.locale());
-    if (!detail::isfinite(c.imag())) *out++ = Char(' ');
-    *out++ = Char('i');
-    return out;
-  }
-
- public:
-  FMT_CONSTEXPR auto parse(parse_context<Char>& ctx) -> const Char* {
-    if (ctx.begin() == ctx.end() || *ctx.begin() == '}') return ctx.begin();
-    return parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx,
-                              detail::type_constant<T, Char>::value);
-  }
-
-  template <typename FormatContext>
-  auto format(const std::complex<T>& c, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    auto specs = specs_;
-    if (specs.dynamic()) {
-      detail::handle_dynamic_spec(specs.dynamic_width(), specs.width,
-                                  specs.width_ref, ctx);
-      detail::handle_dynamic_spec(specs.dynamic_precision(), specs.precision,
-                                  specs.precision_ref, ctx);
-    }
-
-    if (specs.width == 0) return do_format(c, specs, ctx, ctx.out());
-    auto buf = basic_memory_buffer<Char>();
-
-    auto outer_specs = format_specs();
-    outer_specs.width = specs.width;
-    outer_specs.copy_fill_from(specs);
-    outer_specs.set_align(specs.align());
-
-    specs.width = 0;
-    specs.set_fill({});
-    specs.set_align(align::none);
-
-    do_format(c, specs, ctx, basic_appender<Char>(buf));
-    return detail::write<Char>(ctx.out(),
-                               basic_string_view<Char>(buf.data(), buf.size()),
-                               outer_specs);
-  }
-};
-
-FMT_EXPORT
-template <typename T, typename Char>
-struct formatter<std::reference_wrapper<T>, Char,
-                 enable_if_t<is_formattable<remove_cvref_t<T>, Char>::value>>
-    : formatter<remove_cvref_t<T>, Char> {
-  template <typename FormatContext>
-  auto format(std::reference_wrapper<T> ref, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    return formatter<remove_cvref_t<T>, Char>::format(ref.get(), ctx);
-  }
-};
-
 FMT_END_NAMESPACE
 #endif  // FMT_STD_H_
diff --git a/src/fmt/xchar.h b/src/fmt/xchar.h
index 9f7f889d64..1e791bb07b 100644
--- a/src/fmt/xchar.h
+++ b/src/fmt/xchar.h
@@ -8,16 +8,13 @@
 #ifndef FMT_XCHAR_H_
 #define FMT_XCHAR_H_
 
-#include "color.h"
+#include <cwchar>
+
 #include "format.h"
-#include "ostream.h"
 #include "ranges.h"
 
-#ifndef FMT_MODULE
-#  include <cwchar>
-#  if FMT_USE_LOCALE
-#    include <locale>
-#  endif
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
+#  include <locale>
 #endif
 
 FMT_BEGIN_NAMESPACE
@@ -26,26 +23,10 @@ namespace detail {
 template <typename T>
 using is_exotic_char = bool_constant<!std::is_same<T, char>::value>;
 
-template <typename S, typename = void> struct format_string_char {};
-
-template <typename S>
-struct format_string_char<
-    S, void_t<decltype(sizeof(detail::to_string_view(std::declval<S>())))>> {
-  using type = char_t<S>;
-};
-
-template <typename S>
-struct format_string_char<
-    S, enable_if_t<std::is_base_of<detail::compile_string, S>::value>> {
-  using type = typename S::char_type;
-};
-
-template <typename S>
-using format_string_char_t = typename format_string_char<S>::type;
-
-inline auto write_loc(basic_appender<wchar_t> out, loc_value value,
-                      const format_specs& specs, locale_ref loc) -> bool {
-#if FMT_USE_LOCALE
+inline auto write_loc(back_insert_iterator<detail::buffer<wchar_t>> out,
+                      loc_value value, const format_specs<wchar_t>& specs,
+                      locale_ref loc) -> bool {
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
   auto& numpunct =
       std::use_facet<std::numpunct<wchar_t>>(loc.get<std::locale>());
   auto separator = std::wstring();
@@ -60,79 +41,42 @@ inline auto write_loc(basic_appender<wchar_t> out, loc_value value,
 FMT_BEGIN_EXPORT
 
 using wstring_view = basic_string_view<wchar_t>;
-using wformat_parse_context = parse_context<wchar_t>;
-using wformat_context = buffered_context<wchar_t>;
+using wformat_parse_context = basic_format_parse_context<wchar_t>;
+using wformat_context = buffer_context<wchar_t>;
 using wformat_args = basic_format_args<wformat_context>;
 using wmemory_buffer = basic_memory_buffer<wchar_t>;
 
-template <typename Char, typename... T> struct basic_fstring {
- private:
-  basic_string_view<Char> str_;
-
-  static constexpr int num_static_named_args =
-      detail::count_static_named_args<T...>();
-
-  using checker = detail::format_string_checker<
-      Char, static_cast<int>(sizeof...(T)), num_static_named_args,
-      num_static_named_args != detail::count_named_args<T...>()>;
-
-  using arg_pack = detail::arg_pack<T...>;
-
- public:
-  using t = basic_fstring;
-
-  template <typename S,
-            FMT_ENABLE_IF(
-                std::is_convertible<const S&, basic_string_view<Char>>::value)>
-  FMT_CONSTEVAL FMT_ALWAYS_INLINE basic_fstring(const S& s) : str_(s) {
-    if (FMT_USE_CONSTEVAL)
-      detail::parse_format_string<Char>(s, checker(s, arg_pack()));
-  }
-  template <typename S,
-            FMT_ENABLE_IF(std::is_base_of<detail::compile_string, S>::value&&
-                              std::is_same<typename S::char_type, Char>::value)>
-  FMT_ALWAYS_INLINE basic_fstring(const S&) : str_(S()) {
-    FMT_CONSTEXPR auto sv = basic_string_view<Char>(S());
-    FMT_CONSTEXPR int ignore =
-        (parse_format_string(sv, checker(sv, arg_pack())), 0);
-    detail::ignore_unused(ignore);
-  }
-  basic_fstring(runtime_format_string<Char> fmt) : str_(fmt.str) {}
-
-  operator basic_string_view<Char>() const { return str_; }
-  auto get() const -> basic_string_view<Char> { return str_; }
-};
-
-template <typename Char, typename... T>
-using basic_format_string = basic_fstring<Char, T...>;
-
-template <typename... T>
-using wformat_string = typename basic_format_string<wchar_t, T...>::t;
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+// Workaround broken conversion on older gcc.
+template <typename... Args> using wformat_string = wstring_view;
+inline auto runtime(wstring_view s) -> wstring_view { return s; }
+#else
+template <typename... Args>
+using wformat_string = basic_format_string<wchar_t, type_identity_t<Args>...>;
 inline auto runtime(wstring_view s) -> runtime_format_string<wchar_t> {
   return {{s}};
 }
+#endif
 
 template <> struct is_char<wchar_t> : std::true_type {};
+template <> struct is_char<detail::char8_type> : std::true_type {};
 template <> struct is_char<char16_t> : std::true_type {};
 template <> struct is_char<char32_t> : std::true_type {};
 
-#ifdef __cpp_char8_t
-template <> struct is_char<char8_t> : bool_constant<detail::is_utf8_enabled> {};
-#endif
-
 template <typename... T>
-constexpr auto make_wformat_args(T&... args)
-    -> decltype(fmt::make_format_args<wformat_context>(args...)) {
-  return fmt::make_format_args<wformat_context>(args...);
+constexpr auto make_wformat_args(const T&... args)
+    -> format_arg_store<wformat_context, T...> {
+  return {args...};
 }
 
-#if !FMT_USE_NONTYPE_TEMPLATE_ARGS
 inline namespace literals {
-inline auto operator""_a(const wchar_t* s, size_t) -> detail::udl_arg<wchar_t> {
+#if FMT_USE_USER_DEFINED_LITERALS && !FMT_USE_NONTYPE_TEMPLATE_ARGS
+constexpr auto operator""_a(const wchar_t* s, size_t)
+    -> detail::udl_arg<wchar_t> {
   return {s};
 }
-}  // namespace literals
 #endif
+}  // namespace literals
 
 template <typename It, typename Sentinel>
 auto join(It begin, Sentinel end, wstring_view sep)
@@ -140,9 +84,9 @@ auto join(It begin, Sentinel end, wstring_view sep)
   return {begin, end, sep};
 }
 
-template <typename Range, FMT_ENABLE_IF(!is_tuple_like<Range>::value)>
+template <typename Range>
 auto join(Range&& range, wstring_view sep)
-    -> join_view<decltype(std::begin(range)), decltype(std::end(range)),
+    -> join_view<detail::iterator_t<Range>, detail::sentinel_t<Range>,
                  wchar_t> {
   return join(std::begin(range), std::end(range), sep);
 }
@@ -153,19 +97,19 @@ auto join(std::initializer_list<T> list, wstring_view sep)
   return join(std::begin(list), std::end(list), sep);
 }
 
-template <typename Tuple, FMT_ENABLE_IF(is_tuple_like<Tuple>::value)>
-auto join(const Tuple& tuple, basic_string_view<wchar_t> sep)
-    -> tuple_join_view<wchar_t, Tuple> {
+template <typename... T>
+auto join(const std::tuple<T...>& tuple, basic_string_view<wchar_t> sep)
+    -> tuple_join_view<wchar_t, T...> {
   return {tuple, sep};
 }
 
 template <typename Char, FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
-auto vformat(basic_string_view<Char> fmt,
-             typename detail::vformat_args<Char>::type args)
+auto vformat(basic_string_view<Char> format_str,
+             basic_format_args<buffer_context<type_identity_t<Char>>> args)
     -> std::basic_string<Char> {
   auto buf = basic_memory_buffer<Char>();
-  detail::vformat_to(buf, fmt, args);
-  return {buf.data(), buf.size()};
+  detail::vformat_to(buf, format_str, args);
+  return to_string(buf);
 }
 
 template <typename... T>
@@ -173,122 +117,110 @@ auto format(wformat_string<T...> fmt, T&&... args) -> std::wstring {
   return vformat(fmt::wstring_view(fmt), fmt::make_wformat_args(args...));
 }
 
-template <typename OutputIt, typename... T>
-auto format_to(OutputIt out, wformat_string<T...> fmt, T&&... args)
-    -> OutputIt {
-  return vformat_to(out, fmt::wstring_view(fmt),
-                    fmt::make_wformat_args(args...));
-}
-
 // Pass char_t as a default template parameter instead of using
 // std::basic_string<char_t<S>> to reduce the symbol size.
-template <typename S, typename... T,
-          typename Char = detail::format_string_char_t<S>,
+template <typename S, typename... T, typename Char = char_t<S>,
           FMT_ENABLE_IF(!std::is_same<Char, char>::value &&
                         !std::is_same<Char, wchar_t>::value)>
-auto format(const S& fmt, T&&... args) -> std::basic_string<Char> {
-  return vformat(detail::to_string_view(fmt),
-                 fmt::make_format_args<buffered_context<Char>>(args...));
+auto format(const S& format_str, T&&... args) -> std::basic_string<Char> {
+  return vformat(detail::to_string_view(format_str),
+                 fmt::make_format_args<buffer_context<Char>>(args...));
 }
 
-template <typename Locale, typename S,
-          typename Char = detail::format_string_char_t<S>,
+template <typename Locale, typename S, typename Char = char_t<S>,
           FMT_ENABLE_IF(detail::is_locale<Locale>::value&&
                             detail::is_exotic_char<Char>::value)>
-inline auto vformat(const Locale& loc, const S& fmt,
-                    typename detail::vformat_args<Char>::type args)
+inline auto vformat(
+    const Locale& loc, const S& format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args)
     -> std::basic_string<Char> {
-  auto buf = basic_memory_buffer<Char>();
-  detail::vformat_to(buf, detail::to_string_view(fmt), args,
-                     detail::locale_ref(loc));
-  return {buf.data(), buf.size()};
+  return detail::vformat(loc, detail::to_string_view(format_str), args);
 }
 
-template <typename Locale, typename S, typename... T,
-          typename Char = detail::format_string_char_t<S>,
+template <typename Locale, typename S, typename... T, typename Char = char_t<S>,
           FMT_ENABLE_IF(detail::is_locale<Locale>::value&&
                             detail::is_exotic_char<Char>::value)>
-inline auto format(const Locale& loc, const S& fmt, T&&... args)
+inline auto format(const Locale& loc, const S& format_str, T&&... args)
     -> std::basic_string<Char> {
-  return vformat(loc, detail::to_string_view(fmt),
-                 fmt::make_format_args<buffered_context<Char>>(args...));
+  return detail::vformat(loc, detail::to_string_view(format_str),
+                         fmt::make_format_args<buffer_context<Char>>(args...));
 }
 
-template <typename OutputIt, typename S,
-          typename Char = detail::format_string_char_t<S>,
+template <typename OutputIt, typename S, typename Char = char_t<S>,
           FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
                             detail::is_exotic_char<Char>::value)>
-auto vformat_to(OutputIt out, const S& fmt,
-                typename detail::vformat_args<Char>::type args) -> OutputIt {
+auto vformat_to(OutputIt out, const S& format_str,
+                basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> OutputIt {
   auto&& buf = detail::get_buffer<Char>(out);
-  detail::vformat_to(buf, detail::to_string_view(fmt), args);
+  detail::vformat_to(buf, detail::to_string_view(format_str), args);
   return detail::get_iterator(buf, out);
 }
 
 template <typename OutputIt, typename S, typename... T,
-          typename Char = detail::format_string_char_t<S>,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value &&
-                        !std::is_same<Char, char>::value &&
-                        !std::is_same<Char, wchar_t>::value)>
+          typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
+                            detail::is_exotic_char<Char>::value)>
 inline auto format_to(OutputIt out, const S& fmt, T&&... args) -> OutputIt {
   return vformat_to(out, detail::to_string_view(fmt),
-                    fmt::make_format_args<buffered_context<Char>>(args...));
+                    fmt::make_format_args<buffer_context<Char>>(args...));
 }
 
 template <typename Locale, typename S, typename OutputIt, typename... Args,
-          typename Char = detail::format_string_char_t<S>,
+          typename Char = char_t<S>,
           FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
                             detail::is_locale<Locale>::value&&
                                 detail::is_exotic_char<Char>::value)>
-inline auto vformat_to(OutputIt out, const Locale& loc, const S& fmt,
-                       typename detail::vformat_args<Char>::type args)
-    -> OutputIt {
+inline auto vformat_to(
+    OutputIt out, const Locale& loc, const S& format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) -> OutputIt {
   auto&& buf = detail::get_buffer<Char>(out);
-  vformat_to(buf, detail::to_string_view(fmt), args, detail::locale_ref(loc));
+  vformat_to(buf, detail::to_string_view(format_str), args,
+             detail::locale_ref(loc));
   return detail::get_iterator(buf, out);
 }
 
-template <typename Locale, typename OutputIt, typename S, typename... T,
-          typename Char = detail::format_string_char_t<S>,
+template <typename OutputIt, typename Locale, typename S, typename... T,
+          typename Char = char_t<S>,
           bool enable = detail::is_output_iterator<OutputIt, Char>::value &&
                         detail::is_locale<Locale>::value &&
                         detail::is_exotic_char<Char>::value>
-inline auto format_to(OutputIt out, const Locale& loc, const S& fmt,
+inline auto format_to(OutputIt out, const Locale& loc, const S& format_str,
                       T&&... args) ->
     typename std::enable_if<enable, OutputIt>::type {
-  return vformat_to(out, loc, detail::to_string_view(fmt),
-                    fmt::make_format_args<buffered_context<Char>>(args...));
+  return vformat_to(out, loc, detail::to_string_view(format_str),
+                    fmt::make_format_args<buffer_context<Char>>(args...));
 }
 
 template <typename OutputIt, typename Char, typename... Args,
           FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
                             detail::is_exotic_char<Char>::value)>
-inline auto vformat_to_n(OutputIt out, size_t n, basic_string_view<Char> fmt,
-                         typename detail::vformat_args<Char>::type args)
+inline auto vformat_to_n(
+    OutputIt out, size_t n, basic_string_view<Char> format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args)
     -> format_to_n_result<OutputIt> {
   using traits = detail::fixed_buffer_traits;
   auto buf = detail::iterator_buffer<OutputIt, Char, traits>(out, n);
-  detail::vformat_to(buf, fmt, args);
+  detail::vformat_to(buf, format_str, args);
   return {buf.out(), buf.count()};
 }
 
 template <typename OutputIt, typename S, typename... T,
-          typename Char = detail::format_string_char_t<S>,
+          typename Char = char_t<S>,
           FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
                             detail::is_exotic_char<Char>::value)>
 inline auto format_to_n(OutputIt out, size_t n, const S& fmt, T&&... args)
     -> format_to_n_result<OutputIt> {
-  return vformat_to_n(out, n, fmt::basic_string_view<Char>(fmt),
-                      fmt::make_format_args<buffered_context<Char>>(args...));
+  return vformat_to_n(out, n, detail::to_string_view(fmt),
+                      fmt::make_format_args<buffer_context<Char>>(args...));
 }
 
-template <typename S, typename... T,
-          typename Char = detail::format_string_char_t<S>,
+template <typename S, typename... T, typename Char = char_t<S>,
           FMT_ENABLE_IF(detail::is_exotic_char<Char>::value)>
 inline auto formatted_size(const S& fmt, T&&... args) -> size_t {
   auto buf = detail::counting_buffer<Char>();
   detail::vformat_to(buf, detail::to_string_view(fmt),
-                     fmt::make_format_args<buffered_context<Char>>(args...));
+                     fmt::make_format_args<buffer_context<Char>>(args...));
   return buf.count();
 }
 
@@ -322,48 +254,9 @@ template <typename... T> void println(wformat_string<T...> fmt, T&&... args) {
   return print(L"{}\n", fmt::format(fmt, std::forward<T>(args)...));
 }
 
-inline auto vformat(const text_style& ts, wstring_view fmt, wformat_args args)
-    -> std::wstring {
-  auto buf = wmemory_buffer();
-  detail::vformat_to(buf, ts, fmt, args);
-  return {buf.data(), buf.size()};
-}
-
-template <typename... T>
-inline auto format(const text_style& ts, wformat_string<T...> fmt, T&&... args)
-    -> std::wstring {
-  return fmt::vformat(ts, fmt, fmt::make_wformat_args(args...));
-}
-
-template <typename... T>
-FMT_DEPRECATED void print(std::FILE* f, const text_style& ts,
-                          wformat_string<T...> fmt, const T&... args) {
-  vprint(f, ts, fmt, fmt::make_wformat_args(args...));
-}
-
-template <typename... T>
-FMT_DEPRECATED void print(const text_style& ts, wformat_string<T...> fmt,
-                          const T&... args) {
-  return print(stdout, ts, fmt, args...);
-}
-
-inline void vprint(std::wostream& os, wstring_view fmt, wformat_args args) {
-  auto buffer = basic_memory_buffer<wchar_t>();
-  detail::vformat_to(buffer, fmt, args);
-  detail::write_buffer(os, buffer);
-}
-
-template <typename... T>
-void print(std::wostream& os, wformat_string<T...> fmt, T&&... args) {
-  vprint(os, fmt, fmt::make_format_args<buffered_context<wchar_t>>(args...));
-}
-
-template <typename... T>
-void println(std::wostream& os, wformat_string<T...> fmt, T&&... args) {
-  print(os, L"{}\n", fmt::format(fmt, std::forward<T>(args)...));
-}
-
-/// Converts `value` to `std::wstring` using the default format for type `T`.
+/**
+  Converts *value* to ``std::wstring`` using the default format for type *T*.
+ */
 template <typename T> inline auto to_wstring(const T& value) -> std::wstring {
   return format(FMT_STRING(L"{}"), value);
 }
diff --git a/src/fmtlib_format.cpp b/src/fmtlib_format.cpp
index 966b6dce84..fd51cb6468 100644
--- a/src/fmtlib_format.cpp
+++ b/src/fmtlib_format.cpp
@@ -16,8 +16,7 @@ template FMT_API auto dragonbox::to_decimal(float x) noexcept
 template FMT_API auto dragonbox::to_decimal(double x) noexcept
     -> dragonbox::decimal_fp<double>;
 
-#if FMT_USE_LOCALE
-// DEPRECATED! locale_ref in the detail namespace
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
 template FMT_API locale_ref::locale_ref(const std::locale& loc);
 template FMT_API auto locale_ref::get<std::locale>() const -> std::locale;
 #endif
@@ -28,10 +27,8 @@ template FMT_API auto thousands_sep_impl(locale_ref)
     -> thousands_sep_result<char>;
 template FMT_API auto decimal_point_impl(locale_ref) -> char;
 
-// DEPRECATED!
 template FMT_API void buffer<char>::append(const char*, const char*);
 
-// DEPRECATED!
 template FMT_API void vformat_to(buffer<char>&, string_view,
                                  typename vformat_args<>::type, locale_ref);
 
diff --git a/src/fmtlib_os.cpp b/src/fmtlib_os.cpp
index 2bb9e94551..3338d13cae 100644
--- a/src/fmtlib_os.cpp
+++ b/src/fmtlib_os.cpp
@@ -13,51 +13,47 @@
 
 #include "fmt/os.h"
 
-#ifndef FMT_MODULE
-#  include <climits>
+#include <climits>
 
-#  if FMT_USE_FCNTL
-#    include <sys/stat.h>
-#    include <sys/types.h>
+#if FMT_USE_FCNTL
+#  include <sys/stat.h>
+#  include <sys/types.h>
 
-#    ifdef _WRS_KERNEL    // VxWorks7 kernel
-#      include <ioLib.h>  // getpagesize
-#    endif
-
-#    ifndef _WIN32
-#      include <unistd.h>
-#    else
-#      ifndef WIN32_LEAN_AND_MEAN
-#        define WIN32_LEAN_AND_MEAN
-#      endif
-#      include <io.h>
-#    endif  // _WIN32
-#  endif    // FMT_USE_FCNTL
-
-#  ifdef _WIN32
-#    include <windows.h>
+#  ifdef _WRS_KERNEL    // VxWorks7 kernel
+#    include <ioLib.h>  // getpagesize
 #  endif
-#endif
+
+#  ifndef _WIN32
+#    include <unistd.h>
+#  else
+#    ifndef WIN32_LEAN_AND_MEAN
+#      define WIN32_LEAN_AND_MEAN
+#    endif
+#    include <io.h>
+
+#    ifndef S_IRUSR
+#      define S_IRUSR _S_IREAD
+#    endif
+#    ifndef S_IWUSR
+#      define S_IWUSR _S_IWRITE
+#    endif
+#    ifndef S_IRGRP
+#      define S_IRGRP 0
+#    endif
+#    ifndef S_IWGRP
+#      define S_IWGRP 0
+#    endif
+#    ifndef S_IROTH
+#      define S_IROTH 0
+#    endif
+#    ifndef S_IWOTH
+#      define S_IWOTH 0
+#    endif
+#  endif  // _WIN32
+#endif    // FMT_USE_FCNTL
 
 #ifdef _WIN32
-#  ifndef S_IRUSR
-#    define S_IRUSR _S_IREAD
-#  endif
-#  ifndef S_IWUSR
-#    define S_IWUSR _S_IWRITE
-#  endif
-#  ifndef S_IRGRP
-#    define S_IRGRP 0
-#  endif
-#  ifndef S_IWGRP
-#    define S_IWGRP 0
-#  endif
-#  ifndef S_IROTH
-#    define S_IROTH 0
-#  endif
-#  ifndef S_IWOTH
-#    define S_IWOTH 0
-#  endif
+#  include <windows.h>
 #endif
 
 namespace {
@@ -161,7 +157,7 @@ void detail::format_windows_error(detail::buffer<char>& out, int error_code,
 }
 
 void report_windows_error(int error_code, const char* message) noexcept {
-  do_report_error(detail::format_windows_error, error_code, message);
+  report_error(detail::format_windows_error, error_code, message);
 }
 #endif  // _WIN32
 
@@ -187,14 +183,12 @@ void buffered_file::close() {
 }
 
 int buffered_file::descriptor() const {
-#ifdef FMT_HAS_SYSTEM
-  // fileno is a macro on OpenBSD.
-#  ifdef fileno
-#    undef fileno
-#  endif
+#if !defined(fileno)
   int fd = FMT_POSIX_CALL(fileno(file_));
-#elif defined(_WIN32)
-  int fd = _fileno(file_);
+#elif defined(FMT_HAS_SYSTEM)
+  // fileno is a macro on OpenBSD so we cannot use FMT_POSIX_CALL.
+#  define FMT_DISABLE_MACRO
+  int fd = FMT_SYSTEM(fileno FMT_DISABLE_MACRO(file_));
 #else
   int fd = fileno(file_);
 #endif
@@ -375,25 +369,30 @@ long getpagesize() {
 }
 #  endif
 
-void ostream::grow(buffer<char>& buf, size_t) {
-  if (buf.size() == buf.capacity()) static_cast<ostream&>(buf).flush();
+namespace detail {
+
+void file_buffer::grow(buffer<char>& buf, size_t) {
+  if (buf.size() == buf.capacity()) static_cast<file_buffer&>(buf).flush();
 }
 
-ostream::ostream(cstring_view path, const detail::ostream_params& params)
+file_buffer::file_buffer(cstring_view path, const ostream_params& params)
     : buffer<char>(grow), file_(path, params.oflag) {
   set(new char[params.buffer_size], params.buffer_size);
 }
 
-ostream::ostream(ostream&& other) noexcept
+file_buffer::file_buffer(file_buffer&& other)
     : buffer<char>(grow, other.data(), other.size(), other.capacity()),
       file_(std::move(other.file_)) {
   other.clear();
   other.set(nullptr, 0);
 }
 
-ostream::~ostream() {
+file_buffer::~file_buffer() {
   flush();
   delete[] data();
 }
+}  // namespace detail
+
+ostream::~ostream() = default;
 #endif  // FMT_USE_FCNTL
 FMT_END_NAMESPACE
diff --git a/src/info.cpp b/src/info.cpp
index bf98f77b58..17b1f417ea 100644
--- a/src/info.cpp
+++ b/src/info.cpp
@@ -270,25 +270,25 @@ void Info::command(int narg, char **arg)
 
   fputs("\nInfo-Info-Info-Info-Info-Info-Info-Info-Info-Info-Info\n",out);
   std::tm now = fmt::localtime(std::time(nullptr));
-  fmt::print(out,"Printed on {}", std::asctime(&now));
+  utils::print(out,"Printed on {}", std::asctime(&now));
 
   if (flags & CONFIG) {
-    fmt::print(out,"\nLAMMPS version: {} / {}\n", lmp->version, lmp->num_ver);
+    utils::print(out,"\nLAMMPS version: {} / {}\n", lmp->version, lmp->num_ver);
 
     if (LAMMPS::has_git_info())
-      fmt::print(out,"Git info: {} / {} / {}\n",
+      utils::print(out,"Git info: {} / {} / {}\n",
                  LAMMPS::git_branch(), LAMMPS::git_descriptor(),LAMMPS::git_commit());
 
-    fmt::print(out,"\nOS information: {}\n\n",platform::os_info());
+    utils::print(out,"\nOS information: {}\n\n",platform::os_info());
 
-    fmt::print(out,"sizeof(smallint): {}-bit\n"
+    utils::print(out,"sizeof(smallint): {}-bit\n"
                "sizeof(imageint): {}-bit\n"
                "sizeof(tagint):   {}-bit\n"
                "sizeof(bigint):   {}-bit\n",
                sizeof(smallint)*8, sizeof(imageint)*8,
                sizeof(tagint)*8, sizeof(bigint)*8);
 
-    fmt::print(out,"\nCompiler: {} with {}\nC++ standard: {}\n",
+    utils::print(out,"\nCompiler: {} with {}\nC++ standard: {}\n",
                platform::compiler_info(),platform::openmp_standard(),platform::cxx_standard());
     fputs(get_fmt_info().c_str(), out);
 
@@ -307,7 +307,7 @@ void Info::command(int narg, char **arg)
 #else // defined(LAMMPS_SMALLSMALL)
     fputs("-DLAMMPS_SMALLSMALL\n",out);
 #endif
-    if (has_gzip_support()) fmt::print(out,"\n{}\n",platform::compress_info());
+    if (has_gzip_support()) utils::print(out,"\n{}\n",platform::compress_info());
 
     int ncword, ncline = 0;
     fputs("\nInstalled packages:\n\n",out);
@@ -317,17 +317,17 @@ void Info::command(int narg, char **arg)
         ncline = 0;
         fputs("\n",out);
       }
-      fmt::print(out,"{} ",*pkg);
+      utils::print(out,"{} ",*pkg);
       ncline += ncword + 1;
     }
     fputs("\n",out);
   }
 
   if (flags & ACCELERATOR) {
-    fmt::print(out,"\nAccelerator configuration:\n\n{}",
+    utils::print(out,"\nAccelerator configuration:\n\n{}",
                get_accelerator_info());
     if (Info::has_gpu_device())
-      fmt::print(out,"\nAvailable GPU devices:\n{}\n",get_gpu_device_info());
+      utils::print(out,"\nAvailable GPU devices:\n{}\n",get_gpu_device_info());
   }
 
   if (flags & MEMORY) {
@@ -336,18 +336,18 @@ void Info::command(int narg, char **arg)
     get_memory_info(meminfo);
 
     fputs("\nMemory allocation information (MPI rank 0):\n\n",out);
-    fmt::print(out,"Total dynamically allocated memory: {:.4} Mbyte\n",
+    utils::print(out,"Total dynamically allocated memory: {:.4} Mbyte\n",
                meminfo[0]);
 
 #if defined(_WIN32)
-    fmt::print(out,"Non-shared memory use: {:.4} Mbyte\n",meminfo[1]);
-    fmt::print(out,"Maximum working set size: {:.4} Mbyte\n",meminfo[2]);
+    utils::print(out,"Non-shared memory use: {:.4} Mbyte\n",meminfo[1]);
+    utils::print(out,"Maximum working set size: {:.4} Mbyte\n",meminfo[2]);
 #else
 #if defined(__linux__)
-    fmt::print(out,"Current reserved memory pool size: {:.4} Mbyte\n",
+    utils::print(out,"Current reserved memory pool size: {:.4} Mbyte\n",
                meminfo[1]);
 #endif
-    fmt::print(out,"Maximum resident set size: {:.4} Mbyte\n",meminfo[2]);
+    utils::print(out,"Maximum resident set size: {:.4} Mbyte\n",meminfo[2]);
 #endif
   }
 
@@ -355,18 +355,18 @@ void Info::command(int narg, char **arg)
     int major,minor;
     std::string version = platform::mpi_info(major,minor);
 
-    fmt::print(out,"\nCommunication information:\n"
+    utils::print(out,"\nCommunication information:\n"
                "MPI library level: MPI v{}.{}\n"
                "MPI version: {}\n",major,minor,version);
 
-    fmt::print(out,"Comm style = {},  Comm layout = {}\n"
+    utils::print(out,"Comm style = {},  Comm layout = {}\n"
                "Communicate velocities for ghost atoms = {}\n",
                commstyles[comm->style], commlayout[comm->layout],
                comm->ghost_velocity ? "yes" : "no");
 
     if (domain->box_exist) {
       if (comm->mode == 0)
-        fmt::print(out,"Communication mode = single\n"
+        utils::print(out,"Communication mode = single\n"
                    "Communication cutoff = {}\n",
                    comm->get_comm_cutoff());
 
@@ -381,7 +381,7 @@ void Info::command(int narg, char **arg)
           }
 
           if (comm->cutusermulti) cut = MAX(cut,comm->cutusermulti[i]);
-          fmt::print(out,"Communication cutoff for collection {} = {:.8}\n", i, cut);
+          utils::print(out,"Communication cutoff for collection {} = {:.8}\n", i, cut);
         }
       }
 
@@ -391,13 +391,13 @@ void Info::command(int narg, char **arg)
         for (int i=1; i <= atom->ntypes && neighbor->cuttype; ++i) {
           cut = neighbor->cuttype[i];
           if (comm->cutusermultiold) cut = MAX(cut,comm->cutusermultiold[i]);
-          fmt::print(out,"Communication cutoff for type {} = {:.8}\n", i, cut);
+          utils::print(out,"Communication cutoff for type {} = {:.8}\n", i, cut);
         }
       }
     }
-    fmt::print(out,"Nprocs = {},   Nthreads = {}\n",comm->nprocs,comm->nthreads);
+    utils::print(out,"Nprocs = {},   Nthreads = {}\n",comm->nprocs,comm->nthreads);
     if (domain->box_exist)
-      fmt::print(out,"Processor grid = {} x {} x {}\n",comm->procgrid[0],
+      utils::print(out,"Processor grid = {} x {} x {}\n",comm->procgrid[0],
                  comm->procgrid[1], comm->procgrid[2]);
   }
 
@@ -408,80 +408,80 @@ void Info::command(int narg, char **arg)
 
   if (flags & SYSTEM) {
     fputs("\nSystem information:\n",out);
-    fmt::print(out,"Units         = {}\n", update->unit_style);
-    fmt::print(out,"Atom style    = {}\n", atom->get_style());
-    fmt::print(out,"Atom map      = {}\n", mapstyles[atom->map_style]);
+    utils::print(out,"Units         = {}\n", update->unit_style);
+    utils::print(out,"Atom style    = {}\n", atom->get_style());
+    utils::print(out,"Atom map      = {}\n", mapstyles[atom->map_style]);
     if (atom->molecular != Atom::ATOMIC) {
       const char *msg;
       msg = (atom->molecular == Atom::TEMPLATE) ? "template" : "standard";
-      fmt::print(out,"Molecule type = {}\n",msg);
+      utils::print(out,"Molecule type = {}\n",msg);
     }
-    fmt::print(out,"Atoms     = {:12},  types = {:8d},  style = {}\n",
+    utils::print(out,"Atoms     = {:12},  types = {:8d},  style = {}\n",
                atom->natoms, atom->ntypes, force->pair_style);
 
-    if (atom->tag_enable) fmt::print(out,"Atoms with atom IDs\n");
-    if (atom->molecule) fmt::print(out,"Atoms with molecule IDs\n");
-    if (atom->mass) fmt::print(out,"Atoms with per-type masses\n");
-    if (atom->rmass) fmt::print(out,"Atoms with per-atom masses\n");
-    if (atom->q) fmt::print(out,"Atoms with per-atom charges\n");
+    if (atom->tag_enable) utils::print(out,"Atoms with atom IDs\n");
+    if (atom->molecule) utils::print(out,"Atoms with molecule IDs\n");
+    if (atom->mass) utils::print(out,"Atoms with per-type masses\n");
+    if (atom->rmass) utils::print(out,"Atoms with per-atom masses\n");
+    if (atom->q) utils::print(out,"Atoms with per-atom charges\n");
 
     if (force->pair && utils::strmatch(force->pair_style,"^hybrid")) {
       auto hybrid = dynamic_cast<PairHybrid *>(force->pair);
-      fmt::print(out,"Hybrid sub-styles:");
+      utils::print(out,"Hybrid sub-styles:");
       for (int i=0; i < hybrid->nstyles; ++i)
-        fmt::print(out," {}", hybrid->keywords[i]);
+        utils::print(out," {}", hybrid->keywords[i]);
       fputc('\n',out);
     }
     if (atom->molecular != Atom::ATOMIC) {
       const char *msg;
       msg = force->bond_style ? force->bond_style : "none";
-      fmt::print(out,"Bonds     = {:12},  types = {:8},  style = {}\n",
+      utils::print(out,"Bonds     = {:12},  types = {:8},  style = {}\n",
                  atom->nbonds, atom->nbondtypes, msg);
 
       msg = force->angle_style ? force->angle_style : "none";
-      fmt::print(out,"Angles    = {:12},  types = {:8},  style = {}\n",
+      utils::print(out,"Angles    = {:12},  types = {:8},  style = {}\n",
                  atom->nangles, atom->nangletypes, msg);
 
       msg = force->dihedral_style ? force->dihedral_style : "none";
-      fmt::print(out,"Dihedrals = {:12},  types = {:8},  style = {}\n",
+      utils::print(out,"Dihedrals = {:12},  types = {:8},  style = {}\n",
                  atom->ndihedrals, atom->ndihedraltypes, msg);
 
       msg = force->improper_style ? force->improper_style : "none";
-      fmt::print(out,"Impropers = {:12},  types = {:8},  style = {}\n",
+      utils::print(out,"Impropers = {:12},  types = {:8},  style = {}\n",
                  atom->nimpropers, atom->nimpropertypes, msg);
 
       const double * const special_lj   = force->special_lj;
       const double * const special_coul = force->special_coul;
 
-      fmt::print(out,"Special bond factors lj =    {:<8} {:<8} {:<8}\n"
+      utils::print(out,"Special bond factors lj =    {:<8} {:<8} {:<8}\n"
                  "Special bond factors coul =  {:<8} {:<8} {:<8}\n",
                  special_lj[1],special_lj[2],special_lj[3],
                  special_coul[1],special_coul[2],special_coul[3]);
     }
 
-    fmt::print(out,"Kspace style = {}\n",
+    utils::print(out,"Kspace style = {}\n",
                force->kspace ? force->kspace_style : "none");
 
     if (domain->box_exist) {
-      fmt::print(out,"\nDimensions = {}\n",domain->dimension);
-      fmt::print(out,"{} box = {:.8} x {:.8} x {:.8}\n",
+      utils::print(out,"\nDimensions = {}\n",domain->dimension);
+      utils::print(out,"{} box = {:.8} x {:.8} x {:.8}\n",
                  domain->triclinic ? "Triclinic" : "Orthogonal",
                  domain->xprd, domain->yprd, domain->zprd);
-      fmt::print(out,"Boundaries = {},{} {},{} {},{}\n",
+      utils::print(out,"Boundaries = {},{} {},{} {},{}\n",
                  bstyles[domain->boundary[0][0]],bstyles[domain->boundary[0][1]],
                  bstyles[domain->boundary[1][0]],bstyles[domain->boundary[1][1]],
                  bstyles[domain->boundary[2][0]],bstyles[domain->boundary[2][1]]);
-      fmt::print(out,"xlo, xhi = {:.8}, {:.8}\n", domain->boxlo[0], domain->boxhi[0]);
-      fmt::print(out,"ylo, yhi = {:.8}, {:.8}\n", domain->boxlo[1], domain->boxhi[1]);
-      fmt::print(out,"zlo, zhi = {:.8}, {:.8}\n", domain->boxlo[2], domain->boxhi[2]);
+      utils::print(out,"xlo, xhi = {:.8}, {:.8}\n", domain->boxlo[0], domain->boxhi[0]);
+      utils::print(out,"ylo, yhi = {:.8}, {:.8}\n", domain->boxlo[1], domain->boxhi[1]);
+      utils::print(out,"zlo, zhi = {:.8}, {:.8}\n", domain->boxlo[2], domain->boxhi[2]);
       if (domain->triclinic)
-        fmt::print(out,"Xy, xz, yz = {:.8}, {:.8}, {:.8}\n",
+        utils::print(out,"Xy, xz, yz = {:.8}, {:.8}, {:.8}\n",
                    domain->xy, domain->xz, domain->yz);
     } else {
       fputs("\nBox has not yet been created\n",out);
     }
-    fmt::print(out,"\nCurrent timestep number = {}\n", update->ntimestep);
-    fmt::print(out,"Current timestep size = {}\n", update->dt);
+    utils::print(out,"\nCurrent timestep number = {}\n", update->ntimestep);
+    utils::print(out,"Current timestep size = {}\n", update->dt);
   }
 
   if (domain->box_exist && (flags & COEFFS)) {
@@ -492,7 +492,7 @@ void Info::command(int narg, char **arg)
       fputs("\nPair Coeffs:\n",out);
       for (int i=1; i <= atom->ntypes; ++i)
         for (int j=i; j <= atom->ntypes; ++j) {
-          fmt::print(out,"{:6d} {:6d}:",i,j);
+          utils::print(out,"{:6d} {:6d}:",i,j);
           if (pair->allocated && pair->setflag[i][j]) fputs(" is set\n",out);
           else fputs(" is not set\n",out);
         }
@@ -503,7 +503,7 @@ void Info::command(int narg, char **arg)
       if (bond) {
         fputs("\nBond Coeffs:\n",out);
         for (int i=1; i <= atom->nbondtypes; ++i) {
-          fmt::print(out,"{:6d}:",i);
+          utils::print(out,"{:6d}:",i);
           if (bond->allocated && bond->setflag[i]) fputs(" is set\n",out);
           else fputs (" is not set\n",out);
         }
@@ -515,7 +515,7 @@ void Info::command(int narg, char **arg)
       if (angle) {
         fputs("\nAngle Coeffs:\n",out);
         for (int i=1; i <= atom->nangletypes; ++i) {
-          fmt::print(out,"{:6d}:",i);
+          utils::print(out,"{:6d}:",i);
           if (angle->allocated && angle->setflag[i]) fputs(" is set\n",out);
           else fputs (" is not set\n",out);
         }
@@ -527,7 +527,7 @@ void Info::command(int narg, char **arg)
       if (dihedral) {
         fputs("\nDihedral Coeffs:\n",out);
         for (int i=1; i <= atom->ndihedraltypes; ++i) {
-          fmt::print(out,"{:6d}:",i);
+          utils::print(out,"{:6d}:",i);
           if (dihedral->allocated && dihedral->setflag[i]) fputs(" is set\n",out);
           else fputs (" is not set\n",out);
         }
@@ -539,7 +539,7 @@ void Info::command(int narg, char **arg)
       if (b) {
         fputs("\nImproper Coeffs:\n",out);
         for (int i=1; i <= atom->nimpropertypes; ++i) {
-          fmt::print(out,"{:6d}:",i);
+          utils::print(out,"{:6d}:",i);
           if (b->allocated && b->setflag[i]) fputs(" is set\n",out);
           else fputs (" is not set\n",out);
         }
@@ -554,7 +554,7 @@ void Info::command(int narg, char **arg)
     fputs("\nGroup information:\n",out);
     for (int i=0; i < ngroup; ++i) {
       if (names[i])
-        fmt::print(out,"Group[{:2d}]:     {:16} ({})\n",
+        utils::print(out,"Group[{:2d}]:     {:16} ({})\n",
                    i, names[i], dynamic[i] ? "dynamic" : "static");
     }
   }
@@ -563,11 +563,11 @@ void Info::command(int narg, char **arg)
     fputs("\nRegion information:\n",out);
     int i=0;
     for (auto &reg : domain->get_region_list()) {
-      fmt::print(out,"Region[{:3d}]:  {:16}  style = {:16}  side = {}\n",
+      utils::print(out,"Region[{:3d}]:  {:16}  style = {:16}  side = {}\n",
                  i, std::string(reg->id)+',', std::string(reg->style)+',',
                  reg->interior ? "in" : "out");
       if (reg->bboxflag)
-        fmt::print(out,"   Boundary:  lo {:.8} {:.8} {:.8}  hi {:.8} {:.8} {:.8}\n",
+        utils::print(out,"   Boundary:  lo {:.8} {:.8} {:.8}  hi {:.8} {:.8} {:.8}\n",
                    reg->extent_xlo, reg->extent_ylo,
                    reg->extent_zlo, reg->extent_xhi,
                    reg->extent_yhi, reg->extent_zhi);
@@ -580,7 +580,7 @@ void Info::command(int narg, char **arg)
     char **names = group->names;
     fputs("\nCompute information:\n",out);
     for (const auto &compute : modify->get_compute_list())
-      fmt::print(out,"Compute[{:3d}]:  {:16}  style = {:16}  group = {}\n", i++,
+      utils::print(out,"Compute[{:3d}]:  {:16}  style = {:16}  group = {}\n", i++,
                  std::string(compute->id)+',',std::string(compute->style)+',',
                  names[compute->igroup]);
   }
@@ -593,13 +593,13 @@ void Info::command(int narg, char **arg)
     char **names = group->names;
     fputs("\nDump information:\n",out);
     for (int i=0; i < ndump; ++i) {
-      fmt::print(out,"Dump[{:3d}]:     {:16}  file = {:16}  style = {:16}  group = {:16}  ",
+      utils::print(out,"Dump[{:3d}]:     {:16}  file = {:16}  style = {:16}  group = {:16}  ",
                  i, std::string(dump[i]->id)+',',std::string(dump[i]->filename)+',',
                  std::string(dump[i]->style)+',',std::string(names[dump[i]->igroup])+',');
       if (nevery[i]) {
-        fmt::print(out,"every = {}\n", nevery[i]);
+        utils::print(out,"every = {}\n", nevery[i]);
       } else {
-        fmt::print(out,"every = {}\n", vnames[i]);
+        utils::print(out,"every = {}\n", vnames[i]);
       }
     }
   }
@@ -609,7 +609,7 @@ void Info::command(int narg, char **arg)
     char **names = group->names;
     fputs("\nFix information:\n",out);
     for (const auto &fix : modify->get_fix_list())
-      fmt::print(out, "Fix[{:3d}]:      {:16}  style = {:16}  group = {}\n",i++,
+      utils::print(out, "Fix[{:3d}]:      {:16}  style = {:16}  group = {}\n",i++,
                  std::string(fix->id)+',',std::string(fix->style)+',',names[fix->igroup]);
   }
 
@@ -618,7 +618,7 @@ void Info::command(int narg, char **arg)
     fputs("\nVariable information:\n",out);
     for (int i=0; i < nvar; ++i) {
       auto vinfo = get_variable_info(i);
-      fmt::print(out, get_variable_info(i));
+      utils::print(out, get_variable_info(i));
     }
   }
 
@@ -635,7 +635,7 @@ void Info::command(int narg, char **arg)
     wallclock = (wallclock - walls) / 60.0;
     wallm = fmod(wallclock,60.0);
     wallh = (wallclock - wallm) / 60.0;
-    fmt::print(out,"\nTotal time information (MPI rank 0):\n"
+    utils::print(out,"\nTotal time information (MPI rank 0):\n"
                "  CPU time: {:4d}:{:02d}:{:02d}\n"
                " Wall time: {:4d}:{:02d}:{:02d}\n",
                cpuh,cpum,cpus,wallh,wallm,walls);
diff --git a/src/input.h b/src/input.h
index fe6cf15407..728c224835 100644
--- a/src/input.h
+++ b/src/input.h
@@ -26,8 +26,10 @@ class Input : protected Pointers {
   friend class Error;
   friend class Deprecated;
   friend class SimpleCommandsTest_Echo_Test;
+  friend std::string utils::point_to_error(Input *input, int failed);
 
  public:
+  char *command;               // ptr to current command
   int narg;                    // # of command args
   char **arg;                  // parsed args for command
   class Variable *variable;    // defined variables
@@ -42,7 +44,6 @@ class Input : protected Pointers {
   int get_jump_skip() const { return jump_skip; }
 
  protected:
-  char *command;      // ptr to current command
   int echo_screen;    // 0 = no, 1 = yes
   int echo_log;       // 0 = no, 1 = yes
 
diff --git a/src/label_map.cpp b/src/label_map.cpp
index 9934868c49..24cef51062 100644
--- a/src/label_map.cpp
+++ b/src/label_map.cpp
@@ -335,28 +335,28 @@ bool LabelMap::is_complete(int mode) const
 void LabelMap::write_data(FILE *fp)
 {
   if (is_complete(Atom::ATOM)) {
-    fmt::print(fp, "\nAtom Type Labels\n\n");
-    for (int i = 0; i < natomtypes; i++) fmt::print(fp, "{} {}\n", i + 1, typelabel[i]);
+    utils::print(fp, "\nAtom Type Labels\n\n");
+    for (int i = 0; i < natomtypes; i++) utils::print(fp, "{} {}\n", i + 1, typelabel[i]);
   }
 
   if (force->bond && is_complete(Atom::BOND)) {
-    fmt::print(fp, "\nBond Type Labels\n\n");
-    for (int i = 0; i < nbondtypes; i++) fmt::print(fp, "{} {}\n", i + 1, btypelabel[i]);
+    utils::print(fp, "\nBond Type Labels\n\n");
+    for (int i = 0; i < nbondtypes; i++) utils::print(fp, "{} {}\n", i + 1, btypelabel[i]);
   }
 
   if (force->angle && is_complete(Atom::ANGLE)) {
-    fmt::print(fp, "\nAngle Type Labels\n\n");
-    for (int i = 0; i < nangletypes; i++) fmt::print(fp, "{} {}\n", i + 1, atypelabel[i]);
+    utils::print(fp, "\nAngle Type Labels\n\n");
+    for (int i = 0; i < nangletypes; i++) utils::print(fp, "{} {}\n", i + 1, atypelabel[i]);
   }
 
   if (force->dihedral && is_complete(Atom::DIHEDRAL)) {
-    fmt::print(fp, "\nDihedral Type Labels\n\n");
-    for (int i = 0; i < ndihedraltypes; i++) fmt::print(fp, "{} {}\n", i + 1, dtypelabel[i]);
+    utils::print(fp, "\nDihedral Type Labels\n\n");
+    for (int i = 0; i < ndihedraltypes; i++) utils::print(fp, "{} {}\n", i + 1, dtypelabel[i]);
   }
 
   if (force->improper && is_complete(Atom::IMPROPER)) {
-    fmt::print(fp, "\nImproper Type Labels\n\n");
-    for (int i = 0; i < nimpropertypes; i++) fmt::print(fp, "{} {}\n", i + 1, itypelabel[i]);
+    utils::print(fp, "\nImproper Type Labels\n\n");
+    for (int i = 0; i < nimpropertypes; i++) utils::print(fp, "{} {}\n", i + 1, itypelabel[i]);
   }
 }
 
@@ -473,31 +473,31 @@ void LabelMap::write_map(const std::string &filename)
     if (typelabel_map.size() > 0) {
       fputs("labelmap atom", fp);
       for (int i = 0; i < natomtypes; ++i)
-        if (!typelabel[i].empty()) fmt::print(fp, " {} \"\"\" {} \"\"\"", i + 1, typelabel[i]);
+        if (!typelabel[i].empty()) utils::print(fp, " {} \"\"\" {} \"\"\"", i + 1, typelabel[i]);
       fputc('\n', fp);
     }
     if (btypelabel_map.size() > 0) {
       fputs("labelmap bond", fp);
       for (int i = 0; i < nbondtypes; ++i)
-        if (!btypelabel[i].empty()) fmt::print(fp, " {} \"\"\" {} \"\"\"", i + 1, btypelabel[i]);
+        if (!btypelabel[i].empty()) utils::print(fp, " {} \"\"\" {} \"\"\"", i + 1, btypelabel[i]);
       fputc('\n', fp);
     }
     if (atypelabel_map.size() > 0) {
       fputs("labelmap angle", fp);
       for (int i = 0; i < nangletypes; ++i)
-        if (!atypelabel[i].empty()) fmt::print(fp, " {} \"\"\" {} \"\"\"", i + 1, atypelabel[i]);
+        if (!atypelabel[i].empty()) utils::print(fp, " {} \"\"\" {} \"\"\"", i + 1, atypelabel[i]);
       fputc('\n', fp);
     }
     if (dtypelabel_map.size() > 0) {
       fputs("labelmap dihedral", fp);
       for (int i = 0; i < ndihedraltypes; ++i)
-        if (!dtypelabel[i].empty()) fmt::print(fp, " {} \"\"\" {} \"\"\"", i + 1, dtypelabel[i]);
+        if (!dtypelabel[i].empty()) utils::print(fp, " {} \"\"\" {} \"\"\"", i + 1, dtypelabel[i]);
       fputc('\n', fp);
     }
     if (itypelabel_map.size() > 0) {
       fputs("labelmap improper", fp);
       for (int i = 0; i < nimpropertypes; ++i)
-        if (!itypelabel[i].empty()) fmt::print(fp, " {} \"\"\" {} \"\"\"", i + 1, itypelabel[i]);
+        if (!itypelabel[i].empty()) utils::print(fp, " {} \"\"\" {} \"\"\"", i + 1, itypelabel[i]);
       fputc('\n', fp);
     }
     fclose(fp);
diff --git a/src/lammps.cpp b/src/lammps.cpp
index 2cfb33f14c..f8a1c9950c 100644
--- a/src/lammps.cpp
+++ b/src/lammps.cpp
@@ -651,12 +651,12 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator) :
     // screen and logfile messages for universe and world
 
     if ((universe->me == 0) && (!helpflag)) {
-      const char fmt[] = "LAMMPS ({})\nRunning on {} partitions of processors\n";
+      constexpr char fmt[] = "LAMMPS ({})\nRunning on {} partitions of processors\n";
       if (universe->uscreen)
-        fmt::print(universe->uscreen,fmt,version,universe->nworlds);
+        utils::print(universe->uscreen,fmt,version,universe->nworlds);
 
       if (universe->ulogfile)
-        fmt::print(universe->ulogfile,fmt,version,universe->nworlds);
+        utils::print(universe->ulogfile,fmt,version,universe->nworlds);
     }
 
     if ((me == 0) && (!helpflag))
@@ -872,7 +872,9 @@ void LAMMPS::create()
   else
     atom->create_avec("atomic",0,nullptr,1);
 
-  group = new Group(this);
+  if (kokkos) group = new GroupKokkos(this);
+  else group = new Group(this);
+
   force = new Force(this);    // must be after group, to create temperature
 
   if (kokkos) modify = new ModifyKokkos(this);
@@ -1452,21 +1454,21 @@ void LAMMPS::print_config(FILE *fp)
   const char *pkg;
   int ncword, ncline = 0;
 
-  fmt::print(fp,"OS: {}\n\n",platform::os_info());
+  utils::print(fp,"OS: {}\n\n",platform::os_info());
 
-  fmt::print(fp,"Compiler: {} with {}\nC++ standard: {}\n",
+  utils::print(fp,"Compiler: {} with {}\nC++ standard: {}\n",
              platform::compiler_info(),platform::openmp_standard(),
              platform::cxx_standard());
   fputs(Info::get_fmt_info().c_str(),fp);
 
   int major,minor;
   std::string infobuf = platform::mpi_info(major,minor);
-  fmt::print(fp,"\nMPI v{}.{}: {}\n\n",major,minor,infobuf);
+  utils::print(fp,"\nMPI v{}.{}: {}\n\n",major,minor,infobuf);
 
-  fmt::print(fp,"Accelerator configuration:\n\n{}\n",
+  utils::print(fp,"Accelerator configuration:\n\n{}\n",
              Info::get_accelerator_info());
 #if defined(LMP_GPU)
-  fmt::print(fp,"Compatible GPU present: {}\n\n",Info::has_gpu_device() ? "yes" : "no");
+  utils::print(fp,"Compatible GPU present: {}\n\n",Info::has_gpu_device() ? "yes" : "no");
 #endif
 
   fputs("FFT information:\n\n",fp);
@@ -1487,14 +1489,14 @@ void LAMMPS::print_config(FILE *fp)
   fputs("-DLAMMPS_SMALLSMALL\n",fp);
 #endif
 
-  fmt::print(fp,"sizeof(smallint): {}-bit\n"
+  utils::print(fp,"sizeof(smallint): {}-bit\n"
              "sizeof(imageint): {}-bit\n"
              "sizeof(tagint):   {}-bit\n"
              "sizeof(bigint):   {}-bit\n",
              sizeof(smallint)*8, sizeof(imageint)*8,
              sizeof(tagint)*8, sizeof(bigint)*8);
 
-  if (Info::has_gzip_support()) fmt::print(fp,"\n{}\n",platform::compress_info());
+  if (Info::has_gzip_support()) utils::print(fp,"\n{}\n",platform::compress_info());
 
   fputs("\nInstalled packages:\n\n",fp);
   for (int i = 0; nullptr != (pkg = installed_packages[i]); ++i) {
diff --git a/src/library.cpp b/src/library.cpp
index c5341eec83..2cd9879e76 100644
--- a/src/library.cpp
+++ b/src/library.cpp
@@ -187,7 +187,7 @@ void *lammps_open(int argc, char **argv, MPI_Comm comm, void **ptr)
   } catch(LAMMPSException &e) {
     lammps_last_global_errormessage = e.what();
 
-    fmt::print(stderr, "LAMMPS Exception: {}", e.what());
+    utils::print(stderr, "LAMMPS Exception: {}", e.what());
     if (ptr) *ptr = nullptr;
   }
   return (void *) lmp;
diff --git a/src/min.cpp b/src/min.cpp
index 931031e44a..e20e4c191f 100644
--- a/src/min.cpp
+++ b/src/min.cpp
@@ -197,10 +197,10 @@ void Min::init()
 void Min::setup(int flag)
 {
   if (comm->me == 0 && screen) {
-    fmt::print(screen,"Setting up {} style minimization ...\n", update->minimize_style);
+    utils::print(screen,"Setting up {} style minimization ...\n", update->minimize_style);
     if (flag) {
-      fmt::print(screen,"  Unit style    : {}\n", update->unit_style);
-      fmt::print(screen,"  Current step  : {}\n", update->ntimestep);
+      utils::print(screen,"  Unit style    : {}\n", update->unit_style);
+      utils::print(screen,"  Current step  : {}\n", update->ntimestep);
       timer->print_timeout(screen);
     }
   }
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index c43f86e7ed..99f0e79efb 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -139,6 +139,7 @@ pairclass(nullptr), pairnames(nullptr), pairmasks(nullptr)
   ago = -1;
 
   cutneighmax = 0.0;
+  cutneighmin = BIG;
   cutneighsq = nullptr;
   cutneighghostsq = nullptr;
   cuttype = nullptr;
@@ -1099,8 +1100,10 @@ int Neighbor::init_pair()
 
   NeighList *ptr;
 
+  // use counter to avoid getting stuck
   int done = 0;
-  while (!done) {
+  int count = 0;
+  while (!done && (count < 100)) {
     done = 1;
     for (i = 0; i < npair_perpetual; i++) {
       for (k = 0; k < 3; k++) {
@@ -1109,8 +1112,9 @@ int Neighbor::init_pair()
         if (k == 1) ptr = lists[plist[i]]->listskip;
         if (k == 2) ptr = lists[plist[i]]->listfull;
         if (ptr == nullptr) continue;
-        for (m = 0; m < nrequest; m++)
+        for (m = 0; m < nrequest; m++) {
           if (ptr == lists[m]) break;
+        }
         for (j = 0; j < npair_perpetual; j++)
           if (m == plist[j]) break;
         if (j < i) continue;
@@ -1122,7 +1126,11 @@ int Neighbor::init_pair()
       }
       if (!done) break;
     }
+    ++count;
   }
+  if (count == 100)
+    error->all(FLERR, "Failed to reorder neighbor lists to satisfy constraints - "
+               "Contact the LAMMPS developers for assistance");
 
   // debug output
 
@@ -1185,8 +1193,8 @@ void Neighbor::morph_unique()
   for (int i = 0; i < nrequest; i++) {
     irq = requests[i];
 
-    // if cut flag set by requestor and cutoff is different than default,
-    //   set unique flag, otherwise unset cut flag
+    // if cut flag set by requestor and cutoff is larger than minimum for default,
+    //   and the list is not a skip list, set unique flag; otherwise unset cut flag
     // this forces Pair,Stencil,Bin styles to be instantiated separately
     // also add skin to cutoff of perpetual lists
 
@@ -1194,7 +1202,7 @@ void Neighbor::morph_unique()
       if (!irq->occasional)
         irq->cutoff += skin;
 
-      if (irq->cutoff != cutneighmax) {
+      if ((irq->cutoff > cutneighmin) && !irq->skip) {
         irq->unique = 1;
       } else {
         irq->cut = 0;
@@ -1510,6 +1518,10 @@ void Neighbor::morph_copy_trim()
 
       if (jrq->copy && jrq->copylist == i) continue;
 
+      // cannot copy or trim if some pair-wise cutoffs are too small
+
+      if (irq->cut && !jrq->cut && (irq->cutoff > cutneighmin)) continue;
+
       // trim a list with longer cutoff
 
       if (irq->cut) icut = irq->cutoff;
diff --git a/src/pair.cpp b/src/pair.cpp
index 5421108eba..8f10d81d99 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -710,10 +710,11 @@ double Pair::mix_energy(double eps1, double eps2, double sig1, double sig2)
     return sqrt(eps1*eps2);
   else if (mix_flag == ARITHMETIC)
     return sqrt(eps1*eps2);
-  else if (mix_flag == SIXTHPOWER)
-    return (2.0 * sqrt(eps1*eps2) * powint(sig1, 3) * powint(sig2, 3)
-            / (powint(sig1, 6) + powint(sig2, 6)));
-  else did_mix = false;
+  else if (mix_flag == SIXTHPOWER) {
+    if ((sig1 != 0.0) && (sig2 != 0.0))
+      return (2.0 * sqrt(eps1*eps2) * powint(sig1, 3) * powint(sig2, 3)
+              / (powint(sig1, 6) + powint(sig2, 6)));
+  } else did_mix = false;
   return 0.0;
 }
 
@@ -1842,7 +1843,7 @@ void Pair::write_file(int narg, char **arg)
       utils::logmesg(lmp,"Creating table file {} with DATE: {}\n",
                      table_file, utils::current_date());
       fp = fopen(table_file.c_str(),"w");
-      if (fp) fmt::print(fp,"# DATE: {} UNITS: {} Created by pair_write\n",
+      if (fp) utils::print(fp,"# DATE: {} UNITS: {} Created by pair_write\n",
                          utils::current_date(), update->unit_style);
     }
     if (fp == nullptr)
diff --git a/src/pair_hybrid_scaled.cpp b/src/pair_hybrid_scaled.cpp
index e4ccc0a3ca..025c468935 100644
--- a/src/pair_hybrid_scaled.cpp
+++ b/src/pair_hybrid_scaled.cpp
@@ -32,9 +32,14 @@ using namespace LAMMPS_NS;
 /* ---------------------------------------------------------------------- */
 
 PairHybridScaled::PairHybridScaled(LAMMPS *lmp) :
-    PairHybrid(lmp), fsum(nullptr), tsum(nullptr), scaleval(nullptr), scaleidx(nullptr)
+    PairHybrid(lmp), fsum(nullptr), tsum(nullptr), scaleval(nullptr), scaleidx(nullptr),
+    atomvar(nullptr), atomscale(nullptr)
 {
   nmaxfsum = -1;
+
+  // set comm size needed by this Pair (if atomscaleflag)
+
+  comm_forward = 1;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -45,6 +50,8 @@ PairHybridScaled::~PairHybridScaled()
   memory->destroy(tsum);
   delete[] scaleval;
   delete[] scaleidx;
+  delete[] atomvar;
+  memory->destroy(atomscale);
 }
 
 /* ----------------------------------------------------------------------
@@ -66,18 +73,35 @@ void PairHybridScaled::compute(int eflag, int vflag)
   // update scale values from variables where needed
 
   const int nvars = scalevars.size();
+  int atomscaleflag = 0;
   if (nvars > 0) {
     auto vals = new double[nvars];
+    auto vars = new int[nvars];
     for (int k = 0; k < nvars; ++k) {
       int m = input->variable->find(scalevars[k].c_str());
       if (m < 0)
         error->all(FLERR, "Variable '{}' not found when updating scale factors", scalevars[k]);
-      vals[k] = input->variable->compute_equal(m);
+
+      // for equal-style, compute variable, set variable index to -1
+      if (input->variable->equalstyle(m)) {
+        vals[k] = input->variable->compute_equal(m);
+        vars[k] = -1;
+        // for atom-style, store variable index, set variable to 0.0, set atomscaleflag
+      } else if (input->variable->atomstyle(m)) {
+        vals[k] = 0.0;
+        vars[k] = m;
+        atomscaleflag = 1;
+      } else
+        error->all(FLERR, "Variable '{}' has incompatible style", scalevars[k]);
     }
     for (int k = 0; k < nstyles; ++k) {
-      if (scaleidx[k] >= 0) scaleval[k] = vals[scaleidx[k]];
+      if (scaleidx[k] >= 0) {
+        scaleval[k] = vals[scaleidx[k]];
+        atomvar[k] = vars[scaleidx[k]];
+      }
     }
     delete[] vals;
+    delete[] vars;
   }
 
   // check if no_virial_fdotr_compute is set and global component of
@@ -95,9 +119,11 @@ void PairHybridScaled::compute(int eflag, int vflag)
   if (atom->nmax > nmaxfsum) {
     memory->destroy(fsum);
     if (atom->torque_flag) memory->destroy(tsum);
+    if (atomscaleflag) memory->destroy(atomscale);
     nmaxfsum = atom->nmax;
     memory->create(fsum, nmaxfsum, 3, "pair:fsum");
     if (atom->torque_flag) memory->create(tsum, nmaxfsum, 3, "pair:tsum");
+    if (atomscaleflag) memory->create(atomscale, nmaxfsum, "pair:atomscale");
   }
   const int nall = atom->nlocal + atom->nghost;
   auto f = atom->f;
@@ -157,14 +183,34 @@ void PairHybridScaled::compute(int eflag, int vflag)
 
     // add scaled forces to global sum
     const double scale = scaleval[m];
-    for (i = 0; i < nall; ++i) {
-      fsum[i][0] += scale * f[i][0];
-      fsum[i][1] += scale * f[i][1];
-      fsum[i][2] += scale * f[i][2];
-      if (atom->torque_flag) {
-        tsum[i][0] += scale * t[i][0];
-        tsum[i][1] += scale * t[i][1];
-        tsum[i][2] += scale * t[i][2];
+
+    // if scale factor is constant or equal-style variable
+    if (scaleidx[m] < 0 || atomvar[m] < 0) {
+      for (i = 0; i < nall; ++i) {
+        fsum[i][0] += scale * f[i][0];
+        fsum[i][1] += scale * f[i][1];
+        fsum[i][2] += scale * f[i][2];
+        if (atom->torque_flag) {
+          tsum[i][0] += scale * t[i][0];
+          tsum[i][1] += scale * t[i][1];
+          tsum[i][2] += scale * t[i][2];
+        }
+      }
+      // if scale factor is atom-style variable
+    } else {
+      const int igroupall = 0;
+      input->variable->compute_atom(atomvar[m], igroupall, atomscale, 1, 0);
+      comm->forward_comm(this);
+      for (i = 0; i < nall; ++i) {
+        const double ascale = atomscale[i];
+        fsum[i][0] += ascale * f[i][0];
+        fsum[i][1] += ascale * f[i][1];
+        fsum[i][2] += ascale * f[i][2];
+        if (atom->torque_flag) {
+          tsum[i][0] += ascale * t[i][0];
+          tsum[i][1] += ascale * t[i][1];
+          tsum[i][2] += ascale * t[i][2];
+        }
       }
     }
 
@@ -288,6 +334,7 @@ void PairHybridScaled::settings(int narg, char **arg)
 
   scaleval = new double[narg];
   scaleidx = new int[narg];
+  atomvar = new int[narg];
   scalevars.reserve(narg);
 
   // allocate each sub-style
@@ -303,7 +350,8 @@ void PairHybridScaled::settings(int narg, char **arg)
   while (iarg < narg - 1) {
 
     // first process scale factor or variable
-    // idx < 0 indicates constant value otherwise index in variable name list
+    // scaleidx[k] < 0 indicates constant value, otherwise index in variable name list
+    // initialize atomvar[k] to -1, indicates not atom-style variable
 
     double val = 0.0;
     int idx = -1;
@@ -323,6 +371,7 @@ void PairHybridScaled::settings(int narg, char **arg)
     }
     scaleval[nstyles] = val;
     scaleidx[nstyles] = idx;
+    atomvar[nstyles] = -1;
     ++iarg;
 
     if (utils::strmatch(arg[iarg], "^hybrid"))
@@ -387,18 +436,35 @@ double PairHybridScaled::single(int i, int j, int itype, int jtype, double rsq,
   // update scale values from variables where needed
 
   const int nvars = scalevars.size();
+  int atomscaleflag = 0;
   if (nvars > 0) {
     auto vals = new double[nvars];
+    auto vars = new int[nvars];
     for (int k = 0; k < nvars; ++k) {
       int m = input->variable->find(scalevars[k].c_str());
       if (m < 0)
         error->all(FLERR, "Variable '{}' not found when updating scale factors", scalevars[k]);
-      vals[k] = input->variable->compute_equal(m);
+
+      // for equal-style, compute variable, set variable index to -1
+      if (input->variable->equalstyle(m)) {
+        vals[k] = input->variable->compute_equal(m);
+        vars[k] = -1;
+        // for atom-style, store variable index, set variable to 0.0, set atomscaleflag
+      } else if (input->variable->atomstyle(m)) {
+        vals[k] = 0.0;
+        vars[k] = m;
+        atomscaleflag = 1;
+      } else
+        error->all(FLERR, "Variable '{}' has incompatible style", scalevars[k]);
     }
     for (int k = 0; k < nstyles; ++k) {
-      if (scaleidx[k] >= 0) scaleval[k] = vals[scaleidx[k]];
+      if (scaleidx[k] >= 0) {
+        scaleval[k] = vals[scaleidx[k]];
+        atomvar[k] = vars[scaleidx[k]];
+      }
     }
     delete[] vals;
+    delete[] vars;
   }
 
   double fone;
@@ -417,7 +483,18 @@ double PairHybridScaled::single(int i, int j, int itype, int jtype, double rsq,
 
       double scale = scaleval[map[itype][jtype][m]];
       esum += scale * pstyle->single(i, j, itype, jtype, rsq, factor_coul, factor_lj, fone);
-      fforce += scale * fone;
+
+      // if scale factor is constant or equal-style variable
+      if (scaleidx[m] < 0 || atomvar[m] < 0) {
+        fforce += scale * fone;
+        // if scale factor is atom-style variable, average i and j
+      } else {
+        const int igroupall = 0;
+        input->variable->compute_atom(atomvar[m], igroupall, atomscale, 1, 0);
+        comm->forward_comm(this);
+        const double ascale = 0.5 * (atomscale[i] + atomscale[j]);
+        fforce += ascale * fone;
+      }
     }
   }
 
@@ -440,18 +517,35 @@ void PairHybridScaled::born_matrix(int i, int j, int itype, int jtype, double rs
   // update scale values from variables where needed
 
   const int nvars = scalevars.size();
+  int atomscaleflag = 0;
   if (nvars > 0) {
-    double *vals = new double[nvars];
+    auto vals = new double[nvars];
+    auto vars = new int[nvars];
     for (int k = 0; k < nvars; ++k) {
       int m = input->variable->find(scalevars[k].c_str());
       if (m < 0)
         error->all(FLERR, "Variable '{}' not found when updating scale factors", scalevars[k]);
-      vals[k] = input->variable->compute_equal(m);
+
+      // for equal-style, compute variable, set variable index to -1
+      if (input->variable->equalstyle(m)) {
+        vals[k] = input->variable->compute_equal(m);
+        vars[k] = -1;
+        // for atom-style, store variable index, set variable to 0.0, set atomscaleflag
+      } else if (input->variable->atomstyle(m)) {
+        vals[k] = 0.0;
+        vars[k] = m;
+        atomscaleflag = 1;
+      } else
+        error->all(FLERR, "Variable '{}' has incompatible style", scalevars[k]);
     }
     for (int k = 0; k < nstyles; ++k) {
-      if (scaleidx[k] >= 0) scaleval[k] = vals[scaleidx[k]];
+      if (scaleidx[k] >= 0) {
+        scaleval[k] = vals[scaleidx[k]];
+        atomvar[k] = vars[scaleidx[k]];
+      }
     }
     delete[] vals;
+    delete[] vars;
   }
 
   double du, du2, scale;
@@ -460,18 +554,30 @@ void PairHybridScaled::born_matrix(int i, int j, int itype, int jtype, double rs
   for (int m = 0; m < nmap[itype][jtype]; m++) {
     auto pstyle = styles[map[itype][jtype][m]];
     if (rsq < pstyle->cutsq[itype][jtype]) {
-      if (pstyle->born_matrix_enable == 0)
-        error->one(FLERR, "Pair hybrid sub-style does not support born_matrix call");
+      if (pstyle->single_enable == 0)
+        error->one(FLERR, "Pair hybrid sub-style does not support single call");
 
       if ((special_lj[map[itype][jtype][m]] != nullptr) ||
           (special_coul[map[itype][jtype][m]] != nullptr))
-        error->one(FLERR, "Pair hybrid born_matrix() does not support per sub-style special_bond");
+        error->one(FLERR, "Pair hybrid single() does not support per sub-style special_bond");
 
       du = du2 = 0.0;
-      scale = scaleval[map[itype][jtype][m]];
+      double scale = scaleval[map[itype][jtype][m]];
       pstyle->born_matrix(i, j, itype, jtype, rsq, factor_coul, factor_lj, du, du2);
-      dupair += scale * du;
-      du2pair += scale * du2;
+
+      // if scale factor is constant or equal-style variable
+      if (scaleidx[m] < 0 || atomvar[m] < 0) {
+        dupair += scale * du;
+        du2pair += scale * du2;
+        // if scale factor is atom-style variable, average i and j
+      } else {
+        const int igroupall = 0;
+        input->variable->compute_atom(atomvar[m], igroupall, atomscale, 1, 0);
+        comm->forward_comm(this);
+        const double ascale = 0.5 * (atomscale[i] + atomscale[j]);
+        dupair += ascale * du;
+        du2pair += ascale * du2;
+      }
     }
   }
 }
@@ -574,6 +680,7 @@ void PairHybridScaled::write_restart(FILE *fp)
 
   fwrite(scaleval, sizeof(double), nstyles, fp);
   fwrite(scaleidx, sizeof(int), nstyles, fp);
+  fwrite(atomvar, sizeof(int), nstyles, fp);
 
   int n = scalevars.size();
   fwrite(&n, sizeof(int), 1, fp);
@@ -594,17 +701,21 @@ void PairHybridScaled::read_restart(FILE *fp)
 
   delete[] scaleval;
   delete[] scaleidx;
+  delete[] atomvar;
   scalevars.clear();
   scaleval = new double[nstyles];
   scaleidx = new int[nstyles];
+  atomvar = new int[nstyles];
 
   int n, me = comm->me;
   if (me == 0) {
     utils::sfread(FLERR, scaleval, sizeof(double), nstyles, fp, nullptr, error);
     utils::sfread(FLERR, scaleidx, sizeof(int), nstyles, fp, nullptr, error);
+    utils::sfread(FLERR, atomvar, sizeof(int), nstyles, fp, nullptr, error);
   }
   MPI_Bcast(scaleval, nstyles, MPI_DOUBLE, 0, world);
   MPI_Bcast(scaleidx, nstyles, MPI_INT, 0, world);
+  MPI_Bcast(atomvar, nstyles, MPI_INT, 0, world);
 
   char *tmp;
   if (me == 0) utils::sfread(FLERR, &n, sizeof(int), 1, fp, nullptr, error);
@@ -667,3 +778,28 @@ void PairHybridScaled::copy_svector(int itype, int jtype)
     }
   }
 }
+
+/* ---------------------------------------------------------------------- */
+
+int PairHybridScaled::pack_forward_comm(int n, int *list, double *buf, int /*pbc_flag*/, int * /*pbc*/)
+{
+  int i,j,m;
+
+  m = 0;
+  for (i = 0; i < n; i++) {
+    j = list[i];
+    buf[m++] = atomscale[j];
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairHybridScaled::unpack_forward_comm(int n, int first, double *buf)
+{
+  int i,m,last;
+
+  m = 0;
+  last = first + n;
+  for (i = first; i < last; i++) atomscale[i] = buf[m++];
+}
diff --git a/src/pair_hybrid_scaled.h b/src/pair_hybrid_scaled.h
index 2af5b61f8d..681de73aa3 100644
--- a/src/pair_hybrid_scaled.h
+++ b/src/pair_hybrid_scaled.h
@@ -44,12 +44,17 @@ class PairHybridScaled : public PairHybrid {
   void init_svector() override;
   void copy_svector(int, int) override;
 
+  int pack_forward_comm(int, int *, double *, int, int *) override;
+  void unpack_forward_comm(int, int, double *) override;
+
  protected:
   double **fsum, **tsum;
   double *scaleval;
   int *scaleidx;
   std::vector<std::string> scalevars;
   int nmaxfsum;
+  int *atomvar;         // indices of atom-style variables
+  double *atomscale;    // vector of atom-style variable values
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/respa.cpp b/src/respa.cpp
index fb54582553..086371ecbb 100644
--- a/src/respa.cpp
+++ b/src/respa.cpp
@@ -125,6 +125,8 @@ Respa::Respa(LAMMPS *lmp, int narg, char **arg) :
       nhybrid_styles = hybrid->nstyles;
       // each hybrid sub-style needs to be assigned to a respa level
       if (iarg + nhybrid_styles > narg) error->all(FLERR, "Illegal run_style respa command");
+      delete[] hybrid_level;
+      delete[] hybrid_compute;
       hybrid_level = new int[nhybrid_styles];
       hybrid_compute = new int[nhybrid_styles];
       for (int i = 0; i < nhybrid_styles; ++i) {
diff --git a/src/thermo.cpp b/src/thermo.cpp
index 35b5016118..6d0e18e6d3 100644
--- a/src/thermo.cpp
+++ b/src/thermo.cpp
@@ -87,7 +87,8 @@ static constexpr char YAML[] = "step temp ke pe ebond eangle edihed eimp evdwl e
 #define FORMAT_FLOAT_YAML_DEFAULT "%.15g"
 #define FORMAT_INT_YAML_DEFAULT "%d"
 
-#define FORMAT_MULTI_HEADER "------------ Step {:14} ----- CPU = {:12.7g} (sec) -------------"
+static constexpr char FORMAT_MULTI_HEADER[] =
+  "------------ Step {:14} ----- CPU = {:12.7g} (sec) -------------";
 
 enum { SCALAR, VECTOR, ARRAY };
 
diff --git a/src/utils.cpp b/src/utils.cpp
index 0f5b50baf2..3f571991e1 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -111,6 +111,55 @@ bool utils::strmatch(const std::string &text, const std::string &pattern)
   return (pos >= 0);
 }
 
+bool utils::strsame(const std::string &text1, const std::string &text2)
+{
+  const char *ptr1 = text1.c_str();
+  const char *ptr2 = text2.c_str();
+
+  while (*ptr1 && *ptr2) {
+
+    // ignore whitespace
+    while (*ptr1 && isspace(*ptr1)) ++ptr1;
+    while (*ptr2 && isspace(*ptr2)) ++ptr2;
+
+    // strings differ
+    if (*ptr1 != *ptr2) return false;
+
+    // reached end of both strings
+    if (!*ptr1 && !*ptr2) return true;
+
+    ++ptr1;
+    ++ptr2;
+  }
+  return true;
+}
+
+std::string utils::strcompress(const std::string &text)
+{
+  const char *ptr = text.c_str();
+  std::string output;
+
+  // remove leading whitespace
+  while (*ptr && isspace(*ptr)) ++ptr;
+
+  while (*ptr) {
+    // copy non-blank characters
+    while (*ptr && !isspace(*ptr)) output += *ptr++;
+
+    if (!*ptr) break;
+
+    // add one blank only
+    if (isspace(*ptr)) output += ' ';
+
+    // skip additional blanks
+    while (*ptr && isspace(*ptr)) ++ptr;
+  }
+
+  // remove trailing blank
+  if (output.back() == ' ') output.erase(output.size() - 1, 1);
+  return output;
+}
+
 /** This function is a companion function to utils::strmatch(). Arguments
  *  and logic is the same, but instead of a boolean, it returns the
  *  sub-string that matches the regex pattern.  There can be only one match.
@@ -132,6 +181,70 @@ void utils::missing_cmd_args(const std::string &file, int line, const std::strin
   if (error) error->all(file, line, "Illegal {} command: missing argument(s)", cmd);
 }
 
+std::string utils::point_to_error(Input *input, int failed)
+{
+  if (input && input->line && input->command) {
+    std::string lastline = utils::strcompress(input->line);
+    std::string lastargs = input->command;
+    std::string cmdline = "Last input line: ";
+
+    // extended output
+    if (failed > Error::NOPOINTER) {
+
+      // indicator points to command by default
+      int indicator = 0;
+      int quoted = 0;
+      lastargs += ' ';
+
+      // assemble pre-processed command line and update error indicator position, if needed.
+      for (int i = 0; i < input->narg; ++i) {
+        std::string inputarg = input->arg[i];
+        if (i == failed) indicator = lastargs.size();
+
+        // argument contains whitespace. add quotes. check which type of quotes, too
+        if (inputarg.find_first_of(" \t\n") != std::string::npos) {
+          if (i == failed) quoted = 2;
+          if (inputarg.find_first_of('"') != std::string::npos) {
+            lastargs += "'";
+            lastargs += inputarg;
+            lastargs += "'";
+          } else {
+            lastargs += '"';
+            lastargs += inputarg;
+            lastargs += '"';
+          }
+        } else
+          lastargs += inputarg;
+        lastargs += ' ';
+      }
+
+      indicator += cmdline.size();
+      // the string is unchanged by substitution (ignoring whitespace), print output only once
+      if (utils::strsame(lastline, lastargs)) {
+        cmdline += lastargs;
+      } else {
+        cmdline += lastline;
+        cmdline += '\n';
+        // must have the same number of chars as "Last input line: " used in the previous line
+        cmdline += "--> parsed line: ";
+        cmdline += lastargs;
+      }
+
+      // construct and append error indicator line
+      cmdline += '\n';
+      cmdline += std::string(indicator, ' ');
+      cmdline += std::string(strlen(input->arg[failed]) + quoted, '^');
+      cmdline += '\n';
+
+    } else {
+      cmdline += lastline;
+      cmdline += '\n';
+    }
+    return cmdline;
+  } else
+    return std::string("");
+}
+
 /* specialization for the case of just a single string argument */
 
 void utils::logmesg(LAMMPS *lmp, const std::string &mesg)
@@ -149,6 +262,22 @@ void utils::fmtargs_logmesg(LAMMPS *lmp, fmt::string_view format, fmt::format_ar
   }
 }
 
+/* specialization for the case of just a single string argument */
+
+void utils::print(FILE *fp, const std::string &mesg)
+{
+  fputs(mesg.c_str(), fp);
+}
+
+void utils::fmtargs_print(FILE *fp, fmt::string_view format, fmt::format_args args)
+{
+  try {
+    print(fp, fmt::vformat(format, args));
+  } catch (fmt::format_error &) {
+    ; // do nothing
+  }
+}
+
 std::string utils::errorurl(int errorcode)
 {
   return fmt::format("\nFor more information see https://docs.lammps.org/err{:04d}", errorcode);
@@ -646,14 +775,14 @@ tagint utils::tnumeric(const char *file, int line, const char *str, bool do_abor
 // clang-format off
 template <typename TYPE>
 void utils::bounds(const char *file, int line, const std::string &str,
-                   bigint nmin, bigint nmax, TYPE &nlo, TYPE &nhi, Error *error)
+                   bigint nmin, bigint nmax, TYPE &nlo, TYPE &nhi, Error *error, int failed)
 {
   nlo = nhi = -1;
 
   // check for illegal characters
   size_t found = str.find_first_not_of("*-0123456789");
   if (found != std::string::npos) {
-    if (error) error->all(file, line, "Invalid range string: {}", str);
+    if (error) error->all(file, line, failed, "Invalid range string: {}", str);
     return;
   }
 
@@ -676,23 +805,23 @@ void utils::bounds(const char *file, int line, const std::string &str,
 
   if (error) {
     if ((nlo <= 0) || (nhi <= 0))
-      error->all(file, line, "Invalid range string: {}", str);
+      error->all(file, line, failed, "Invalid range string: {}", str);
 
     if (nlo < nmin)
-      error->all(file, line, "Numeric index {} is out of bounds ({}-{})", nlo, nmin, nmax);
+      error->all(file, line, failed, "Numeric index {} is out of bounds ({}-{})", nlo, nmin, nmax);
     else if (nhi > nmax)
-      error->all(file, line, "Numeric index {} is out of bounds ({}-{})", nhi, nmin, nmax);
+      error->all(file, line, failed, "Numeric index {} is out of bounds ({}-{})", nhi, nmin, nmax);
     else if (nlo > nhi)
-      error->all(file, line, "Numeric index {} is out of bounds ({}-{})", nlo, nmin, nhi);
+      error->all(file, line, failed, "Numeric index {} is out of bounds ({}-{})", nlo, nmin, nhi);
   }
 }
 
 template void utils::bounds<>(const char *, int, const std::string &,
-                              bigint, bigint, int &, int &, Error *);
+                              bigint, bigint, int &, int &, Error *, int);
 template void utils::bounds<>(const char *, int, const std::string &,
-                              bigint, bigint, long &, long &, Error *);
+                              bigint, bigint, long &, long &, Error *, int);
 template void utils::bounds<>(const char *, int, const std::string &,
-                              bigint, bigint, long long &, long long &, Error *);
+                              bigint, bigint, long long &, long long &, Error *, int);
 
 // clang-format on
 /* ----------------------------------------------------------------------
@@ -732,7 +861,7 @@ template void utils::bounds_typelabel<>(const char *, int, const std::string &,
 ------------------------------------------------------------------------- */
 
 int utils::expand_args(const char *file, int line, int narg, char **arg, int mode, char **&earg,
-                       LAMMPS *lmp)
+                       LAMMPS *lmp, int **argmap)
 {
   int iarg;
 
@@ -747,10 +876,18 @@ int utils::expand_args(const char *file, int line, int narg, char **arg, int mod
     return narg;
   }
 
+  // determine argument offset, if possible
+  int ioffset = 0;
+  if (lmp->input->arg) {
+    for (int i = 0; i < lmp->input->narg; ++i)
+      if (lmp->input->arg[i] == arg[0]) ioffset = i;
+  }
+
   // maxarg should always end up equal to newarg, so caller can free earg
 
   int maxarg = narg - iarg;
-  earg = (char **) lmp->memory->smalloc(maxarg * sizeof(char *), "input:earg");
+  earg = (char **) lmp->memory->smalloc(maxarg * sizeof(char *), "expand_args:earg");
+  int *amap = (int *) lmp->memory->smalloc(maxarg * sizeof(int), "expand_args:amap");
 
   int newarg = 0, expandflag, nlo, nhi, nmax;
   std::string id, wc, tail;
@@ -813,16 +950,18 @@ int utils::expand_args(const char *file, int line, int narg, char **arg, int mod
       // expand wild card string to nlo/nhi numbers
 
       if (expandflag) {
-        utils::bounds(file, line, wc, 1, nmax, nlo, nhi, lmp->error);
+        utils::bounds(file, line, wc, 1, nmax, nlo, nhi, lmp->error, iarg + ioffset);
 
         if (newarg + nhi - nlo + 1 > maxarg) {
           maxarg += nhi - nlo + 1;
-          earg = (char **) lmp->memory->srealloc(earg, maxarg * sizeof(char *), "input:earg");
+          earg = (char **) lmp->memory->srealloc(earg, maxarg * sizeof(char *), "expand_args:earg");
+          amap = (int *) lmp->memory->srealloc(amap, maxarg * sizeof(char *), "expand_args:amap");
         }
 
         for (int index = nlo; index <= nhi; index++) {
           earg[newarg] =
               utils::strdup(fmt::format("{}:{}:{}[{}]{}", gridid[0], gridid[1], id, index, tail));
+          amap[newarg] = iarg;
           newarg++;
         }
       }
@@ -900,7 +1039,7 @@ int utils::expand_args(const char *file, int line, int narg, char **arg, int mod
 
         if (index >= 0) {
           if (mode == 0 && lmp->input->variable->vectorstyle(index)) {
-            utils::bounds(file, line, wc, 1, MAXSMALLINT, nlo, nhi, lmp->error);
+            utils::bounds(file, line, wc, 1, MAXSMALLINT, nlo, nhi, lmp->error, iarg + ioffset);
             if (nhi < MAXSMALLINT) {
               nmax = nhi;
               expandflag = 1;
@@ -931,12 +1070,12 @@ int utils::expand_args(const char *file, int line, int narg, char **arg, int mod
       if (expandflag) {
 
         // expand wild card string to nlo/nhi numbers
-
-        utils::bounds(file, line, wc, 1, nmax, nlo, nhi, lmp->error);
+        utils::bounds(file, line, wc, 1, nmax, nlo, nhi, lmp->error, iarg + ioffset);
 
         if (newarg + nhi - nlo + 1 > maxarg) {
           maxarg += nhi - nlo + 1;
-          earg = (char **) lmp->memory->srealloc(earg, maxarg * sizeof(char *), "input:earg");
+          earg = (char **) lmp->memory->srealloc(earg, maxarg * sizeof(char *), "expand_args:earg");
+          amap = (int *) lmp->memory->srealloc(amap, maxarg * sizeof(char *), "expand_args:amap");
         }
 
         for (int index = nlo; index <= nhi; index++) {
@@ -944,6 +1083,7 @@ int utils::expand_args(const char *file, int line, int narg, char **arg, int mod
             earg[newarg] = utils::strdup(fmt::format("{}2_{}[{}]{}", word[0], id, index, tail));
           else
             earg[newarg] = utils::strdup(fmt::format("{}_{}[{}]{}", word[0], id, index, tail));
+          amap[newarg] = iarg;
           newarg++;
         }
       }
@@ -954,14 +1094,21 @@ int utils::expand_args(const char *file, int line, int narg, char **arg, int mod
     if (!expandflag) {
       if (newarg == maxarg) {
         maxarg++;
-        earg = (char **) lmp->memory->srealloc(earg, maxarg * sizeof(char *), "input:earg");
+        earg = (char **) lmp->memory->srealloc(earg, maxarg * sizeof(char *), "expand_args:earg");
+        amap = (int *) lmp->memory->srealloc(amap, maxarg * sizeof(char *), "expand_args:amap");
       }
       earg[newarg] = utils::strdup(word);
+      amap[newarg] = iarg;
       newarg++;
     }
   }
 
-  // printf("NEWARG %d\n",newarg); for (int i = 0; i < newarg; i++) printf("  arg %d: %s\n",i,earg[i]);
+  if (argmap)
+    *argmap = amap;
+  else
+    lmp->memory->sfree(amap);
+
+  // fprintf(stderr, "NEWARG %d\n",newarg); for (int i = 0; i < newarg; i++) printf("  arg %d: %s %d\n",i,earg[i], amap ? amap[i] : -1);
   return newarg;
 }
 
diff --git a/src/utils.h b/src/utils.h
index 1ed514cca4..5de7dda82e 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -28,18 +28,34 @@ namespace LAMMPS_NS {
 
 // forward declarations
 class Error;
+class Input;
 class LAMMPS;
 
 namespace utils {
 
   /*! Match text against a simplified regex pattern
    *
-   *  \param text the text to be matched against the pattern
-   *  \param pattern the search pattern, which may contain regexp markers
+   *  \param  text     the text to be matched against the pattern
+   *  \param  pattern  the search pattern, which may contain regexp markers
    *  \return true if the pattern matches, false if not */
 
   bool strmatch(const std::string &text, const std::string &pattern);
 
+  /*! Compare two string while ignoring whitespace
+   *
+   *  \param  text1   the first text to be compared
+   *  \param  text2   the second text to be compared
+   *  \return true if the non-whitespace part of the two strings matches, false if not */
+
+  bool strsame(const std::string &text1, const std::string &text2);
+
+  /*! Compress whitespace in a string
+   *
+   *  \param  text  the text to be compressed
+   *  \return string with whitespace compressed to single blanks */
+
+  std::string strcompress(const std::string &text);
+
   /*! Find sub-string that matches a simplified regex pattern
    *
    *  \param text the text to be matched against the pattern
@@ -59,6 +75,15 @@ namespace utils {
 
   void missing_cmd_args(const std::string &file, int line, const std::string &cmd, Error *error);
 
+  /*! Create string with last command and optionally pointing to arg with error
+   *
+   * This function is a helper function for error messages.  It creates
+   *
+   *  \param input   pointer to the Input class instance (for access to last command args)
+   *  \param faile   index of the faulty argument (-1 to point to the command itself)
+   *  \return        string with two lines: the pre-processed command and a '^' pointing to the faulty argument */
+  std::string point_to_error(Input *input, int failed);
+
   /*! Internal function handling the argument list for logmesg(). */
 
   void fmtargs_logmesg(LAMMPS *lmp, fmt::string_view format, fmt::format_args args);
@@ -67,8 +92,8 @@ namespace utils {
    *
    * This function simplifies the repetitive task of outputting some
    * message to both the screen and/or the log file. The template
-   * wrapper with fmtlib format and argument processing allows
-   * this function to work similar to ``fmt::print()``.
+   * wrapper with {fmt} formatting and argument processing allows
+   * this function to work similar to :cpp:func:`utils::print() <LAMMPS_NS::utils::print>`.
    *
    *  \param lmp    pointer to LAMMPS class instance
    *  \param format format string of message to be printed
@@ -86,6 +111,40 @@ namespace utils {
 
   void logmesg(LAMMPS *lmp, const std::string &mesg);
 
+  /*! Return text redirecting the user to a specific paragraph in the manual
+   *
+   * The LAMMPS manual contains detailed explanations for errors and
+   * warnings where a simple error message may not be sufficient.  These can
+   * be reached through URLs with a numeric code.  This function creates the
+   * corresponding text to be included into the error message that redirects
+   * the user to that URL.
+   *
+   *  \param errorcode   number pointing to a paragraph in the manual */
+
+  /*! Internal function handling the argument list for print(). */
+
+  void fmtargs_print(FILE *fp, fmt::string_view format, fmt::format_args args);
+
+  /*! Write formatted message to file
+   *
+   * This function implements a version of fprintf() that uses {fmt} formatting
+   *
+   *  \param fp     stdio FILE pointer
+   *  \param format format string of message to be printed
+   *  \param args   arguments to format string */
+
+  template <typename... Args> void print(FILE *fp, const std::string &format, Args &&...args)
+  {
+    fmtargs_print(fp, format, fmt::make_format_args(args...));
+  }
+
+  /*! \overload
+   *
+   *  \param fp     stdio FILE pointer
+   *  \param mesg   string with message to be printed */
+
+  void print(FILE *fp, const std::string &mesg);
+
   /*! Return text redirecting the user to a specific paragraph in the manual
    *
    * The LAMMPS manual contains detailed explanations for errors and
@@ -326,11 +385,12 @@ namespace utils {
    * \param nmax     largest allowed upper bound
    * \param nlo      lower bound
    * \param nhi      upper bound
-   * \param error    pointer to Error class for out-of-bounds messages */
+   * \param error    pointer to Error class for out-of-bounds messages
+   * \param failed   argument index with failed expansion (optional) */
 
   template <typename TYPE>
   void bounds(const char *file, int line, const std::string &str, bigint nmin, bigint nmax,
-              TYPE &nlo, TYPE &nhi, Error *error);
+              TYPE &nlo, TYPE &nhi, Error *error, int failed = -2);    // -2 = Error::NOPOINTER
 
   /*! Same as utils::bounds(), but string may be a typelabel
    *
@@ -376,17 +436,23 @@ This functions adds the following case to :cpp:func:`utils::bounds() <LAMMPS_NS:
    *  caller. Otherwise arg and earg will point to the same address
    *  and no explicit de-allocation is needed by the caller.
    *
-   * \param file  name of source file for error message
-   * \param line  line number in source file for error message
-   * \param narg  number of arguments in current list
-   * \param arg   argument list, possibly containing wildcards
-   * \param mode  select between global vectors(=0) and arrays (=1)
-   * \param earg  new argument list with wildcards expanded
-   * \param lmp   pointer to top-level LAMMPS class instance
+   *  The *argmap* pointer to an int pointer may be used to accept an array
+   *  of integers mapping the arguments after the expansion to their original
+   *  index.  If this pointer is NULL (the default) than this map is not created.
+   *  Otherwise, it must be deallocated by the calling code.
+   *
+   * \param file    name of source file for error message
+   * \param line    line number in source file for error message
+   * \param narg    number of arguments in current list
+   * \param arg     argument list, possibly containing wildcards
+   * \param mode    select between global vectors(=0) and arrays (=1)
+   * \param earg    new argument list with wildcards expanded
+   * \param lmp     pointer to top-level LAMMPS class instance
+   * \param argmap  pointer to integer pointer for mapping expanded indices to input (optional)
    * \return      number of arguments in expanded list */
 
   int expand_args(const char *file, int line, int narg, char **arg, int mode, char **&earg,
-                  LAMMPS *lmp);
+                  LAMMPS *lmp, int **argmap = nullptr);
 
   /*! Expand type label string into its equivalent numeric type
    *
@@ -397,9 +463,9 @@ This functions adds the following case to :cpp:func:`utils::bounds() <LAMMPS_NS:
    *  pointer is returned.
    *  If a string is returned, the calling code must free it with delete[].
    *
-   * \param file  name of source file for error message
-   * \param line  line number in source file for error message
-   * \param str   type string to be expanded
+   * \param file   name of source file for error message
+   * \param line   line number in source file for error message
+   * \param str    type string to be expanded
    * \param mode  select labelmap using constants from Atom class
    * \param lmp   pointer to top-level LAMMPS class instance
    * \return      pointer to expanded string or null pointer */
diff --git a/src/variable.cpp b/src/variable.cpp
index 031709166b..d20a134d0d 100644
--- a/src/variable.cpp
+++ b/src/variable.cpp
@@ -5227,14 +5227,14 @@ void Variable::print_var_error(const std::string &srcfile, const int lineno,
   if ((ivar >= 0) && (ivar < nvar)) {
     std::string msg = fmt::format("Variable {}: ",names[ivar]) + errmsg;
     if (global)
-      error->all(srcfile,lineno,msg);
+      error->all(srcfile, lineno, Error::NOLASTLINE, msg);
     else
-      error->one(srcfile,lineno,msg);
+      error->one(srcfile, lineno, Error::NOLASTLINE, msg);
   } else {
     if (global)
-      error->all(srcfile,lineno,errmsg);
+      error->all(srcfile,lineno, Error::NOLASTLINE, errmsg);
     else
-      error->one(srcfile,lineno,errmsg);
+      error->one(srcfile,lineno, Error::NOLASTLINE, errmsg);
   }
 }
 
diff --git a/src/verlet.cpp b/src/verlet.cpp
index 0222a0d2a0..f0a307bdd2 100644
--- a/src/verlet.cpp
+++ b/src/verlet.cpp
@@ -95,7 +95,7 @@ void Verlet::setup(int flag)
   if (comm->me == 0 && screen) {
     fputs("Setting up Verlet run ...\n",screen);
     if (flag) {
-      fmt::print(screen,"  Unit style    : {}\n"
+      utils::print(screen,"  Unit style    : {}\n"
                         "  Current step  : {}\n"
                         "  Time step     : {}\n",
                  update->unit_style,update->ntimestep,update->dt);
diff --git a/src/write_data.cpp b/src/write_data.cpp
index 76d1c598ca..85950f3b47 100644
--- a/src/write_data.cpp
+++ b/src/write_data.cpp
@@ -274,41 +274,41 @@ void WriteData::write(const std::string &file)
 
 void WriteData::header()
 {
-  fmt::print(fp,"LAMMPS data file via write_data, version {}, timestep = {}, units = {}\n\n",
+  utils::print(fp,"LAMMPS data file via write_data, version {}, timestep = {}, units = {}\n\n",
              lmp->version, update->ntimestep, update->unit_style);
 
-  fmt::print(fp,"{} atoms\n{} atom types\n",atom->natoms,atom->ntypes);
+  utils::print(fp,"{} atoms\n{} atom types\n",atom->natoms,atom->ntypes);
 
   // only write out number of types for atom style template
 
   if (atom->molecular == Atom::MOLECULAR) {
     if (atom->nbonds || atom->nbondtypes)
-      fmt::print(fp,"{} bonds\n{} bond types\n",
+      utils::print(fp,"{} bonds\n{} bond types\n",
                  nbonds,atom->nbondtypes);
     if (atom->nangles || atom->nangletypes)
-      fmt::print(fp,"{} angles\n{} angle types\n",
+      utils::print(fp,"{} angles\n{} angle types\n",
                  nangles,atom->nangletypes);
     if (atom->ndihedrals || atom->ndihedraltypes)
-      fmt::print(fp,"{} dihedrals\n{} dihedral types\n",
+      utils::print(fp,"{} dihedrals\n{} dihedral types\n",
                  ndihedrals,atom->ndihedraltypes);
     if (atom->nimpropers || atom->nimpropertypes)
-      fmt::print(fp,"{} impropers\n{} improper types\n",
+      utils::print(fp,"{} impropers\n{} improper types\n",
                  nimpropers,atom->nimpropertypes);
   }
 
   if (atom->molecular == Atom::TEMPLATE) {
-    if (atom->nbondtypes) fmt::print(fp,"{} bond types\n",atom->nbondtypes);
-    if (atom->nangletypes) fmt::print(fp,"{} angle types\n",atom->nangletypes);
-    if (atom->ndihedraltypes) fmt::print(fp,"{} dihedral types\n",atom->ndihedraltypes);
-    if (atom->nimpropertypes) fmt::print(fp,"{} improper types\n",atom->nimpropertypes);
+    if (atom->nbondtypes) utils::print(fp,"{} bond types\n",atom->nbondtypes);
+    if (atom->nangletypes) utils::print(fp,"{} angle types\n",atom->nangletypes);
+    if (atom->ndihedraltypes) utils::print(fp,"{} dihedral types\n",atom->ndihedraltypes);
+    if (atom->nimpropertypes) utils::print(fp,"{} improper types\n",atom->nimpropertypes);
   }
 
   // bonus info
 
-  if (atom->ellipsoid_flag) fmt::print(fp,"{} ellipsoids\n",atom->nellipsoids);
-  if (atom->line_flag) fmt::print(fp,"{} lines\n",atom->nlines);
-  if (atom->tri_flag) fmt::print(fp,"{} triangles\n",atom->ntris);
-  if (atom->body_flag) fmt::print(fp,"{} bodies\n",atom->nbodies);
+  if (atom->ellipsoid_flag) utils::print(fp,"{} ellipsoids\n",atom->nellipsoids);
+  if (atom->line_flag) utils::print(fp,"{} lines\n",atom->nlines);
+  if (atom->tri_flag) utils::print(fp,"{} triangles\n",atom->ntris);
+  if (atom->body_flag) utils::print(fp,"{} bodies\n",atom->nbodies);
 
   // fix info
 
@@ -321,19 +321,19 @@ void WriteData::header()
   // box info: orthogonal, restricted triclinic, or general triclinic (if requested)
 
   if (!domain->triclinic_general) {
-    fmt::print(fp,"\n{} {} xlo xhi\n{} {} ylo yhi\n{} {} zlo zhi\n",
+    utils::print(fp,"\n{} {} xlo xhi\n{} {} ylo yhi\n{} {} zlo zhi\n",
                domain->boxlo[0],domain->boxhi[0],
                domain->boxlo[1],domain->boxhi[1],
                domain->boxlo[2],domain->boxhi[2]);
     if (domain->triclinic)
-      fmt::print(fp,"{} {} {} xy xz yz\n",domain->xy,domain->xz,domain->yz);
+      utils::print(fp,"{} {} {} xy xz yz\n",domain->xy,domain->xz,domain->yz);
 
   } else if (domain->triclinic_general) {
-    fmt::print(fp,"\n{} {} {} avec\n{} {} {} bvec\n{} {} {} cvec\n",
+    utils::print(fp,"\n{} {} {} avec\n{} {} {} bvec\n{} {} {} cvec\n",
                domain->avec[0],domain->avec[1],domain->avec[2],
                domain->bvec[0],domain->bvec[1],domain->bvec[2],
                domain->cvec[0],domain->cvec[1],domain->cvec[2]);
-    fmt::print(fp,"{} {} {} abc origin\n",
+    utils::print(fp,"{} {} {} abc origin\n",
                domain->boxlo[0],domain->boxlo[1],domain->boxlo[2]);
   }
 }
@@ -348,7 +348,7 @@ void WriteData::type_arrays()
     double *mass = atom->mass;
     fputs("\nMasses\n\n",fp);
     for (int i = 1; i <= atom->ntypes; i++)
-      fmt::print(fp,"{} {:.16g}\n",i,mass[i]);
+      utils::print(fp,"{} {:.16g}\n",i,mass[i]);
   }
 }
 
@@ -363,7 +363,7 @@ void WriteData::force_fields()
       if ((comm->me == 0) && (force->pair->mixed_flag == 0))
         error->warning(FLERR,"Not all mixed pair coeffs generated from mixing. "
                        "Use write_data with 'pair ij' option to store all pair coeffs.");
-      fmt::print(fp,"\nPair Coeffs # {}\n\n", force->pair_style);
+      utils::print(fp,"\nPair Coeffs # {}\n\n", force->pair_style);
       force->pair->write_data(fp);
     } else if (pairflag == IJ) {
       // try computing mixed pair coeffs in case we skipped lmp->init()
@@ -375,24 +375,24 @@ void WriteData::force_fields()
             if (!force->pair->setflag[i][j])
               force->pair->init_one(i, j);
       }
-      fmt::print(fp,"\nPairIJ Coeffs # {}\n\n", force->pair_style);
+      utils::print(fp,"\nPairIJ Coeffs # {}\n\n", force->pair_style);
       force->pair->write_data_all(fp);
     }
   }
   if (force->bond && force->bond->writedata && atom->nbondtypes) {
-    fmt::print(fp,"\nBond Coeffs # {}\n\n", force->bond_style);
+    utils::print(fp,"\nBond Coeffs # {}\n\n", force->bond_style);
     force->bond->write_data(fp);
   }
   if (force->angle && force->angle->writedata && atom->nangletypes) {
-    fmt::print(fp,"\nAngle Coeffs # {}\n\n", force->angle_style);
+    utils::print(fp,"\nAngle Coeffs # {}\n\n", force->angle_style);
     force->angle->write_data(fp);
   }
   if (force->dihedral && force->dihedral->writedata && atom->ndihedraltypes) {
-    fmt::print(fp,"\nDihedral Coeffs # {}\n\n", force->dihedral_style);
+    utils::print(fp,"\nDihedral Coeffs # {}\n\n", force->dihedral_style);
     force->dihedral->write_data(fp);
   }
   if (force->improper && force->improper->writedata && atom->nimpropertypes) {
-    fmt::print(fp,"\nImproper Coeffs # {}\n\n", force->improper_style);
+    utils::print(fp,"\nImproper Coeffs # {}\n\n", force->improper_style);
     force->improper->write_data(fp);
   }
 }
@@ -429,7 +429,7 @@ void WriteData::atoms()
     MPI_Status status;
     MPI_Request request;
 
-    fmt::print(fp,"\nAtoms # {}\n\n",atom->atom_style);
+    utils::print(fp,"\nAtoms # {}\n\n",atom->atom_style);
     for (int iproc = 0; iproc < comm->nprocs; iproc++) {
       if (iproc) {
         MPI_Irecv(&buf[0][0],maxrow*ncol,MPI_DOUBLE,iproc,0,world,&request);
diff --git a/tools/coding_standard/fmtlib.py b/tools/coding_standard/fmtlib.py
new file mode 100644
index 0000000000..8128a3c747
--- /dev/null
+++ b/tools/coding_standard/fmtlib.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+# Utility for detecting fmtlib related issues
+#
+# Currently it checks for the following issues
+# use of fmt::print() instead of utils::print()
+#
+# Written by Axel Kohlmeyer (Temple University)
+from __future__ import print_function
+import sys
+
+if sys.version_info.major < 3:
+    sys.exit('This script must be run with Python 3.5 or later')
+
+if sys.version_info.minor < 5:
+    sys.exit('This script must be run with Python 3.5 or later')
+
+import os
+import glob
+import re
+import yaml
+import argparse
+import shutil
+
+DEFAULT_CONFIG = """
+recursive: true
+include:
+    - src/**
+exclude:
+    - "src/fmt/"
+    - "src/fmtlib"
+    - "src/utils"
+patterns:
+    - "*.h"
+    - "*.cpp"
+"""
+
+def check_fmtprint(f):
+    pattern = re.compile(r'[ \t\n\r]*fmt::print\(')
+    lineno = 1
+    errors = set()
+
+    for line in f:
+        if pattern.match(line):
+            errors.add(lineno)
+        lineno += 1
+
+    return errors
+
+def check_file(path):
+    if path.find('fmtlib.py') >= 0: return { 'fmtlib_errors' : '' }
+    encoding = 'UTF-8'
+    fmtprint_errors = set()
+    try:
+        with open(path, 'r') as f:
+            fmtprint_errors = check_fmtprint(f)
+    except UnicodeDecodeError:
+        encoding = 'ISO-8859-1'
+        try:
+            with open(path, 'r', encoding=encoding) as f:
+                fmtprint_errors = check_fmtprint(f)
+        except Exception:
+            encoding = 'unknown'
+
+    return {
+        'fmtprint_errors': fmtprint_errors,
+        'encoding': encoding
+    }
+
+def fix_file(path, check_result):
+    if path.find('fmtlib.py') >= 0: return
+    newfile = path + ".modified"
+    pattern = re.compile(r'fmt::print\(', re.DOTALL)
+    with open(newfile, 'w', encoding='UTF-8') as out:
+        with open(path, 'r', encoding=check_result['encoding']) as src:
+            filetxt = re.sub(pattern,'utils::print(', src.read());
+            print(filetxt, end='', file=out)
+    shutil.copymode(path, newfile)
+    shutil.move(newfile, path)
+
+def check_folder(directory, config, fix=False, verbose=False):
+    success = True
+    files = []
+
+    # compile list of files to check
+    for base_path in config['include']:
+        for pattern in config['patterns']:
+            path = os.path.join(directory, base_path, pattern)
+            files += glob.glob(path, recursive=config['recursive'])
+
+    # prune list of files to skip from list
+    for pattern in config['exclude']:
+        path = os.path.join(directory, pattern)
+        remove = []
+        for file in files:
+            if path not in file: continue
+            remove += [file]
+        for rm in remove:
+            files.remove(rm)
+
+    for f in files:
+        path = os.path.normpath(f)
+
+        if verbose:
+            print("Checking file:", path)
+
+        result = check_file(path)
+
+        has_resolvable_errors = False
+
+        for lineno in result['fmtprint_errors']:
+            print("[Error] Found LAMMPS fmt::print @ {}:{}".format(path, lineno))
+            has_resolvable_errors = True
+
+        if has_resolvable_errors:
+            if fix:
+                print("Applying automatic fixes to file:", path)
+                fix_file(path, result)
+            else:
+                success = False
+
+    return success
+
+def main():
+    parser = argparse.ArgumentParser(description='Utility for detecting and fixing fmtlib issues in LAMMPS')
+    parser.add_argument('-c', '--config', metavar='CONFIG_FILE', help='location of a optional configuration file')
+    parser.add_argument('-f', '--fix', action='store_true', help='automatically fix URLs')
+    parser.add_argument('-v', '--verbose', action='store_true', help='verbose output')
+    parser.add_argument('DIRECTORY', help='directory (or file) that should be checked')
+    args = parser.parse_args()
+    lammpsdir = os.path.abspath(os.path.expanduser(args.DIRECTORY))
+
+    if args.config:
+        with open(args.config, 'r') as cfile:
+            config = yaml.load(cfile, Loader=yaml.FullLoader)
+    else:
+        config = yaml.load(DEFAULT_CONFIG, Loader=yaml.FullLoader)
+
+    if os.path.isdir(lammpsdir):
+        if not check_folder(lammpsdir, config, args.fix, args.verbose):
+           sys.exit(1)
+    else:
+        success = True
+        path = os.path.normpath(lammpsdir)
+
+        if args.verbose:
+            print("Checking file:", path)
+
+        result = check_file(path)
+
+        has_resolvable_errors = False
+
+        for lineno in result['fmtprint_errors']:
+            print("[Error] Found LAMMPS fmt::print @ {}:{}".format(path, lineno))
+            has_resolvable_errors = True
+
+        if has_resolvable_errors:
+            if args.fix:
+                print("Applying automatic fixes to file:", path)
+                fix_file(path, result)
+            else:
+                success = False
+
+        if not success:
+            sys.exit(1)
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/lammps-gui/TODO.md b/tools/lammps-gui/TODO.md
index 17fa4c04ee..b978931f0c 100644
--- a/tools/lammps-gui/TODO.md
+++ b/tools/lammps-gui/TODO.md
@@ -2,7 +2,6 @@ LAMMPS-GUI TODO list:
 
 # Short term goals (v1.x)
 
-- add a preferences option to override light/dark theme setting and add choice for theme
 - implement a timed "Auto-Save" feature that saves after some idle time.  set timeout in Editor preferences.
 - add a "Filter data" checkbox to the "Charts" window to select whether data should be dropped.
 - add a "Charts tab" to the preferences with the following (default) settings:
@@ -16,8 +15,6 @@ LAMMPS-GUI TODO list:
   colors to individual atom types.
 - Support color by property (e.g. scan computes or fixes with per-atom data), define colormaps etc.
 - Add a "Diameters" dialog where diamaters can by specified by atom type
-- figure out how widgets can be resized to fraction of available screen size.
-- figure out stacking order of frames and whether it can be more flexible
 
 - implement indenting regions for (nested) loops?
 - implement data file manager GUI with the following features:
diff --git a/tools/lammps-gui/icons/system-restart.png b/tools/lammps-gui/icons/system-restart.png
new file mode 100644
index 0000000000..62667c4390
Binary files /dev/null and b/tools/lammps-gui/icons/system-restart.png differ
diff --git a/tools/lammps-gui/lammps-gui.appdata.xml b/tools/lammps-gui/lammps-gui.appdata.xml
index c019ab1ce3..64d8892956 100644
--- a/tools/lammps-gui/lammps-gui.appdata.xml
+++ b/tools/lammps-gui/lammps-gui.appdata.xml
@@ -62,6 +62,7 @@
         Make Tutorial wizards more compact
         Include download and compilation of WHAM software from Alan Grossfield
         Add dialog to run WHAM directly from LAMMPS-GUI
+        Add entry to Run menu to restart the LAMMPS instance
         Use mutex to avoid corruption of thermo data
       </description>
     </release>
diff --git a/tools/lammps-gui/lammpsgui.cpp b/tools/lammps-gui/lammpsgui.cpp
index 59e8017fc0..602ae58900 100644
--- a/tools/lammps-gui/lammpsgui.cpp
+++ b/tools/lammps-gui/lammpsgui.cpp
@@ -212,6 +212,7 @@ LammpsGui::LammpsGui(QWidget *parent, const QString &filename) :
     connect(ui->actionRun_Buffer, &QAction::triggered, this, &LammpsGui::run_buffer);
     connect(ui->actionRun_File, &QAction::triggered, this, &LammpsGui::run_file);
     connect(ui->actionStop_LAMMPS, &QAction::triggered, this, &LammpsGui::stop_run);
+    connect(ui->actionRestart_LAMMPS, &QAction::triggered, this, &LammpsGui::restart_lammps);
     connect(ui->actionSet_Variables, &QAction::triggered, this, &LammpsGui::edit_variables);
     connect(ui->actionImage, &QAction::triggered, this, &LammpsGui::render_image);
     connect(ui->actionLAMMPS_Tutorial, &QAction::triggered, this, &LammpsGui::tutorial_web);
@@ -1161,7 +1162,7 @@ void LammpsGui::run_done()
         status->setText("Failed.");
         ui->textEdit->setHighlight(nline, true);
         QMessageBox::critical(this, "LAMMPS-GUI Error",
-                              QString("Error running LAMMPS:\n\n") + errorbuf);
+                              QString("<p>Error running LAMMPS:\n\n<pre>") + errorbuf + "</pre></p>");
     }
     ui->textEdit->setCursor(nline);
     ui->textEdit->setFileList();
diff --git a/tools/lammps-gui/lammpsgui.h b/tools/lammps-gui/lammpsgui.h
index cb61f368a5..38fe00607e 100644
--- a/tools/lammps-gui/lammpsgui.h
+++ b/tools/lammps-gui/lammpsgui.h
@@ -111,6 +111,7 @@ private slots:
     void findandreplace();
     void run_buffer() { do_run(true); }
     void run_file() { do_run(false); }
+    void restart_lammps() { lammps.close(); };
 
     void edit_variables();
     void render_image();
@@ -183,6 +184,7 @@ class TutorialWizard : public QWizard {
 public:
     TutorialWizard(int ntutorial, QWidget *parent = nullptr);
     void accept() override;
+
 private:
     int _ntutorial;
 };
diff --git a/tools/lammps-gui/lammpsgui.qrc b/tools/lammps-gui/lammpsgui.qrc
index 3f9697392b..51e091a11f 100644
--- a/tools/lammps-gui/lammpsgui.qrc
+++ b/tools/lammps-gui/lammpsgui.qrc
@@ -67,6 +67,7 @@
     <file>icons/search.png</file>
     <file>icons/system-box.png</file>
     <file>icons/system-help.png</file>
+    <file>icons/system-restart.png</file>
     <file>icons/system-run.png</file>
     <file>icons/trash.png</file>
     <file>icons/tutorial-logo.png</file>
diff --git a/tools/lammps-gui/lammpsgui.ui b/tools/lammps-gui/lammpsgui.ui
index c6dbd6a507..ab922033bc 100644
--- a/tools/lammps-gui/lammpsgui.ui
+++ b/tools/lammps-gui/lammpsgui.ui
@@ -74,6 +74,7 @@
     <addaction name="actionRun_Buffer"/>
     <addaction name="actionRun_File"/>
     <addaction name="actionStop_LAMMPS"/>
+    <addaction name="actionRestart_LAMMPS"/>
     <addaction name="separator"/>
     <addaction name="actionSet_Variables"/>
     <addaction name="separator"/>
@@ -270,7 +271,7 @@
     <iconset theme=":/icons/run-file.png"/>
    </property>
    <property name="text">
-    <string>&amp;Run LAMMPS from File</string>
+    <string>Run LAMMPS from &amp;File</string>
    </property>
    <property name="shortcut">
     <string>Ctrl+Shift+Return</string>
@@ -287,6 +288,14 @@
     <string>Ctrl+/</string>
    </property>
   </action>
+  <action name="actionRestart_LAMMPS">
+   <property name="icon">
+    <iconset theme=":/icons/system-restart.png"/>
+   </property>
+   <property name="text">
+    <string>Restart &amp;LAMMPS</string>
+   </property>
+  </action>
   <action name="actionImage">
    <property name="icon">
     <iconset theme=":/icons/emblem-photos.png"/>
diff --git a/unittest/cplusplus/test_advanced_utils.cpp b/unittest/cplusplus/test_advanced_utils.cpp
index 1da9500b35..468f353080 100644
--- a/unittest/cplusplus/test_advanced_utils.cpp
+++ b/unittest/cplusplus/test_advanced_utils.cpp
@@ -151,6 +151,10 @@ TEST_F(Advanced_utils, expand_args)
     args[7]            = utils::strdup("c_gofr[*2][2]");
     args[8]            = utils::strdup("c_gofr[*][*]");
 
+    // disable use of input->command and input->arg which point to the last run command right now
+    lmp->input->command = nullptr;
+    lmp->input->arg    = nullptr;
+
     auto narg = utils::expand_args(FLERR, oarg, args, 0, earg, lmp);
     EXPECT_EQ(narg, 16);
     EXPECT_STREQ(earg[0], "v_step");
diff --git a/unittest/force-styles/tests/atomic-pair-lepton_sphere.yaml b/unittest/force-styles/tests/atomic-pair-lepton_sphere.yaml
index 222aa8b93c..0e00e418a1 100644
--- a/unittest/force-styles/tests/atomic-pair-lepton_sphere.yaml
+++ b/unittest/force-styles/tests/atomic-pair-lepton_sphere.yaml
@@ -1,6 +1,7 @@
 ---
 lammps_version: 28 Mar 2023
 date_generated: Fri Apr  7 18:04:29 2023
+tags: unstable
 epsilon: 7.5e-13
 skip_tests: single
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/atomic-pair-lj_cut_sphere.yaml b/unittest/force-styles/tests/atomic-pair-lj_cut_sphere.yaml
index 3a5122a896..193a65122e 100644
--- a/unittest/force-styles/tests/atomic-pair-lj_cut_sphere.yaml
+++ b/unittest/force-styles/tests/atomic-pair-lj_cut_sphere.yaml
@@ -1,6 +1,7 @@
 ---
 lammps_version: 28 Mar 2023
 date_generated: Thu Mar 30 14:38:22 2023
+tags: unstable
 epsilon: 7.5e-13
 skip_tests: single
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/atomic-pair-lj_expand_sphere.yaml b/unittest/force-styles/tests/atomic-pair-lj_expand_sphere.yaml
index 24a17a275c..d929544809 100644
--- a/unittest/force-styles/tests/atomic-pair-lj_expand_sphere.yaml
+++ b/unittest/force-styles/tests/atomic-pair-lj_expand_sphere.yaml
@@ -1,6 +1,7 @@
 ---
 lammps_version: 28 Mar 2023
 date_generated: Fri Apr  7 18:07:13 2023
+tags: unstable
 epsilon: 7.5e-13
 skip_tests: single
 prerequisites: ! |
diff --git a/unittest/force-styles/tests/bond-harmonic_restrain.yaml b/unittest/force-styles/tests/bond-harmonic_restrain.yaml
index 07546775ab..485dbfeafc 100644
--- a/unittest/force-styles/tests/bond-harmonic_restrain.yaml
+++ b/unittest/force-styles/tests/bond-harmonic_restrain.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 8 Feb 2023
 date_generated: Tue Mar  7 21:07:27 2023
-epsilon: 2.5e-13
+epsilon: 5.0e-13
 skip_tests: extract
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/dihedral-cosine_squared_restricted.yaml b/unittest/force-styles/tests/dihedral-cosine_squared_restricted.yaml
index c2c2b8cc6b..0e43bc1741 100644
--- a/unittest/force-styles/tests/dihedral-cosine_squared_restricted.yaml
+++ b/unittest/force-styles/tests/dihedral-cosine_squared_restricted.yaml
@@ -1,8 +1,7 @@
 ---
 lammps_version: 7 Feb 2024
-tags:
 date_generated: Sat Apr 13 11:41:16 2024
-epsilon: 5.0e-11
+epsilon: 2.0e-10
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/fix-timestep-recenter-coords.yaml b/unittest/force-styles/tests/fix-timestep-recenter-coords.yaml
index 31c682fc07..9fff99f8b0 100644
--- a/unittest/force-styles/tests/fix-timestep-recenter-coords.yaml
+++ b/unittest/force-styles/tests/fix-timestep-recenter-coords.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 29 Aug 2024
 date_generated: Tue Oct  1 12:45:25 2024
-epsilon: 2e-13
+epsilon: 1.0e-11
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/fix-timestep-recenter-init.yaml b/unittest/force-styles/tests/fix-timestep-recenter-init.yaml
index ca539aa911..1678405074 100644
--- a/unittest/force-styles/tests/fix-timestep-recenter-init.yaml
+++ b/unittest/force-styles/tests/fix-timestep-recenter-init.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 29 Aug 2024
 date_generated: Tue Oct  1 12:45:46 2024
-epsilon: 1e-12
+epsilon: 2.5e-11
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/fix-timestep-spring_rg.yaml b/unittest/force-styles/tests/fix-timestep-spring_rg.yaml
index a6c5844b6c..bfd9310012 100644
--- a/unittest/force-styles/tests/fix-timestep-spring_rg.yaml
+++ b/unittest/force-styles/tests/fix-timestep-spring_rg.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 17 Feb 2022
 date_generated: Thu Mar 17 19:43:17 2022
-epsilon: 2e-14
+epsilon: 5.0e-14
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/fix-timestep-wall_harmonic_const.yaml b/unittest/force-styles/tests/fix-timestep-wall_harmonic_const.yaml
index 57f38b4c37..5806582929 100644
--- a/unittest/force-styles/tests/fix-timestep-wall_harmonic_const.yaml
+++ b/unittest/force-styles/tests/fix-timestep-wall_harmonic_const.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 17 Feb 2022
 date_generated: Fri Mar 18 22:18:01 2022
-epsilon: 4e-14
+epsilon: 5.0e-14
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/fix-timestep-wall_lepton_const.yaml b/unittest/force-styles/tests/fix-timestep-wall_lepton_const.yaml
index 947bc6a95a..079383e04c 100644
--- a/unittest/force-styles/tests/fix-timestep-wall_lepton_const.yaml
+++ b/unittest/force-styles/tests/fix-timestep-wall_lepton_const.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 8 Feb 2023
 date_generated: Thu Feb 23 00:40:51 2023
-epsilon: 4e-14
+epsilon: 5.0e-14
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/fix-timestep-wall_lj93_const.yaml b/unittest/force-styles/tests/fix-timestep-wall_lj93_const.yaml
index 5431a8e0a8..590ed1f103 100644
--- a/unittest/force-styles/tests/fix-timestep-wall_lj93_const.yaml
+++ b/unittest/force-styles/tests/fix-timestep-wall_lj93_const.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 27 Jun 2024
 date_generated: Fri Aug  2 23:56:34 2024
-epsilon: 2e-14
+epsilon: 1.0e-13
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/fix-timestep-wall_morse_const.yaml b/unittest/force-styles/tests/fix-timestep-wall_morse_const.yaml
index 391070609f..08080a6274 100644
--- a/unittest/force-styles/tests/fix-timestep-wall_morse_const.yaml
+++ b/unittest/force-styles/tests/fix-timestep-wall_morse_const.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 8 Feb 2023
 date_generated: Thu Feb 23 15:26:55 2023
-epsilon: 4e-14
+epsilon: 1.0e-13
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/fix-timestep-wall_table_linear.yaml b/unittest/force-styles/tests/fix-timestep-wall_table_linear.yaml
index 6291de136a..ee86026216 100644
--- a/unittest/force-styles/tests/fix-timestep-wall_table_linear.yaml
+++ b/unittest/force-styles/tests/fix-timestep-wall_table_linear.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 8 Feb 2023
 date_generated: Thu Feb 23 00:56:30 2023
-epsilon: 4e-14
+epsilon: 2.0e-13
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/fix-timestep-wall_table_spline.yaml b/unittest/force-styles/tests/fix-timestep-wall_table_spline.yaml
index 6c6c674342..7f27c59e32 100644
--- a/unittest/force-styles/tests/fix-timestep-wall_table_spline.yaml
+++ b/unittest/force-styles/tests/fix-timestep-wall_table_spline.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 8 Feb 2023
 date_generated: Thu Feb 23 00:56:30 2023
-epsilon: 4e-14
+epsilon: 2.0e-13
 skip_tests:
 prerequisites: ! |
   atom full
diff --git a/unittest/force-styles/tests/manybody-pair-dispersion_d3.yaml b/unittest/force-styles/tests/manybody-pair-dispersion_d3.yaml
index b145ce6ee7..ac0130f9ce 100644
--- a/unittest/force-styles/tests/manybody-pair-dispersion_d3.yaml
+++ b/unittest/force-styles/tests/manybody-pair-dispersion_d3.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 19 Nov 2024
 date_generated: Wed Dec 11 15:29:39 2024
-epsilon: 1e-7
+epsilon: 1e-6
 skip_tests:
 prerequisites: ! |
   pair dispersion/d3
diff --git a/unittest/force-styles/tests/manybody-pair-pace_product.yaml b/unittest/force-styles/tests/manybody-pair-pace_product.yaml
index 6db9f4220a..fe4cde6dac 100644
--- a/unittest/force-styles/tests/manybody-pair-pace_product.yaml
+++ b/unittest/force-styles/tests/manybody-pair-pace_product.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 17 Feb 2022
 date_generated: Fri Mar 18 22:17:48 2022
-epsilon: 7.5e-09
+epsilon: 1.5e-08
 skip_tests:
 prerequisites: ! |
   pair pace
diff --git a/unittest/force-styles/tests/manybody-pair-pace_recursive.yaml b/unittest/force-styles/tests/manybody-pair-pace_recursive.yaml
index 61f7ce0ac9..3740718675 100644
--- a/unittest/force-styles/tests/manybody-pair-pace_recursive.yaml
+++ b/unittest/force-styles/tests/manybody-pair-pace_recursive.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 10 Mar 2021
 date_generated: Wed Apr  7 19:30:07 2021
-epsilon: 7.5e-09
+epsilon: 1.5e-08
 prerequisites: ! |
   pair pace
 pre_commands: ! |
diff --git a/unittest/force-styles/tests/mol-pair-lepton.yaml b/unittest/force-styles/tests/mol-pair-lepton.yaml
index 33576e81c2..c0e26b34f6 100644
--- a/unittest/force-styles/tests/mol-pair-lepton.yaml
+++ b/unittest/force-styles/tests/mol-pair-lepton.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 21 Nov 2023
 date_generated: Thu Jan 18 11:01:50 2024
-epsilon: 5e-14
+epsilon: 1e-13
 skip_tests: intel
 prerequisites: ! |
   atom full
diff --git a/unittest/utils/test_utils.cpp b/unittest/utils/test_utils.cpp
index 510fcb0198..8bfb628f3e 100644
--- a/unittest/utils/test_utils.cpp
+++ b/unittest/utils/test_utils.cpp
@@ -46,6 +46,49 @@ TEST(Utils, strdup)
     delete[] copy2;
 }
 
+TEST(Utils, strsame)
+{
+    std::string text1("some_text");
+    std::string text2("some_text");
+    ASSERT_TRUE(utils::strsame(text1, text2));
+    text1 = " some   _\ttext\n ";
+    ASSERT_TRUE(utils::strsame(text1, text2));
+    text2 = "  some _  text\n    ";
+    ASSERT_TRUE(utils::strsame(text1, text2));
+
+    text2 = "some_other_text";
+    ASSERT_FALSE(utils::strsame(text1, text2));
+    text2 = " some other_text";
+    ASSERT_FALSE(utils::strsame(text1, text2));
+}
+
+TEST(Utils, strcompress)
+{
+    auto compressed = utils::strcompress("\t some   text   ");
+    ASSERT_THAT(compressed, StrEq("some text"));
+
+    compressed = utils::strcompress("some \ntext");
+    ASSERT_THAT(compressed, StrEq("some text"));
+
+    compressed = utils::strcompress("sometext");
+    ASSERT_THAT(compressed, StrEq("sometext"));
+
+    compressed = utils::strcompress("some   text \r\n");
+    ASSERT_THAT(compressed, StrEq("some text"));
+
+    compressed = utils::strcompress("some other  text \r\n");
+    ASSERT_THAT(compressed, StrEq("some other text"));
+
+    compressed = utils::strcompress("\v some  \t\t  text \f");
+    ASSERT_THAT(compressed, StrEq("some text"));
+
+    compressed = utils::strcompress("   some\t text    ");
+    ASSERT_THAT(compressed, StrEq("some text"));
+
+    compressed = utils::strcompress("  \t\n   ");
+    ASSERT_THAT(compressed, StrEq(""));
+}
+
 TEST(Utils, trim)
 {
     auto trimmed = utils::trim("\t some text");